## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
>- https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [162]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from scipy.spatial import distance
import math
import random
import bz2
import itertools
import pickle
from sklearn.feature_extraction.text import CountVectorizer

In [163]:
#expand jupyter cells
# Widen the notebook container so wide outputs use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Loading the movie dialogue lines

In [164]:
# Load the pre-processed movie lines from a tab-separated file.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the dialogue are kept as literal text.
messages = pd.read_csv('./chatdata/movie_lines_pre_processed.tsv', delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [165]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_pre_processed,msg_2,target
0,L49,u0,m0,Did you change your hair?,you change your hair,No.,1
1,L50,u3,m0,No.,no,You might think about it,0
2,L51,u0,m0,You might wanna think about it,you might think about it,maybe...,0
3,L59,u9,m0,I missed you.,i missed you,It says here you exposed yourself to a group o...,0
4,L60,u8,m0,It says here you exposed yourself to a group o...,it say here you exposed yourself to a group of...,It was a bratwurst. I was eating lunch.,0


### Defining the list of questions and answers

In [166]:
# Unique pre-processed texts of the rows flagged target == 1; used below as the question pool.
questions = set(messages[messages['target'] == 1]['msg_pre_processed'])

In [167]:
# Unique pre-processed texts of the rows flagged target == 0; used below as the answer pool.
answers = set(messages[messages['target'] == 0]['msg_pre_processed'])

In [168]:
len(answers)

175694

In [169]:
len(questions)

76902

In [170]:
lemmatizer = WordNetLemmatizer()


def pre_processing_text(corpus):
    """Normalize a raw message into a lower-case, space-separated token string.

    The input is converted with ``str()`` and then, in order: HTML-like tags are
    stripped, every character that is not an ASCII letter, digit or whitespace
    is removed, runs of spaces are collapsed, the text is lower-cased, split
    into tokens, each token is lemmatized with the module-level
    ``lemmatizer``, and tokens found in ``string.punctuation`` are dropped.

    Stop words are intentionally kept: removing them made the model worse.

    Args:
        corpus: The raw message (any object; ``str()`` is applied).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    cleaned = str(corpus)
    substitutions = (
        (r'<.*?>', ''),                 # html tags
        (r'[^a-z A-Z 0-9 \s]', ''),     # non-alphanumeric characters
        (r' +', ' '),                   # duplicated spaces
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", cleaned.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemma for lemma in lemmas if lemma not in string.punctuation)

In [171]:
# Sample message used by the following cells to exercise the pipeline.
msg_raw = 'I heard you are a good guy. Is it right?'
#msg_raw = 'yes i heard you all right 20000000 thats quite a lot isnt it'
msg = pre_processing_text(msg_raw)
print(msg)

# Restore the fitted tokenizer from disk.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles from a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
p = tokenizer.texts_to_matrix([msg])

# `res` is the model output for the message; later cells pick the question pool when res >= 0.5
# and the answer pool otherwise.
model = load_model('chatbot_model.h5')
res = model.predict(p)

print(res)

i heard you are a good guy is it right
[[0.79494965]]


## Returning the conversation for the message using Jaccard Similarity

In [172]:
def jaccard_similarity(f1, f2):
    """Return the Jaccard similarity between the word sets of two phrases.

    Both arguments are converted with ``str()`` and split on single spaces;
    the result is ``|A & B| / |A | B|`` over the resulting token sets.
    """
    tokens_a = set(str(f1).split(' '))
    tokens_b = set(str(f2).split(' '))
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

In [173]:
def return_conversation_by_jaccard(msg, res, questions, answers, threshold=None):
    """Rank the candidate messages by Jaccard similarity to ``msg``.

    Args:
        msg: Pre-processed message to compare against.
        res: Classifier output; ``res >= 0.5`` selects ``questions`` as the
            candidate pool, anything else selects ``answers``.
        questions: Iterable of candidate question texts.
        answers: Iterable of candidate answer texts.
        threshold: Optional minimum similarity. When given, only candidates
            whose similarity is ``>= threshold`` are returned. ``None``
            (the default) keeps every candidate.

    Returns:
        dict mapping candidate -> similarity, ordered from the highest to the
        lowest similarity.
    """
    msg_list = questions if res >= 0.5 else answers

    # Score every candidate in a single pass; keys keep the original objects
    # while the similarity is computed on their string form.
    result = {m: jaccard_similarity(msg, str(m)) for m in msg_list}

    if threshold is not None:
        result = {k: v for k, v in result.items() if v >= threshold}

    return dict(sorted(result.items(), key=lambda item: item[1], reverse=True))

In [174]:
%%time
conversations = return_conversation_by_jaccard(msg, res, questions, answers)
conversations

CPU times: user 1.55 s, sys: 0 ns, total: 1.55 s
Wall time: 1.56 s


{'is it a good a i hope it is': 0.45454545454545453,
 'you guy hear a joke i heard': 0.4166666666666667,
 'are you sure this is a good idea': 0.38461538461538464,
 'are you kidding because i think it good': 0.38461538461538464,
 'is it a good morning': 0.36363636363636365,
 'you are a therapist right': 0.36363636363636365,
 'what is it what are you': 0.36363636363636365,
 'are you a good lawyer': 0.36363636363636365,
 'is it a good room': 0.36363636363636365,
 'are you guy all right': 0.36363636363636365,
 'you heard me is it': 0.36363636363636365,
 'barton what time is it are you all right': 0.35714285714285715,
 'are you surprised that i know what it is': 0.35714285714285715,
 'i just heard you poor sausage are you all right': 0.35714285714285715,
 'just a moment are you saying it is he': 0.35714285714285715,
 'how is it youre good you on a list': 0.35714285714285715,
 'is it such a chore i dont understand you': 0.35714285714285715,
 'i heard you screaming wa it a bad one': 0.3571428

In [175]:
#get the first item in the dict
def get_the_next_conversation(conversations):
    """Return the first key of ``conversations`` (the best-ranked message).

    Raises StopIteration when ``conversations`` is empty.
    """
    return next(iter(conversations.keys()))

In [176]:
conversation = get_the_next_conversation(conversations)
conversation

'is it a good a i hope it is'

### The returned message

In [177]:
# Show the user message and the reply stored next to the best-matching line.
print('>>> '+msg_raw)
# Several rows can share the same pre-processed text; [0] takes the first match
# and raises IndexError when no row matches.
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

>>> I heard you are a good guy. Is it right?
<<< Better sir.


## Calculate PageRank
- create a bi-directional graph of messages whose similarity is greater than a threshold

In [178]:
"""
Classe para criação de um nó (página) do grafo.
O cosntrutuor recebe recebe o nome do nó, a lista de nós de entrada e saída.
"""
class Node:
    """A graph node (page): its name plus its incoming and outgoing link lists."""

    def __init__(self, node_name: str, inlinks: list, outlinks: list):
        # The lists are stored as given (no copy is made).
        self.node_name, self.inlinks, self.outlinks = node_name, inlinks, outlinks

"""
Classe para criação do grafo de páginas.
O construtor inicializa um dicionário vazio de lista de adjacência. 
"""
class Graph:
    """Page graph stored as an adjacency dict mapping node name -> Node."""

    def __init__(self):
        self.adj_list = {}

    def add_node(self, node_name: str, inlinks: list, outlinks: list):
        """Create a Node and register it under ``node_name`` (replacing any previous entry)."""
        self.adj_list[node_name] = Node(node_name, inlinks, outlinks)

    def print_graph(self):
        """Print every node followed by its incoming and outgoing links."""
        for name, node in self.adj_list.items():
            print(f"{name}:")
            print(f"\tEntrada: {node.inlinks}")
            print(f"\tSaída: {node.outlinks}")

In [179]:
import numpy as np

"""
A classe PageRank possui métodos para computar o Page Rank de cada página dado um número de iterações.
"""
class PageRank(object):
    """Compute PageRank scores for the nodes of a ``Graph``.

    ``graph`` must expose ``adj_list``: a dict mapping node name -> object with
    ``inlinks`` and ``outlinks`` lists of node names. Scores start at ``1/n``
    for each of the ``n`` nodes. No damping factor is applied.
    """

    # The annotation is a string (forward reference) so defining this class
    # does not require ``Graph`` to be defined first.
    def __init__(self, graph: "Graph"):
        self.graph = graph
        self.scores = dict()
        self.__initialize_scores()

    def __initialize_scores(self):
        """Give every node the initial score 1/n (n = number of nodes)."""
        n = len(self.graph.adj_list)
        # With an empty graph the comprehension body never runs, so 1/n is never evaluated.
        self.scores = {key: 1 / n for key in self.graph.adj_list}

    def compute(self, iterations: int = 10, convergence_rate: float = 0.01):
        """Run ``iterations`` PageRank updates and return the score dict.

        Each node's new score is the sum, over its inlinks, of
        ``score(source) / out_degree(source)``.

        Args:
            iterations: Number of update rounds to run.
            convergence_rate: Currently unused; kept for interface compatibility.

        Returns:
            dict mapping node name -> score (also stored in ``self.scores``).

        Raises:
            KeyError: if an inlink names a node that is not in the graph.
        """
        adj = self.graph.adj_list
        # Out-degrees do not change between iterations; compute them once.
        out_degree = {name: len(node.outlinks) for name, node in adj.items()}

        for _ in range(iterations):
            new_scores = {}
            for name, node in adj.items():
                # A source with no outlinks cannot pass score along; skipping it
                # avoids a division by zero that would yield inf/nan.
                sources = [src for src in node.inlinks if out_degree[src] > 0]
                shares = np.asarray([self.scores[src] for src in sources], dtype=float)
                degrees = np.asarray([out_degree[src] for src in sources], dtype=float)
                new_scores[name] = np.sum(shares / degrees)
            self.scores = new_scores
        return self.scores

    def power_method(self, iterations: int = 10):
        """Iterate PageRank as repeated matrix-vector products, printing each step.

        Builds the n x n transition matrix where entry (row, col) is
        ``1 / out_degree(col node)`` when the col node links to the row node.
        Outlinks that name a node missing from the graph are ignored.
        ``self.scores`` is read as the starting vector but is not modified.

        Args:
            iterations: Number of products to apply; at least one is always applied.

        Returns:
            The score vector after the last product, ordered like ``adj_list``.
        """
        adj = self.graph.adj_list
        size = len(adj)
        # Map each node name to its row/column position once, instead of
        # scanning all nodes for every link.
        position = {name: idx for idx, name in enumerate(adj)}

        matrix = np.zeros((size, size))
        for name, node in adj.items():
            col = position[name]
            for link in node.outlinks:
                row = position.get(link)
                if row is not None:
                    matrix[row][col] = 1 / len(node.outlinks)
        print("Matriz de probabilidades ")
        print(matrix)

        print("Page Rank das iterações")
        scores_arr = np.asarray([self.scores[key] for key in adj])
        itn = np.dot(matrix, scores_arr)
        print(itn)
        for _ in range(iterations - 1):
            itn = np.dot(matrix, itn)
            print(itn)
        return itn

In [180]:
def make_in_links(conversations, threshold=None):
    """Return the keys of ``conversations`` as a new list.

    With ``threshold=None`` every key is returned; otherwise only the keys
    whose value is ``>= threshold``. Dict order is preserved.
    """
    if threshold is None:
        return list(conversations.keys())
    return [key for key, score in conversations.items() if score >= threshold]

In [181]:
def get_conversations(msg, res):
    """Rank the notebook-level ``questions``/``answers`` pools against ``msg``.

    Thin wrapper around ``return_conversation_by_jaccard`` that supplies the
    global pools.
    """
    return return_conversation_by_jaccard(
        msg=msg,
        res=res,
        questions=questions,
        answers=answers,
    )

In [182]:
def make_graph(qea, res, threshold=None):
    """Build a similarity graph with one node per message in ``qea``.

    For each message, the messages whose Jaccard similarity passes
    ``threshold`` become both its inlinks and its outlinks (the same list
    object is used for both, since the similarity is symmetric).

    Args:
        qea: Iterable (with ``len``) of messages to turn into nodes.
        res: Pool selector forwarded to ``get_conversations``.
        threshold: Minimum similarity for a link; ``None`` links every candidate.

    Returns:
        The populated ``Graph``.
    """
    g = Graph()
    total = len(qea)

    for count, k in enumerate(qea, start=1):
        conversations = get_conversations(k, res)
        if conversations is not None:
            in_links = make_in_links(conversations, threshold=threshold)
            # A node must not link to itself. The message may be absent from
            # the links (threshold above its self-similarity, or `qea` not
            # drawn from the selected pool), so a missing entry is not an error.
            try:
                in_links.remove(k)
            except ValueError:
                pass
            g.add_node(k, in_links, in_links)
        if count % 100 == 0:
            print('Processed '+ str(count) +' of '+ str(total))
    return g

In [183]:
def save_page_compute(qea, res, file_name, threshold=None, iterations=3):
    """Build the similarity graph, compute PageRank and save the scores to disk.

    Args:
        qea: Messages to turn into graph nodes.
        res: Pool selector forwarded to ``make_graph``.
        file_name: Output path without extension; ``'.txt'`` is appended.
        threshold: Minimum similarity for a link (see ``make_graph``).
        iterations: Number of PageRank iterations.

    Returns:
        dict mapping message -> score, ordered from highest to lowest score.
        The ``repr`` of this dict is what gets written to the file.
    """
    g = make_graph(qea=qea, res=res, threshold=threshold)

    scores = PageRank(graph=g).compute(iterations=iterations)
    pc = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    # The context manager closes the file even if the write fails.
    with open(file_name + '.txt', 'w') as f:
        f.write(repr(pc))
    return pc

In [184]:
# Long-running: every question is compared against the whole question pool while the graph is built.
pc_q = save_page_compute(qea=questions, res=1, threshold=0.5, file_name='page_rank_questions')

Processed 100 of 76902
Processed 200 of 76902
Processed 300 of 76902
Processed 400 of 76902
Processed 500 of 76902
Processed 600 of 76902
Processed 700 of 76902
Processed 800 of 76902
Processed 900 of 76902
Processed 1000 of 76902
Processed 1100 of 76902
Processed 1200 of 76902
Processed 1300 of 76902
Processed 1400 of 76902
Processed 1500 of 76902
Processed 1600 of 76902
Processed 1700 of 76902
Processed 1800 of 76902
Processed 1900 of 76902
Processed 2000 of 76902
Processed 2100 of 76902
Processed 2200 of 76902
Processed 2300 of 76902
Processed 2400 of 76902
Processed 2500 of 76902
Processed 2600 of 76902
Processed 2700 of 76902
Processed 2800 of 76902
Processed 2900 of 76902
Processed 3000 of 76902
Processed 3100 of 76902
Processed 3200 of 76902
Processed 3300 of 76902
Processed 3400 of 76902
Processed 3500 of 76902
Processed 3600 of 76902
Processed 3700 of 76902
Processed 3800 of 76902
Processed 3900 of 76902
Processed 4000 of 76902
Processed 4100 of 76902
Processed 4200 of 76902
P

Processed 33400 of 76902
Processed 33500 of 76902
Processed 33600 of 76902
Processed 33700 of 76902
Processed 33800 of 76902
Processed 33900 of 76902
Processed 34000 of 76902
Processed 34100 of 76902
Processed 34200 of 76902
Processed 34300 of 76902
Processed 34400 of 76902
Processed 34500 of 76902
Processed 34600 of 76902
Processed 34700 of 76902
Processed 34800 of 76902
Processed 34900 of 76902
Processed 35000 of 76902
Processed 35100 of 76902
Processed 35200 of 76902
Processed 35300 of 76902
Processed 35400 of 76902
Processed 35500 of 76902
Processed 35600 of 76902
Processed 35700 of 76902
Processed 35800 of 76902
Processed 35900 of 76902
Processed 36000 of 76902
Processed 36100 of 76902
Processed 36200 of 76902
Processed 36300 of 76902
Processed 36400 of 76902
Processed 36500 of 76902
Processed 36600 of 76902
Processed 36700 of 76902
Processed 36800 of 76902
Processed 36900 of 76902
Processed 37000 of 76902
Processed 37100 of 76902
Processed 37200 of 76902
Processed 37300 of 76902


Processed 66200 of 76902
Processed 66300 of 76902
Processed 66400 of 76902
Processed 66500 of 76902
Processed 66600 of 76902
Processed 66700 of 76902
Processed 66800 of 76902
Processed 66900 of 76902
Processed 67000 of 76902
Processed 67100 of 76902
Processed 67200 of 76902
Processed 67300 of 76902
Processed 67400 of 76902
Processed 67500 of 76902
Processed 67600 of 76902
Processed 67700 of 76902
Processed 67800 of 76902
Processed 67900 of 76902
Processed 68000 of 76902
Processed 68100 of 76902
Processed 68200 of 76902
Processed 68300 of 76902
Processed 68400 of 76902
Processed 68500 of 76902
Processed 68600 of 76902
Processed 68700 of 76902
Processed 68800 of 76902
Processed 68900 of 76902
Processed 69000 of 76902
Processed 69100 of 76902
Processed 69200 of 76902
Processed 69300 of 76902
Processed 69400 of 76902
Processed 69500 of 76902
Processed 69600 of 76902
Processed 69700 of 76902
Processed 69800 of 76902
Processed 69900 of 76902
Processed 70000 of 76902
Processed 70100 of 76902


In [None]:
# Long-running: every answer is compared against the whole answer pool while the graph is built.
pc_a = save_page_compute(qea=answers, res=0, threshold=0.5, file_name='page_rank_answers')

Processed 100 of 175694
Processed 200 of 175694
Processed 300 of 175694
Processed 400 of 175694
Processed 500 of 175694
Processed 600 of 175694
Processed 700 of 175694
Processed 800 of 175694
Processed 900 of 175694
Processed 1000 of 175694


## Jaccard Similarity weighted by PageRank

In [None]:
def return_conversation_by_jaccard_page_rank(msg, conversations, page_compute):
    """Return the candidate with the highest (page-rank score x Jaccard similarity).

    Args:
        msg: Pre-processed message to compare against.
        conversations: dict whose keys are the candidate messages (values are not used).
        page_compute: dict mapping candidate -> page-rank score; must contain
            every key of ``conversations`` (KeyError otherwise).

    Returns:
        The best candidate key. Raises StopIteration when ``conversations`` is empty.
    """
    weighted = {}
    for candidate in conversations:
        weighted[candidate] = page_compute[candidate] * jaccard_similarity(msg, str(candidate))

    ranking = sorted(weighted.items(), key=lambda pair: pair[1], reverse=True)
    return next(candidate for candidate, _ in ranking)

In [None]:
conversation = return_conversation_by_jaccard_page_rank(msg, conversations, page_compute=pc_q)
conversation

In [None]:
print('Conversation: '+ conversation)
# The question page-rank scores are stored in `pc_q` (the dict passed as
# `page_compute` when `conversation` was selected); no notebook-level `pc` exists.
print('Page compute: '+ str(pc_q[conversation]))
print('Similarity: '+ str(conversations[conversation]))

In [None]:
# Compare the pre-processed input with the line selected by the page-rank weighting.
print('Original: '+ msg)
print('Most similar: '+conversation)

print('>>> '+msg_raw)
# Several rows can share the same pre-processed text; [0] takes the first match
# and raises IndexError when no row matches.
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

## Return the result using Cosine Similarity

In [None]:
bow = CountVectorizer()

In [None]:
def return_conversation_by_cossine(msg, res, questions, answers, bow):
    """Rank the candidate messages by cosine distance to ``msg`` (bag of words).

    Args:
        msg: Pre-processed message to compare against.
        res: Classifier output; ``res >= 0.5`` selects ``questions`` as the
            candidate pool, anything else selects ``answers``.
        questions: Iterable of candidate question texts.
        answers: Iterable of candidate answer texts.
        bow: Vectorizer exposing ``fit_transform(list_of_texts)`` whose result
            has ``toarray()`` (e.g. a ``CountVectorizer``). It is re-fitted on
            each (msg, candidate) pair.

    Returns:
        dict mapping candidate -> cosine distance, ordered from the smallest
        distance (most similar) to the largest.
    """
    msg_list = questions if res >= 0.5 else answers

    result = {}
    for m in msg_list:
        # str(m) keeps non-string candidates (e.g. missing values) from
        # crashing the vectorizer.
        # toarray() gives plain 1-D rows; todense() yields np.matrix rows,
        # which stay 2-D and are not valid input for distance.cosine.
        vectors = bow.fit_transform([msg, str(m)]).toarray()
        result[m] = distance.cosine(vectors[0], vectors[1])

    return dict(sorted(result.items(), key=lambda item: item[1]))
    

In [None]:
conversations = return_conversation_by_cossine(msg, res, questions, answers, bow)
conversations

In [None]:
conversation = get_the_next_conversation(conversations)
conversation

### Return result

In [None]:
# Show the user message and the reply stored next to the best-matching line.
print('>>> '+msg_raw)
# Several rows can share the same pre-processed text; [0] takes the first match
# and raises IndexError when no row matches.
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

## Get the result using Cosine Similarity with Embeddings

In [None]:
def download_embedding(get_it):
    """Download the embedding archive with ``gdown`` when ``get_it`` is truthy.

    The download line uses IPython shell syntax, so this only runs inside a notebook.
    Note that 'Download finished' is printed even when ``get_it`` is falsy
    and nothing was downloaded.
    """
    if get_it:
        !gdown https://drive.google.com/uc?id=1zI8pGfbUHuU_0wY_FV4tD6w6ZCUJTQbh
    print('Download finished')

In [None]:
#The embedding is already downloaded
#Change to True to download
download_embedding(False)

In [None]:
%%time
#get the embedding
# Stream-decompress the .bz2 archive into a plain-text file in 100 KiB chunks;
# iter() with the b'' sentinel stops at end of file, so the archive is never
# held in memory as a whole.
newfilepath = "embedding_wiki_100d_pt.txt"
filepath = "ptwiki_20180420_100d.txt.bz2"
with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
    for data in iter(lambda : file.read(100 * 1024), b''):
        new_file.write(data)

In [None]:
%%time
# NOTE(review): this loads `filepath` (the .bz2 archive), not the decompressed
# `newfilepath` written by the previous cell — confirm which file is intended.
# NOTE(review): the file name suggests Portuguese-Wikipedia vectors while the sample
# messages are in English — confirm the embedding matches the corpus language.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False)

In [None]:
word_vectors

In [None]:
def calculate_embedding(phrase):
    """Return the mean embedding of the words of ``phrase``.

    Args:
        phrase: Iterable of word tokens. Words missing from the notebook-level
            ``word_vectors`` are ignored.

    Returns:
        1-D float64 array: the element-wise mean of the found word vectors.

    Raises:
        ValueError: if none of the words has an embedding (this case used to
            surface as an IndexError on an empty array).
    """
    # `word in word_vectors` is supported by KeyedVectors directly; the
    # `.vocab` attribute is no longer available in gensim 4.
    vectors = [word_vectors[word] for word in phrase if word in word_vectors]
    if not vectors:
        raise ValueError('no word of the phrase has an embedding')

    # Accumulate in float64, as the original zero-initialised running sum did.
    return np.asarray(vectors).mean(axis=0, dtype=np.float64)

In [None]:
def return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors):
    """Rank the candidate messages by cosine distance between mean embeddings.

    Args:
        msg: Pre-processed message (space-separated tokens).
        res: Classifier output; ``res >= 0.5`` selects ``questions`` as the
            candidate pool, anything else selects ``answers``.
        questions: Iterable of candidate question texts.
        answers: Iterable of candidate answer texts.
        word_vectors: Kept for interface compatibility. NOTE(review):
            ``calculate_embedding`` reads the notebook-level ``word_vectors``,
            not this argument — confirm that is intended.

    Returns:
        dict mapping candidate -> cosine distance, ordered from the smallest
        distance (most similar) to the largest. Candidates whose embedding
        cannot be computed are reported on stdout and left out. An empty dict
        is returned when ``msg`` itself has no embedding.
    """
    msg_list = questions if res >= 0.5 else answers

    # The message embedding does not depend on the candidate; compute it once.
    try:
        msg_vector_embedding = calculate_embedding(msg.split(' '))
    except Exception:
        print("An exception occurred")
        print('> '+ msg)
        return {}

    result = {}
    for m in msg_list:
        tokens = str(m).split(' ')
        try:
            # Store the distance under its own key right away: a failed
            # candidate is skipped without shifting the values of later ones.
            result[m] = distance.cosine(msg_vector_embedding, calculate_embedding(tokens))
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the loop.
            print("An exception occurred")
            print('> '+ ' '.join(tokens))

    return dict(sorted(result.items(), key=lambda item: item[1]))
    

In [None]:
%%time
conversations = return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors)
conversations

In [None]:
conversation = get_the_next_conversation(conversations)
conversation

### Return result

In [None]:
# Show the user message and the reply stored next to the best-matching line.
print('>>> '+msg_raw)
# Several rows can share the same pre-processed text; [0] takes the first match
# and raises IndexError when no row matches.
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)