## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [3]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Opening movie reviews

In [4]:
messages = pd.read_csv('./chatdata/movie_lines_pre_processed.tsv', delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [5]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed,target
0,L49,u0,m0,Did you change your hair?,No.,did you change your hair,1
1,L50,u3,m0,No.,You might wanna think about it,no,0
2,L51,u0,m0,You might wanna think about it,maybe...,you might wanna think about it,0
3,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
4,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0


### Processing for deep learning

In [6]:
#setting the sample data for tests
i = 0
#n = 20000
n = len(list(messages.index))
print(str(n))

257095


In [7]:
X_train, X_test, y_train, y_test = train_test_split(messages['msg_pre_processed'][i:n].astype(str), messages['target'][i:n].astype(str), test_size=0.33, random_state=42)

In [8]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [9]:
df_small['msg'] = X_train

In [10]:
df_small['target'] = y_train

In [11]:
df_small.head()

Unnamed: 0,msg,target
70522,and the people here don't want u are you kiddi...,1
255135,you need a table,0
93689,you asked me to model remember,1
165248,pete have a brother,1
249995,we wish to commune with the spirit of mr feur ...,1


In [13]:
df_small.shape

(172253, 2)

In [14]:
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [15]:
X_train

70522     and the people here don't want u are you kiddi...
255135                                     you need a table
93689                        you asked me to model remember
165248                                  pete have a brother
249995    we wish to commune with the spirit of mr feur ...
                                ...                        
119879                          we are don't worry about it
103694              do you think there is any life up there
131932    miss windham can you tell u what you'd been do...
146867                                   detective crockett
121958    in the city it happened all the time some kid ...
Name: msg_pre_processed, Length: 172253, dtype: object

In [16]:
y_train

70522     1
255135    0
93689     1
165248    1
249995    1
         ..
119879    0
103694    1
131932    1
146867    0
121958    1
Name: target, Length: 172253, dtype: object

In [17]:
# encode training data set
X_train_token = tokenizer.texts_to_matrix(X_train)

MemoryError: Unable to allocate 52.3 GiB for an array with shape (172253, 40761) and data type float64

In [None]:
X_train_token

In [None]:
X_train_token.shape

In [None]:
#set the number of rows of X_train
num_rows, num_cols = X_train_token.shape

In [None]:
classes = set(df_small['target'])
classes

In [None]:
df_small['target'] = df_small['target'].astype('int')

In [None]:
df_small.head()

### Training the model

In [None]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(20, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
%%time
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(X_train_token, df_small['target'], epochs=10, batch_size=20, verbose=1)
model.save('chatbot_model.h5', hist)

print("model created")

### Testing the prototipe

In [None]:
lemmatizer = WordNetLemmatizer()
def pre_processing_text(corpus):   
    #remove duplicated spaces
    corpus = re.sub(r' +', ' ', corpus)
    
    #capitalization
    corpus = corpus.lower()
    
    #tokenization
    corpus = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    
    #lammatization
    corpus = [lemmatizer.lemmatize(c) for c in corpus]
    
    #remove punctuation
    corpus = [t for t in corpus if t not in string.punctuation]
    
    #remove stopwords
    #it makes the model worst
    #stopwords_ = stopwords.words("english")
    #corpus = [t for t in corpus if t not in stopwords_]
    
    corpus = ' '.join(corpus)

    return corpus

In [None]:
msg_raw = 'I heard you are a good guy. Is it right?'

In [None]:
msg = pre_processing_text(msg_raw)

In [None]:
p = tokenizer.texts_to_matrix([msg])

In [None]:
p.shape

In [None]:
res = model.predict(p)

In [None]:
res

### Defining the list of questions and answers

In [None]:
questions = set(df_small[df_small['target'] == 1]['msg'])

In [None]:
answers = set(df_small[df_small['target'] == 0]['msg'])

In [None]:
answers

## Returning the conversation for the message using Jaccard Similarity

In [None]:
def jaccard_similarity(f1, f2):
    f1 = set(f1)
    f2 = set(f2)
    
    intersecao = f1.intersection(f2)
    uniao = f1.union(f2)
    
    return len(intersecao) / len(uniao)

In [None]:
def return_conversation_by_jaccard(msg, res, questions, answers):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions
        similarity = [jaccard_similarity(msg, m) for m in questions]     
    else:
        similarity = [jaccard_similarity(msg, m) for m in answers]
        msg_list = answers
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=True)}
    

In [None]:
conversations = return_conversation_by_jaccard(msg, res, questions, answers)
conversations

In [None]:
#get the first item in the dict
def get_the_next_conversation(conversations):
    keys_view = conversations.keys()
    keys_iterator = iter(keys_view)
    conversation = next(keys_iterator)
    return conversation

In [None]:
conversation = get_the_next_conversation(conversations)
conversation

### The returned message

In [None]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

## Return the result using the Cossine Similarity

In [None]:
bow = CountVectorizer()

In [None]:
def return_conversation_by_cossine(msg, res, questions, answers, bow):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions    
    else:
        msg_list = answers
       
    similarity = []
    for m in msg_list:
        new_msg_list = [msg, m]
        vector_bow = bow.fit_transform(new_msg_list)
        msg_bow = vector_bow.todense()[0]
        m_bow   = vector_bow.todense()[1]
        similarity.append(distance.cosine(msg_bow, m_bow))
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=False)}
    

In [None]:
conversations = return_conversation_by_cossine(msg, res, questions, answers, bow)
conversations

In [None]:
conversation = get_the_next_conversation(conversations)
conversation

### Return result

In [None]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

## Get result using Cossine Similarity with Embedding

In [None]:
import bz2

In [None]:
def download_embedding(get_it):
    if get_it:
        !gdown https://drive.google.com/uc?id=1zI8pGfbUHuU_0wY_FV4tD6w6ZCUJTQbh
    print('Download finished')

In [None]:
#The embedding is already downloaded
#Change to True to download
download_embedding(False)

In [None]:
%%time
#get the embedding
newfilepath = "embedding_wiki_100d_pt.txt"
filepath = "ptwiki_20180420_100d.txt.bz2"
with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
    for data in iter(lambda : file.read(100 * 1024), b''):
        new_file.write(data)

In [None]:
%%time
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False)

In [None]:
word_vectors

In [None]:
def calculate_embedding(phrase):
    """
    Return the mean of embeddings of a phrase
    """
    
    arr = np.array([word_vectors[word] for word in phrase if word in word_vectors.vocab])
    
    sum = np.zeros(len(arr[0]))
    for a in arr:
        sum = sum + a
        
    arr_mean = sum / len(arr) 
    
    return arr_mean

In [None]:
def return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions    
    else:
        msg_list = answers       
    
    msg = msg.split(' ')
    
    similarity = []
    for m in msg_list:        
        m = m.split(' ')
        
        try:
            msg_vector_embedding = calculate_embedding(msg)
            m_vector_embedding   = calculate_embedding(m)
        
            similarity.append(distance.cosine(msg_vector_embedding, m_vector_embedding))
        except:
            print("An exception occurred")
            print('> '+ ' '.join(m))
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=False)}
    

In [None]:
%%time
conversations = return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors)
conversations

In [None]:
conversation = get_the_next_conversation(conversations)
conversation

### Return result

In [None]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)