## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [391]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random
import bz2
import itertools
from keras.callbacks import ModelCheckpoint, EarlyStopping

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [392]:
#expand jupyter cells
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Opening movie reviews

In [393]:
messages = pd.read_csv('./chatdata/movie_lines_pre_processed.tsv', delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [394]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed,target
0,L49,u0,m0,Did you change your hair?,No.,you change your hair,1
1,L50,u3,m0,No.,You might think about it,no,0
2,L51,u0,m0,You might wanna think about it,can you explain it better?,you might think about it,0
3,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
4,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0


### Processing for deep learning

In [395]:
#setting the sample data for tests
i = 0
n = 20000

In [396]:
X_train, X_test, y_train, y_test = train_test_split(messages['msg_pre_processed'][i:n].astype(str), messages['target'][i:n].astype(str), test_size=0.33, stratify=messages['target'][i:n], random_state=42)

In [397]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [398]:
df_small['msg_pre_processed'] = X_train

In [399]:
df_small['target'] = y_train

In [400]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
6272,switchboard how may i direct your call,0
159,i cant date her sister until that one get a bo...,0
16688,evening baxter,0
16365,general the severe food shortage that face the...,1
6940,no im all alone,0


In [401]:
df_small.shape

(13400, 2)

In [402]:
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [458]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [457]:
X_train

6272                switchboard how may i direct your call
159      i cant date her sister until that one get a bo...
16688                                       evening baxter
16365    general the severe food shortage that face the...
6940                                       no im all alone
                               ...                        
2434                     she dropped in on me holding this
5487                                         saying did it
11969                                that wa a great night
11056                               not far enough come on
10839            of course now away before i get any older
Name: msg_pre_processed, Length: 13400, dtype: object

In [404]:
y_train

6272     0
159      0
16688    0
16365    1
6940     0
        ..
2434     0
5487     1
11969    0
11056    0
10839    0
Name: target, Length: 13400, dtype: object

In [405]:
# encode training data set
X_train_token = tokenizer.texts_to_matrix(X_train)

In [406]:
X_train_token

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [407]:
X_train_token.shape

(13400, 9805)

In [408]:
#set the number of rows of X_train
num_rows, num_cols = X_train_token.shape

In [409]:
classes = set(df_small['target'])
classes

{'0', '1'}

In [410]:
df_small['target'] = df_small['target'].astype('int')

In [411]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
6272,switchboard how may i direct your call,0
159,i cant date her sister until that one get a bo...,0
16688,evening baxter,0
16365,general the severe food shortage that face the...,1
6940,no im all alone,0


### Search for the best parameters

In [412]:
def create_model(X, y, activation='relu', momentum=0.9, learn_rate=0.01, decay=1e-6,
                 dropout_rate=0.5, weight_constraint=1, neurons=20, init='uniform',
                 optimizer='SGD', nesterov=False, num_cols=10, pos_fix='',
                 epochs=10, validation_split=0.3, batch_size=20):
        
    model = Sequential()
    model.add(Dense(neurons, input_dim=num_cols, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons/2, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    
    #model.summary()
    
    # Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
    sgd = SGD(lr=learn_rate, decay=decay, momentum=momentum, nesterov=nesterov)
    model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])
    
    callbacks = [EarlyStopping(monitor='val_accuracy', patience=3, verbose=0),
                tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),]
    
    hist = model.fit(X, y, epochs=epochs, validation_split=validation_split, batch_size=batch_size, verbose=1, callbacks=callbacks)
    
    model_name = './models/chatbot_model_'+ pos_fix +'_.h5'
    #model.save(model_name, hist)

    print('model '+ model_name +' created')
    
    return model

### Training the model with fixed parameters

In [413]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(20, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 20)                196120    
_________________________________________________________________
dropout_22 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 10)                210       
_________________________________________________________________
dropout_23 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 11        
Total params: 196,341
Trainable params: 196,341
Non-trainable params: 0
_________________________________________________________________


In [414]:
%%time
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])

CPU times: user 15.6 ms, sys: 15.6 ms, total: 31.2 ms
Wall time: 39.2 ms


In [415]:
%%time

callbacks = [EarlyStopping(monitor='val_accuracy', patience=10, verbose=0),
                ModelCheckpoint(filepath='model.{epoch:02d}-{val_accuracy:.2f}.h5'),
            ]

#fitting and saving the model
hist = model.fit(X_train_token, df_small['target'], epochs=500, validation_split=0.3, batch_size=20, verbose=1, callbacks=callbacks)
model.save('chatbot_model.h5', hist)

print("model created")

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
model created
CPU times: user 1min 33s, sys: 11.5 s, total: 1min 45s
Wall time: 1min 1s


### Testing the prototipe

In [416]:
from tensorflow.keras.models import load_model

In [425]:
lemmatizer = WordNetLemmatizer()
def pre_processing_text(corpus):
    #remove html tags
    corpus = re.sub(r'<.*?>', '', str(corpus))
    
    #remove non-alphanumeric characters
    corpus = re.sub(r'[^a-z A-Z 0-9 \s]', '', str(corpus))
    
    #remove duplicated spaces
    corpus = re.sub(r' +', ' ', str(corpus))
    
    #capitalization
    corpus = corpus.lower()
    
    #tokenization
    corpus = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    
    #lammatization
    corpus = [lemmatizer.lemmatize(c) for c in corpus]
    
    #remove punctuation
    corpus = [t for t in corpus if t not in string.punctuation]
    
    #remove stopwords
    #it makes the model worst
    #stopwords_ = stopwords.words("english")
    #corpus = [t for t in corpus if t not in stopwords_]
    
    corpus = ' '.join(corpus)

    return corpus

In [426]:
msg_raw = 'I heard you are a good guy. Is it right?'

In [427]:
msg = pre_processing_text(msg_raw)

In [428]:
p = tokenizer.texts_to_matrix([msg])

In [429]:
p.shape

(1, 9805)

In [430]:
res = model.predict(p)



In [432]:
res

array([[0.79494965]], dtype=float32)

### Defining the list of questions and answers

In [433]:
questions = set(df_small[df_small['target'] == 1]['msg_pre_processed'])

In [434]:
answers = set(df_small[df_small['target'] == 0]['msg_pre_processed'])

In [435]:
answers

{'im sorry to hear that',
 'thats my max',
 'i know you did but i assure you there wa an impending failure',
 'i wasnt finished',
 'whatever you say',
 'we have to prepare you for an audience with sophie',
 'im going to see what else i can find out about mr',
 'yeah yeah i sorta dabble around you know',
 'this is real cooperation i appreciate it',
 'we are launching a major offensive to expand our foraging territory',
 'she wa european',
 'once in church dude',
 'ettore',
 'i cant listen to this',
 'im not no',
 'two minute and already youre a dead man let passion overwhelm you colon',
 'of course it would be very easy for to find out now',
 'wait stop you dont seem to understand not really comprehending any of this i killed him i did it carnes im bateman i chopped owen fucking head off i tortured dozen of girl the whole message i left on your machine wa true',
 'nooooo',
 'at the school crossing in his bmw hurt some kid im gonna bust his as',
 'he busy being dead',
 'i know how you fe

## Returning the conversation for the message using Jaccard Similarity

In [436]:
def jaccard_similarity(f1, f2):
    f1 = set(f1)
    f2 = set(f2)
    
    intersecao = f1.intersection(f2)
    uniao = f1.union(f2)
    
    return len(intersecao) / len(uniao)

In [437]:
def return_conversation_by_jaccard(msg, res, questions, answers):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions
        similarity = [jaccard_similarity(msg, m) for m in questions]     
    else:
        similarity = [jaccard_similarity(msg, m) for m in answers]
        msg_list = answers
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=True)}
    

In [438]:
conversations = return_conversation_by_jaccard(msg, res, questions, answers)
conversations

{'is this your idea': 0.9230769230769231,
 'staedert whats going on you destroy it': 0.8666666666666667,
 'so you need to change right': 0.8666666666666667,
 'hi you have hard shell right': 0.8666666666666667,
 'why are you doing this': 0.8666666666666667,
 'so what are you doing with a dog': 0.8666666666666667,
 'is there anything you need money': 0.8666666666666667,
 'do you feel all right sir': 0.8666666666666667,
 'thats right you see that mr get it': 0.8571428571428571,
 'you hear what i said': 0.8571428571428571,
 'hey did you hear what he said': 0.8571428571428571,
 'who is that other guy': 0.8571428571428571,
 'whered you get this': 0.8571428571428571,
 'got his deer yet': 0.8461538461538461,
 'right see ready for the quiz': 0.8125,
 'do you want me to stay here tonight': 0.8125,
 'distracting enough for you': 0.8125,
 'then why the hell are you sitting around here': 0.8125,
 'hey dont you think a hair stylist got any interest in gettin it on': 0.8125,
 'why didnt you do anythi

In [439]:
#get the first item in the dict
def get_the_next_conversation(conversations):
    keys_view = conversations.keys()
    keys_iterator = iter(keys_view)
    conversation = next(keys_iterator)
    return conversation

In [440]:
conversation = get_the_next_conversation(conversations)
conversation

'is this your idea'

### The returned message

In [441]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

>>> I heard you are a good guy. Is it right?
<<< Mine?


## Return the result using the Cossine Similarity

In [442]:
bow = CountVectorizer()

In [443]:
def return_conversation_by_cossine(msg, res, questions, answers, bow):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions    
    else:
        msg_list = answers
       
    similarity = []
    for m in msg_list:
        new_msg_list = [msg, m]
        vector_bow = bow.fit_transform(new_msg_list)
        msg_bow = vector_bow.todense()[0]
        m_bow   = vector_bow.todense()[1]
        similarity.append(distance.cosine(msg_bow, m_bow))
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=False)}
    

In [444]:
conversations = return_conversation_by_cossine(msg, res, questions, answers, bow)
conversations

{'what is it patrick are you alright': 0.4654775161751512,
 'are you': 0.5,
 'is that it is this right': 0.5,
 'is it': 0.5,
 'what is it you want': 0.5256583509747431,
 'is it is it really': 0.5285954792089683,
 'is it is this what you want': 0.5285954792089683,
 'andy what is it andy are you okay': 0.5527864045000421,
 'is he with you are you travelling together': 0.5527864045000421,
 'is it two or is it three have you ever really noticed': 0.5580582617584078,
 'this is good who did it': 0.5669872981077806,
 'not having a good time are you': 0.5669872981077806,
 'are you sure it an opera': 0.5669872981077806,
 'no he absolutely right absolutely right ryan talk must be banned canine conversation are completely discouraged it really good of you to join u can i get you a drink': 0.5691797815723354,
 'so mom truly no grudge truly a little odd that you didnt come to see me when you heard i wa hurt but the important thing i want you to know is your son is happy im working again ill make do

In [445]:
conversation = get_the_next_conversation(conversations)
conversation

'what is it patrick are you alright'

### Return result

In [446]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

>>> I heard you are a good guy. Is it right?
<<< Stop sounding so Fucking sad! Jesus!


## Get result using Cossine Similarity with Embedding

In [447]:
def download_embedding(get_it):
    if get_it:
        !gdown https://drive.google.com/uc?id=1zI8pGfbUHuU_0wY_FV4tD6w6ZCUJTQbh
    print('Download finished')

In [448]:
#The embedding is already downloaded
#Change to True to download
download_embedding(False)

Download finished


In [449]:
%%time
#get the embedding
newfilepath = "embedding_wiki_100d_pt.txt"
filepath = "ptwiki_20180420_100d.txt.bz2"
with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
    for data in iter(lambda : file.read(100 * 1024), b''):
        new_file.write(data)

CPU times: user 1min 1s, sys: 891 ms, total: 1min 1s
Wall time: 1min 4s


In [450]:
%%time
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False)

CPU times: user 3min 42s, sys: 15.6 s, total: 3min 57s
Wall time: 4min 53s


In [451]:
word_vectors

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f3793a02a00>

In [452]:
def calculate_embedding(phrase):
    """
    Return the mean of embeddings of a phrase
    """
    
    arr = np.array([word_vectors[word] for word in phrase if word in word_vectors.vocab])
    
    sum = np.zeros(len(arr[0]))
    for a in arr:
        sum = sum + a
        
    arr_mean = sum / len(arr) 
    
    return arr_mean

In [453]:
def return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors):
    """
    Return a dictionary of message and similarity sorted by highter similarity
    """
    if res >= 0.5:
        msg_list = questions    
    else:
        msg_list = answers       
    
    msg = msg.split(' ')
    
    similarity = []
    for m in msg_list:        
        m = m.split(' ')
        
        try:
            msg_vector_embedding = calculate_embedding(msg)
            m_vector_embedding   = calculate_embedding(m)
        
            similarity.append(distance.cosine(msg_vector_embedding, m_vector_embedding))
        except:
            print("An exception occurred")
            print('> '+ ' '.join(m))
    
    result = {} 
    for key in msg_list: 
        for value in similarity: 
            result[key] = value
            similarity.remove(value) 
            break 
    
    return {k: v for k, v in sorted(result.items(), key=lambda item: item[1], reverse=False)}
    

In [454]:
%%time
conversations = return_conversation_by_cossine_embedding(msg, res, questions, answers, word_vectors)
conversations

An exception occurred
> fanfuckingtastic
An exception occurred
> cornilius
An exception occurred
> airsick
An exception occurred
> andand
An exception occurred
> magdelen
An exception occurred
> notbusy
An exception occurred
> segretti
An exception occurred
> cspan
An exception occurred
> whatisit
An exception occurred
> autoshop
An exception occurred
> wheres irth
An exception occurred
> divinely
An exception occurred
> mommymommy
An exception occurred
> verdell
An exception occurred
> didnt
CPU times: user 922 ms, sys: 31.2 ms, total: 953 ms
Wall time: 992 ms


{'what again': 0.0277536050874172,
 'this is your plan finch': 0.027874825816351256,
 'what happened': 0.02798507136473971,
 'are you alright wade': 0.02993621271078506,
 'why did you break into that particular building': 0.02995900551470454,
 'i got a bad feeling about this how much do you need': 0.03042500298242723,
 'why cant i ever fall in love with nice like you': 0.031356231883946206,
 'have you ever been in love': 0.031382567448420495,
 'so you cant tell me anything': 0.03160381381780497,
 'mr sheldrake yes very': 0.031605741698843226,
 'so whats it going to be coogans or u vayvudoo': 0.03206581654738938,
 'to put me to bed take my clothes off hold my head tuck me in turn off the light tiptoe out would wouldnt you eve': 0.03250713048281717,
 'always count on old for a good screwing why should i always pay more sell cheap why should you guy make all the money you and and jack right right': 0.033039857465363864,
 'are you herr mozart': 0.03304433045380917,
 'you got that': 0.03329

In [455]:
conversation = get_the_next_conversation(conversations)
conversation

'what again'

### Return result

In [456]:
print('>>> '+msg_raw)
msg2 = list(messages[messages['msg_pre_processed'] == conversation]['msg_2'])[0]
print('<<< '+msg2)

>>> I heard you are a good guy. Is it right?
<<< I could watch you play that last scene a thousand times and cry every time-
