## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [1]:
!pip3 install gensim
!pip3 install tensorflow
!pip3 install keras



In [2]:
import string
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Opening movie reviews

In [3]:
def load_data():
  from google.colab import drive
  drive.mount('/content/drive/')
  import os
  os.chdir('/content/drive/My Drive/Colab Notebooks/NLP/')
  !ls
  !head eventos.csv

In [4]:
load_data()

Mounted at /content/drive/
 Aula_1_IAAM_O3_1_Douglas_Cardoso.ipynb      chatbot_model.h5
'Aula 2 - IAAM_3_1_Douglas_Cardoso.ipynb'    chatdata
 Aula_3_IAAM_3_1_Douglas_Cardoso.ipynb	    'TF - Chatbot.ipynb'
'Aula 4 - IAAM 03_1_Douglas_Cardoso.ipynb'  'TF - Chatbot-v2.ipynb'
'Aula 5 - IAAM 31 - Douglas_Cardoso.ipynb'
head: cannot open 'eventos.csv' for reading: No such file or directory


In [5]:
messages = pd.read_csv('./chatdata/movie_lines.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [6]:
messages.head(10)

Unnamed: 0,0,1,2,3,4
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
5,L924,u2,m0,CAMERON,Wow
6,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,CAMERON,No
8,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


### Cleaning the index of the conversations

In [7]:
messages.columns = ['msg_line', 'user1_id', 'movie_id', 'user_name', 'msg']

In [8]:
def remove_char(txt):
    return re.sub('[^0-9]','', txt)

In [9]:
#leaving just the number of the index, so L872 changes to 872
messages['msg_line_clean'] = [remove_char(msg) for msg in messages['msg_line']]

In [10]:
#change the column type to number
messages['msg_line_clean'] = pd.to_numeric(messages['msg_line_clean'])

In [11]:
messages.sort_values(by=['msg_line_clean'])

Unnamed: 0,msg_line,user1_id,movie_id,user_name,msg,msg_line_clean
86,L49,u0,m0,BIANCA,Did you change your hair?,49
85,L50,u3,m0,CHASTITY,No.,50
84,L51,u0,m0,BIANCA,You might wanna think about it,51
648,L59,u9,m0,PATRICK,I missed you.,59
647,L60,u8,m0,MISS PERKY,It says here you exposed yourself to a group o...,60
...,...,...,...,...,...,...
304704,L666522,u9034,m616,VEREKER,So far only their scouts. But we have had repo...,666522
304679,L666546,u9027,m616,CHELMSFORD,Splendid site Crealock splendil I want to esta...,666546
304678,L666547,u9029,m616,CREALOCK,Certainly Sin,666547
304696,L666575,u9028,m616,COGHILL,Choose your targets men. That's right Watch th...,666575


In [12]:
#set the column as the index
messages = messages.set_index('msg_line_clean')

In [13]:
messages.head(10)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1045,L1045,u0,m0,BIANCA,They do not!
1044,L1044,u2,m0,CAMERON,They do to!
985,L985,u0,m0,BIANCA,I hope so.
984,L984,u2,m0,CAMERON,She okay?
925,L925,u0,m0,BIANCA,Let's go.
924,L924,u2,m0,CAMERON,Wow
872,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
871,L871,u2,m0,CAMERON,No
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
869,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


### Opening conversation sequence

In [14]:
#read the file with the conversation sequence
conv_seq = pd.read_csv('./chatdata/movie_conversations.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [15]:
conv_seq.head(10)

Unnamed: 0,0,1,2,3
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197']
1,u0,u2,m0,['L198' 'L199']
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203']
3,u0,u2,m0,['L204' 'L205' 'L206']
4,u0,u2,m0,['L207' 'L208']
5,u0,u2,m0,['L271' 'L272' 'L273' 'L274' 'L275']
6,u0,u2,m0,['L276' 'L277']
7,u0,u2,m0,['L280' 'L281']
8,u0,u2,m0,['L363' 'L364']
9,u0,u2,m0,['L365' 'L366']


In [16]:
conv_seq.columns = ['user1_id', 'user2_id', 'movie_id', 'sequence']

In [17]:
conv_seq.head(10)

Unnamed: 0,user1_id,user2_id,movie_id,sequence
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197']
1,u0,u2,m0,['L198' 'L199']
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203']
3,u0,u2,m0,['L204' 'L205' 'L206']
4,u0,u2,m0,['L207' 'L208']
5,u0,u2,m0,['L271' 'L272' 'L273' 'L274' 'L275']
6,u0,u2,m0,['L276' 'L277']
7,u0,u2,m0,['L280' 'L281']
8,u0,u2,m0,['L363' 'L364']
9,u0,u2,m0,['L365' 'L366']


### Build conversation sequence

In [18]:
def split_conversation(txt):
    txt_alt = txt.split(' ')
    return txt_alt

In [19]:
def seq_to_list(seq):
    seq_list = [remove_char(s) for s in seq]
    return seq_list

In [20]:
#initializing the msg_2 column
messages['msg_2'] = '-'

In [21]:
def link_conversations(seq_list, df, filter1, filter2):
    i = 0
    while i in range(len(seq_list)):
        if i+1 < len(seq_list):
            next_msg = df.loc[int(seq_list[i+1]), filter1]
            df.at[int(seq_list[i]), filter2] = next_msg
        i+=1

In [22]:
#link each message with its answer
for conv in conv_seq['sequence']:
    #split each sequence by space
    seq = split_conversation(conv)

    #remove the char L from the sequences
    txt_alt = [remove_char(s) for s in seq]

    #use the conversation sequence to build the target answer for each message
    link_conversations(txt_alt, messages, 'msg', 'msg_2')

In [23]:
messages.head(30)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1045,L1045,u0,m0,BIANCA,They do not!,-
1044,L1044,u2,m0,CAMERON,They do to!,They do not!
985,L985,u0,m0,BIANCA,I hope so.,-
984,L984,u2,m0,CAMERON,She okay?,I hope so.
925,L925,u0,m0,BIANCA,Let's go.,-
924,L924,u2,m0,CAMERON,Wow,Let's go.
872,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.,-
871,L871,u2,m0,CAMERON,No,Okay -- you're gonna need to learn how to lie.
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,No
869,L869,u0,m0,BIANCA,Like my fear of wearing pastels?,-


### Processing for deep learning

In [24]:
#return generic answer
def generic_answer(txt):
  asw_list = ['talk more about it',
              'can you explain it better?',
              'i need to think more about it',
              'maybe...'
              ]
  if txt == '-':
    return random.choice(asw_list)
  return txt

In [25]:
#seting a generic answer to the messages without answer
messages['msg_2'] = [generic_answer(msg) for msg in messages['msg_2']]

In [26]:
messages.head(30)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1045,L1045,u0,m0,BIANCA,They do not!,talk more about it
1044,L1044,u2,m0,CAMERON,They do to!,They do not!
985,L985,u0,m0,BIANCA,I hope so.,talk more about it
984,L984,u2,m0,CAMERON,She okay?,I hope so.
925,L925,u0,m0,BIANCA,Let's go.,can you explain it better?
924,L924,u2,m0,CAMERON,Wow,Let's go.
872,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.,can you explain it better?
871,L871,u2,m0,CAMERON,No,Okay -- you're gonna need to learn how to lie.
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,No
869,L869,u0,m0,BIANCA,Like my fear of wearing pastels?,maybe...


In [27]:
lemmatizer = WordNetLemmatizer()
def pre_processamento_texto(corpus):
    #remove duplicated spaces
    corpus_alt = re.sub(r' +', ' ', corpus)
    #capitalizacao
    corpus_alt = corpus_alt.lower()
    #lammatization
    sentence_words = lemmatizer.lemmatize(corpus_alt)
    #remover pontuacoes
    #corpus_alt = [t for t in corpus_alt if t not in string.punctuation]
    #corpus_alt = ''.join(corpus_alt)

    return corpus_alt

In [28]:
#cleaning the messages
messages["msg"] = [pre_processamento_texto(str(msg)) for msg in messages["msg"]]
messages["msg_2"] = [pre_processamento_texto(str(msg)) for msg in messages["msg_2"]]

In [29]:
messages.head(10)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1045,L1045,u0,m0,BIANCA,they do not!,talk more about it
1044,L1044,u2,m0,CAMERON,they do to!,they do not!
985,L985,u0,m0,BIANCA,i hope so.,talk more about it
984,L984,u2,m0,CAMERON,she okay?,i hope so.
925,L925,u0,m0,BIANCA,let's go.,can you explain it better?
924,L924,u2,m0,CAMERON,wow,let's go.
872,L872,u0,m0,BIANCA,okay -- you're gonna need to learn how to lie.,can you explain it better?
871,L871,u2,m0,CAMERON,no,okay -- you're gonna need to learn how to lie.
870,"""L870",u0,m0,BIANCA,i'm kidding. you know how sometimes you just b...,no
869,L869,u0,m0,BIANCA,like my fear of wearing pastels?,maybe...


In [30]:
#setting the sample data for tests
i = 0
n = 2000

In [31]:
X_train, X_test, y_train, y_test = train_test_split(messages['msg'][i:n].astype(str), messages['msg_2'][i:n].astype(str), test_size=0.33, random_state=42)

In [32]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [33]:
df_small['msg'] = X_train

In [34]:
df_small['msg_2'] = y_train

In [35]:
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [36]:
X_train

msg_line_clean
573                                                 joey.
2204    you don't know anything! listen colon these ar...
3169                              what are you asking me?
132                                    an attempted slit.
3158    the men are out of quarters - practicing putti...
                              ...                        
2976         do you have any czech girls working for you?
3036    um...now is not a good time okay. detective ja...
1947                             god... that's in a week!
2638    because i don't like your ugly language. i hea...
2980                                 what are you saying?
Name: msg, Length: 1340, dtype: object

In [37]:
# encode training data set
X_train_token = tokenizer.texts_to_matrix(X_train)
# encode training data set
y_train_token = tokenizer.texts_to_matrix(y_train)

In [38]:
X_train_token

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 1., 1.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [39]:
X_train_token.shape

(1340, 2348)

In [40]:
#set the number of rows of X_train
num_rows, num_cols = X_train_token.shape

In [41]:
#create a label for each message using the similarity of the message and its answer
list_similarity = []
for i in range(num_rows):
    d = distance.cosine(X_train_token[i], y_train_token[i])
    if math.isnan(d):
        d = 0.0
    list_similarity.append(d*10)
    i+=0

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [42]:
#creating a column with the target
df_small['similarity'] = list_similarity

In [43]:
df_small = df_small.sort_index()
df_small

Unnamed: 0_level_0,msg,msg_2,similarity
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49,did you change your hair?,no.,10.000000
50,no.,you might wanna think about it,10.000000
51,you might wanna think about it,talk more about it,5.917517
59,i missed you.,it says here you exposed yourself to a group o...,8.333333
60,it says here you exposed yourself to a group o...,it was a bratwurst. i was eating lunch.,7.817821
...,...,...,...
5380,you are an awesomely weird cop. sure wish ther...,no you don't. if i ever get word of you steppi...,9.440983
5381,no you don't. if i ever get word of you steppi...,spare met jack. i'm into legit investments fro...,9.325800
5382,spare met jack. i'm into legit investments fro...,talk more about it,10.000000
5383,thanks.,no trouble jack. but listen suppose i stay a c...,10.000000


In [44]:
#changing the target to integer
#df_small = df_small.round({'similarity': 0})
df_small = df_small.astype({'similarity': 'int32'})
df_small

Unnamed: 0_level_0,msg,msg_2,similarity
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49,did you change your hair?,no.,10
50,no.,you might wanna think about it,10
51,you might wanna think about it,talk more about it,5
59,i missed you.,it says here you exposed yourself to a group o...,8
60,it says here you exposed yourself to a group o...,it was a bratwurst. i was eating lunch.,7
...,...,...,...
5380,you are an awesomely weird cop. sure wish ther...,no you don't. if i ever get word of you steppi...,9
5381,no you don't. if i ever get word of you steppi...,spare met jack. i'm into legit investments fro...,9
5382,spare met jack. i'm into legit investments fro...,talk more about it,10
5383,thanks.,no trouble jack. but listen suppose i stay a c...,10


In [45]:
classes = set(df_small['similarity'])
classes

{0, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [46]:
#changing the class 10 to 1 to make the continous range
df_small = df_small.reset_index()

for i in range(len(df_small['similarity'])):
  if df_small['similarity'][i] == 10:
    df_small['similarity'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
df_small.head()

Unnamed: 0,msg_line_clean,msg,msg_2,similarity
0,49,did you change your hair?,no.,1
1,50,no.,you might wanna think about it,1
2,51,you might wanna think about it,talk more about it,5
3,59,i missed you.,it says here you exposed yourself to a group o...,8
4,60,it says here you exposed yourself to a group o...,it was a bratwurst. i was eating lunch.,7


In [48]:
classes = set(df_small['similarity'])
classes

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

### Training the model

In [63]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(128, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 128)               300672    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 308,993
Trainable params: 308,993
Non-trainable params: 0
_________________________________________________________________


In [64]:
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='Poisson', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(X_train_token, df_small['similarity'], epochs=10, batch_size=20, verbose=1)
model.save('chatbot_model.h5', hist)

print("model created")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
model created


### Testing the prototipe

In [53]:
p = tokenizer.texts_to_matrix(['here\'s the file. cates checks the file.'])

In [54]:
p.shape

(1, 2348)

In [55]:
res = model.predict(p)

In [56]:
res

array([[0.]], dtype=float32)

#### Keras Template model

In [None]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(2, input_dim=4, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))

model.summary()

In [None]:

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(([1,2,3,4],[1,2,3,4],[1,2,3,4]), [1,2,3], epochs=2, batch_size=3, verbose=1)
model.save('chatbot_model.h5', hist)

print("model created")