## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [3]:
!pip3 install gensim
!pip3 install tensorflow
!pip3 install keras



In [4]:
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math

### Opening movie reviews

In [18]:
messages = pd.read_csv('./chatdata/movie_lines.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [19]:
messages.head(10)

Unnamed: 0,0,1,2,3,4
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
5,L924,u2,m0,CAMERON,Wow
6,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,CAMERON,No
8,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


### Cleaning the index of the conversations

In [20]:
messages.columns = ['msg_line', 'user1_id', 'movie_id', 'user_name', 'msg']

In [21]:
def remove_char(txt):
    return re.sub('[^0-9]','', txt)

In [22]:
messages['msg_line_clean'] = [remove_char(msg) for msg in messages['msg_line']]

In [23]:
messages.sort_values(by=['msg_line_clean'])

Unnamed: 0,msg_line,user1_id,movie_id,user_name,msg,msg_line_clean
561,L1000,u11,m0,WALTER,Oh Christ. Don't tell me you've changed your ...,1000
119574,L10000,u3525,m232,LINDSEY,Oh... chamber runs. Uh huh that's good. Well...,10000
142618,L100000,u4168,m278,JOANNE,No.,100000
142617,L100001,u4166,m278,DULANEY,Then why did you go see Mr. Koehler in the fir...,100001
142616,L100002,u4168,m278,JOANNE,Because he called me.,100002
...,...,...,...,...,...,...
142623,L99995,u4168,m278,JOANNE,What do you think I've got? A gun? Maybe I'm ...,99995
142622,L99996,u4168,m278,JOANNE,I'm gonna go to jail. I know they're gonna ma...,99996
142621,L99997,u4166,m278,DULANEY,Why'd you come here?,99997
142620,L99998,u4168,m278,JOANNE,To show you this. It's a letter from that law...,99998


In [24]:
messages.head(10)

Unnamed: 0,msg_line,user1_id,movie_id,user_name,msg,msg_line_clean
0,L1045,u0,m0,BIANCA,They do not!,1045
1,L1044,u2,m0,CAMERON,They do to!,1044
2,L985,u0,m0,BIANCA,I hope so.,985
3,L984,u2,m0,CAMERON,She okay?,984
4,L925,u0,m0,BIANCA,Let's go.,925
5,L924,u2,m0,CAMERON,Wow,924
6,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.,872
7,L871,u2,m0,CAMERON,No,871
8,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,870
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?,869


In [25]:
messages['msg_line_clean'] = pd.to_numeric(messages['msg_line_clean'])

In [30]:
messages = messages.sort_values(by=['msg_line_clean'])

In [31]:
messages.head(30)

Unnamed: 0,msg_line,user1_id,movie_id,user_name,msg,msg_line_clean
86,L49,u0,m0,BIANCA,Did you change your hair?,49
85,L50,u3,m0,CHASTITY,No.,50
84,L51,u0,m0,BIANCA,You might wanna think about it,51
648,L59,u9,m0,PATRICK,I missed you.,59
647,L60,u8,m0,MISS PERKY,It says here you exposed yourself to a group o...,60
646,L61,u9,m0,PATRICK,It was a bratwurst. I was eating lunch.,61
645,L62,u8,m0,MISS PERKY,With the teeth of your zipper?,62
266,L63,u7,m0,MICHAEL,You the new guy?,63
265,L64,u2,m0,CAMERON,So they tell me...,64
264,L65,u7,m0,MICHAEL,C'mon. I'm supposed to give you the tour.,65


In [16]:
messages = messages.set_index('msg_line_clean')

In [17]:
messages.head(10)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1045,L1045,u0,m0,BIANCA,They do not!
1044,L1044,u2,m0,CAMERON,They do to!
985,L985,u0,m0,BIANCA,I hope so.
984,L984,u2,m0,CAMERON,She okay?
925,L925,u0,m0,BIANCA,Let's go.
924,L924,u2,m0,CAMERON,Wow
872,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
871,L871,u2,m0,CAMERON,No
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
869,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


### Opening conversation sequence

In [12]:
conv_seq = pd.read_csv('./chatdata/movie_conversations.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [13]:
conv_seq.head(10)

Unnamed: 0,0,1,2,3
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197']
1,u0,u2,m0,['L198' 'L199']
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203']
3,u0,u2,m0,['L204' 'L205' 'L206']
4,u0,u2,m0,['L207' 'L208']
5,u0,u2,m0,['L271' 'L272' 'L273' 'L274' 'L275']
6,u0,u2,m0,['L276' 'L277']
7,u0,u2,m0,['L280' 'L281']
8,u0,u2,m0,['L363' 'L364']
9,u0,u2,m0,['L365' 'L366']


In [14]:
conv_seq.columns = ['user1_id', 'user2_id', 'movie_id', 'sequence']

In [15]:
conv_seq.head(10)

Unnamed: 0,user1_id,user2_id,movie_id,sequence
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197']
1,u0,u2,m0,['L198' 'L199']
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203']
3,u0,u2,m0,['L204' 'L205' 'L206']
4,u0,u2,m0,['L207' 'L208']
5,u0,u2,m0,['L271' 'L272' 'L273' 'L274' 'L275']
6,u0,u2,m0,['L276' 'L277']
7,u0,u2,m0,['L280' 'L281']
8,u0,u2,m0,['L363' 'L364']
9,u0,u2,m0,['L365' 'L366']


### Build conversation

In [16]:
def split_conversation(txt):
    txt_alt = txt.split(' ')
    return txt_alt

In [17]:
def seq_to_list(seq):
    seq_list = [remove_char(s) for s in seq]
    return seq_list

In [18]:
messages['msg_2'] = '-'

In [19]:
def link_conversations(seq_list, df, filter1, filter2):
    i = 0
    while i in range(len(seq_list)):
        if i+1 < len(seq_list):
            next_msg = df.loc[int(seq_list[i+1]), filter1]
            #print(str(i))
            #print(next_msg)
            #print(seq_list[i])
            df.at[int(seq_list[i]), filter2] = next_msg
        i+=1

In [20]:
link_conversations(['194', '195', '196', '197'], messages, 'msg', 'msg_2')

In [21]:
messages.loc[195, 'msg']

"Well I thought we'd start with pronunciation if that's okay with you."

In [22]:
for conv in conv_seq['sequence']:
    #print(conv)
    seq = split_conversation(conv)
    #print(seq)
    #txt_alt = remove_char(txt_alt)      
    txt_alt = [remove_char(s) for s in seq]
    #print(txt_alt)
    link_conversations(txt_alt, messages, 'msg', 'msg_2')

In [23]:
messages.loc[194, 'msg_2']

"Well I thought we'd start with pronunciation if that's okay with you."

In [24]:
messages.head(30)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1045,L1045,u0,m0,BIANCA,They do not!,-
1044,L1044,u2,m0,CAMERON,They do to!,They do not!
985,L985,u0,m0,BIANCA,I hope so.,-
984,L984,u2,m0,CAMERON,She okay?,I hope so.
925,L925,u0,m0,BIANCA,Let's go.,-
924,L924,u2,m0,CAMERON,Wow,Let's go.
872,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.,-
871,L871,u2,m0,CAMERON,No,Okay -- you're gonna need to learn how to lie.
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,No
869,L869,u0,m0,BIANCA,Like my fear of wearing pastels?,-


### Processing for deep learning

In [25]:
#keeping the just the pair of messages
df = messages[messages['msg_2'] != '-']

In [26]:
df.head(10)

Unnamed: 0_level_0,msg_line,user1_id,movie_id,user_name,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1044,L1044,u2,m0,CAMERON,They do to!,They do not!
984,L984,u2,m0,CAMERON,She okay?,I hope so.
924,L924,u2,m0,CAMERON,Wow,Let's go.
871,L871,u2,m0,CAMERON,No,Okay -- you're gonna need to learn how to lie.
870,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...,No
868,"""L868",u2,m0,CAMERON,"The """"real you"""".""",Like my fear of wearing pastels?
867,L867,u0,m0,BIANCA,What good stuff?,"The """"real you""""."""
866,L866,u2,m0,CAMERON,I figured you'd get to the good stuff eventually.,What good stuff?
864,L864,u0,m0,BIANCA,Me. This endless ...blonde babble. I'm like b...,Thank God! If I had to hear one more story ab...
863,L863,u2,m0,CAMERON,What crap?,Me. This endless ...blonde babble. I'm like b...


In [27]:
def pre_processamento_texto(corpus):
    #tokenizacao
    corpus_alt = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    #capitalizacao
    corpus_alt = [t.lower() for t in corpus_alt]
    #lammatization
    #sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    
    #stemming
    #TODO
    
    #remover stopwords
    #stopwords_ = stopwords.words("portuguese")
    #corpus_alt = [t for t in corpus_alt if t not in stopwords_]
    #remover numero
    #corpus_alt = [re.sub(r"\d","",t) for t in corpus_alt]
    #remover pontuacoes
    #corpus_alt = [t for t in corpus_alt if t not in string.punctuation]

    return corpus_alt

In [53]:
i = 0
n = 2000

In [72]:
X_train, X_test, y_train, y_test = train_test_split(df['msg'][i:n].astype(str), df['msg_2'][i:n].astype(str), test_size=0.33, random_state=42)

In [73]:
df_small = pd.DataFrame()

In [74]:
df_small['msg'] = X_train

In [75]:
df_small['msg_2'] = y_train

In [76]:
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [77]:
# encode training data set
X_train_token = tokenizer.texts_to_matrix(X_train)
# encode training data set
y_train_token = tokenizer.texts_to_matrix(y_train)

In [78]:
num_rows, num_cols = X_train_token.shape

In [79]:
list_similarity = []
for i in range(num_rows):
    d = distance.cosine(X_train_token[i], y_train_token[i])
    if math.isnan(d):
        d = 0.0
    list_similarity.append(d)
    i+=0

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [80]:
df_small['similarity'] = list_similarity

In [81]:
df_small.head(20)

Unnamed: 0_level_0,msg,msg_2,similarity
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5426,When EVIL returns so shall we.,We will be ready Lord.,0.764298
4778,Yeah.,It looks like you bought it off one of the bro...,1.0
4787,Then what're you complainin' about? At least n...,I may call you worse than that.,0.858579
5296,When a guy hurts you then comes back bleeding ...,Hey Come on shrink time's over. They wouldn't ...,0.934205
5199,Maybe we should pay Luther a visit.,Let him get some sleep. He's going to need it.,1.0
5946,Love...,"Yes! But """"love"""" isn't the operative word her...",0.666667
5515,But what happens if instead of this... Ultimat...,White turns to black. Light to Dark. Life to ...,1.0
6265,And you don't know how they open is that what ...,That's what I'm saying.,0.711325
6054,Yes... you know there's a lot of differences b...,You noticed..,0.786799
5370,I'll buy ya the best dinner in San Francisco.....,Now you're talkin'. See ya...,0.891535


In [87]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(2, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 2)                 2328      
_________________________________________________________________
dropout_2 (Dropout)          (None, 2)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 6         
_________________________________________________________________
dropout_3 (Dropout)          (None, 2)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 3         
Total params: 2,337
Trainable params: 2,337
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(X_train_token, df_small['similarity'], epochs=10, batch_size=20, verbose=1)
model.save('chatbot_model.h5', hist)

print("model created")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
model created


In [89]:
p = tokenizer.texts_to_matrix('what is the best movie?')

In [90]:
res = model.predict(np.array([p]))[0]



In [91]:
res

array([[0.5       ],
       [0.5       ],
       [0.5032889 ],
       [0.5       ],
       [0.5       ],
       [0.50109696],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.5       ],
       [0.50109696],
       [0.5       ],
       [0.5       ]], dtype=float32)

In [92]:
ERROR_THRESHOLD = 0.25
results = [[i,r] for i,r in enumerate(res) if r>ERROR_THRESHOLD]

In [93]:
# sort by strength of probability
results.sort(key=lambda x: x[1], reverse=True)

In [None]:
return_list = []
i = 0
for i in range(len(results)):
    #return_list.append({"intent": df_small['msg_2'][i], "probability": str(i)})
    print(results[i])

#### Template model

In [None]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(2, input_dim=4, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))

model.summary()

In [None]:

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(([1,2,3,4],[1,2,3,4],[1,2,3,4]), [1,2,3], epochs=2, batch_size=3, verbose=1)
model.save('chatbot_model.h5', hist)

print("model created")