## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
>- https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [391]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random
import bz2
import itertools
from keras.callbacks import ModelCheckpoint, EarlyStopping

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [392]:
# Widen the notebook container so cells use the full browser width.
# `IPython.display` is the public import location; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Opening movie reviews

In [393]:
# Load the pre-processed movie lines: tab-separated, quote handling disabled
# (quoting=3 is csv.QUOTE_NONE), ISO-8859-2 encoded.
messages = pd.read_csv(
    './chatdata/movie_lines_pre_processed.tsv',
    sep="\t",
    quoting=3,
    encoding='ISO-8859-2',
)

In [394]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed,target
0,L49,u0,m0,Did you change your hair?,No.,you change your hair,1
1,L50,u3,m0,No.,You might think about it,no,0
2,L51,u0,m0,You might wanna think about it,can you explain it better?,you might think about it,0
3,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
4,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0


### Processing for deep learning

In [395]:
#setting the sample data for tests
i = 0
n = 20000

In [396]:
# Split rows i..n of the corpus into train/test sets (33% held out), stratified
# on `target` so both splits keep the same class proportions.
sample_msgs = messages['msg_pre_processed'][i:n]
sample_target = messages['target'][i:n]
X_train, X_test, y_train, y_test = train_test_split(
    sample_msgs.astype(str),
    sample_target.astype(str),
    test_size=0.33,
    stratify=sample_target,
    random_state=42,
)

In [397]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [398]:
df_small['msg_pre_processed'] = X_train

In [399]:
df_small['target'] = y_train

In [400]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
6272,switchboard how may i direct your call,0
159,i cant date her sister until that one get a bo...,0
16688,evening baxter,0
16365,general the severe food shortage that face the...,1
6940,no im all alone,0


In [401]:
df_small.shape

(13400, 2)

In [402]:
tokenizer = Tokenizer()
# Fit the tokenizer on the training messages only (X_train); the same fitted
# tokenizer is used further down to encode both the training data and new messages.
tokenizer.fit_on_texts(X_train)

In [458]:
import pickle

# Persist the fitted tokenizer to disk using the highest available pickle protocol.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [457]:
X_train

6272                switchboard how may i direct your call
159      i cant date her sister until that one get a bo...
16688                                       evening baxter
16365    general the severe food shortage that face the...
6940                                       no im all alone
                               ...                        
2434                     she dropped in on me holding this
5487                                         saying did it
11969                                that wa a great night
11056                               not far enough come on
10839            of course now away before i get any older
Name: msg_pre_processed, Length: 13400, dtype: object

In [404]:
y_train

6272     0
159      0
16688    0
16365    1
6940     0
        ..
2434     0
5487     1
11969    0
11056    0
10839    0
Name: target, Length: 13400, dtype: object

In [405]:
# Encode the training messages as a 2-D matrix with one row per message,
# using the tokenizer fitted above.
X_train_token = tokenizer.texts_to_matrix(X_train)

In [406]:
X_train_token

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [407]:
X_train_token.shape

(13400, 9805)

In [408]:
# Unpack the encoded matrix dimensions (rows, columns); num_cols is used below
# as the input size of the first model layer.
num_rows, num_cols = X_train_token.shape

In [409]:
classes = set(df_small['target'])
classes

{'0', '1'}

In [410]:
# The labels were cast to str for the train/test split; convert them back to
# integers before they are passed to model.fit as the training target.
df_small['target'] = df_small['target'].astype('int')

In [411]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
6272,switchboard how may i direct your call,0
159,i cant date her sister until that one get a bo...,0
16688,evening baxter,0
16365,general the severe food shortage that face the...,1
6940,no im all alone,0


### Search for the best parameters

### Training the model with fixed parameters

In [413]:
# Create model - 3 Dense layers: 20 ReLU units, then 10 ReLU units, then a single
# sigmoid output unit for the binary target. A Dropout layer (rate 0.5) follows
# each of the two hidden layers.
model = Sequential()
model.add(Dense(20, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 20)                196120    
_________________________________________________________________
dropout_22 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 10)                210       
_________________________________________________________________
dropout_23 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 11        
Total params: 196,341
Trainable params: 196,341
Non-trainable params: 0
_________________________________________________________________


In [414]:
%%time
# Compile model. Stochastic gradient descent with classical momentum
# (Nesterov momentum is disabled: nesterov=False) and a binary cross-entropy loss.
# `learning_rate` replaces the deprecated `lr` keyword alias.
# NOTE(review): `decay` is only accepted by the legacy Keras optimizers - confirm
# against the installed Keras version.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])

CPU times: user 15.6 ms, sys: 15.6 ms, total: 31.2 ms
Wall time: 39.2 ms


In [415]:
%%time

# Stop training once validation accuracy has not improved for 10 epochs, and
# write checkpoint files whose names carry the epoch number and val_accuracy.
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=10, verbose=0),
    ModelCheckpoint(filepath='model.{epoch:02d}-{val_accuracy:.2f}.h5'),
]

# Fit on the encoded training messages; 30% of the rows are held out for validation.
hist = model.fit(
    X_train_token,
    df_small['target'],
    epochs=500,
    validation_split=0.3,
    batch_size=20,
    verbose=1,
    callbacks=callbacks,
)

# Save the trained model. The History object is intentionally NOT passed here:
# the second positional parameter of Model.save is `overwrite`, not the history.
model.save('chatbot_model.h5')

print("model created")

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
model created
CPU times: user 1min 33s, sys: 11.5 s, total: 1min 45s
Wall time: 1min 1s


### Testing the prototype

In [416]:
from tensorflow.keras.models import load_model

In [425]:
lemmatizer = WordNetLemmatizer()


def pre_processing_text(corpus):
    """Normalise a raw message into a lower-case, space-separated string of lemmas.

    Steps, in order: strip HTML tags, drop every character that is not an
    ASCII letter, digit or whitespace, collapse runs of spaces, lower-case,
    tokenise, lemmatise each token, discard punctuation tokens and join the
    remaining tokens with single spaces.

    Args:
        corpus: The raw message; it is converted with ``str()`` before processing.

    Returns:
        str: The normalised message.
    """
    text = str(corpus)

    # Regex clean-up passes, applied in order: HTML tags, non-alphanumeric
    # characters, duplicated spaces.
    cleanup_passes = (
        (r'<.*?>', ''),
        (r'[^a-z A-Z 0-9 \s]', ''),
        (r' +', ' '),
    )
    for pattern, replacement in cleanup_passes:
        text = re.sub(pattern, replacement, text)

    # Case folding, then tokenization.
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", text.lower())

    # Lemmatization, skipping any lemma that is punctuation.
    # Stop words are deliberately kept: removing them made the model worse.
    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        if lemma not in string.punctuation:
            kept.append(lemma)

    return ' '.join(kept)

In [426]:
msg_raw = 'I heard you are a good guy. Is it right?'

In [427]:
msg = pre_processing_text(msg_raw)

In [428]:
p = tokenizer.texts_to_matrix([msg])

In [429]:
p.shape

(1, 9805)

In [430]:
res = model.predict(p)



In [432]:
res

array([[0.79494965]], dtype=float32)