## Treinamento da rede neural Keras para classificação de perguntas (1) e respostas (0)

In [1]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random
import bz2
import itertools
from keras.callbacks import ModelCheckpoint, EarlyStopping
import os
from os import listdir
from os.path import isfile, join
from sklearn.metrics import f1_score
from tensorflow.keras.models import load_model

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Widen the notebook container so cells use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Opening the pre-processed movie lines

In [38]:
# Load the pre-processed movie lines: a tab-separated file in ISO-8859-2.
# quoting=3 disables quote handling, so quote characters are read as plain text.
messages = pd.read_csv(
    './chatdata/movie_lines_pre_processed_keras.tsv',
    delimiter="\t",
    quoting=3,
    encoding='ISO-8859-2',
)

In [39]:
# Give the seven loaded columns explicit names; 'msg_pre_processed' (model input)
# and 'target' (label) are the two columns used by the cells below.
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg', 'msg_pre_processed', 'msg_2', 'target']

In [40]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_pre_processed,msg_2,target
0,L50,u3,m0,No.,no,You might wanna think about it,0
1,L51,u0,m0,You might wanna think about it,you might wanna think about it,I need to think more about it,0
2,L59,u9,m0,I missed you.,i missed you,It says here you exposed yourself to a group o...,0
3,L60,u8,m0,It says here you exposed yourself to a group o...,it say here you exposed yourself to a group of...,It was a bratwurst. I was eating lunch.,0
4,L61,u9,m0,It was a bratwurst. I was eating lunch.,it wa a bratwurst i wa eating lunch,With the teeth of your zipper?,0


### Processing for deep learning

In [41]:
# Bounds of the sample used for the experiments: the split below slices
# `messages` with [i:n], i.e. rows i (inclusive) up to n (exclusive).
i = 0
n = 20000

In [42]:
# Split the sample into train/test sets (33% held out), stratified on the label
# so both sets keep the same class proportions. Texts and labels are cast to str.
sample_text = messages['msg_pre_processed'][i:n].astype(str)
sample_target = messages['target'][i:n]
X_train, X_test, y_train, y_test = train_test_split(
    sample_text,
    sample_target.astype(str),
    test_size=0.33,
    stratify=sample_target,
    random_state=42,
)

In [43]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [44]:
df_small['msg_pre_processed'] = X_train

In [45]:
df_small['target'] = y_train

In [46]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
17249,im ready,0
10205,im a human being ive got some,0
16875,ill say a little a possible,0
16512,hey what is this you got black soap,1
965,he doesnt speak english,0


In [47]:
df_small.shape

(13400, 2)

In [48]:
# Build the vocabulary from the training texts only (X_test is not used here);
# the same fitted tokenizer is reused later to encode the test set and new messages.
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [49]:
import pickle

# Persist the fitted tokenizer to 'tokenizer.pickle' so the exact same
# vocabulary can be reloaded later without refitting.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
X_train

17249                                             im ready
10205                        im a human being ive got some
16875                          ill say a little a possible
16512                  hey what is this you got black soap
965                                he doesnt speak english
                               ...                        
6797                                          two three go
11892      of course now right away before i get any older
17980    not like him is it to do a thing like that wit...
17017                                          didnt i say
7508     hello im paul carey from the airline im here t...
Name: msg_pre_processed, Length: 13400, dtype: object

In [51]:
y_train

17249    0
10205    0
16875    0
16512    1
965      0
        ..
6797     0
11892    0
17980    1
17017    1
7508     0
Name: target, Length: 13400, dtype: object

In [52]:
# encode training data set: one row per message, one column per tokenizer word index
# (presumably the default 'binary' mode, i.e. 1.0 where a word occurs — confirm in the Keras docs)
X_train_token = tokenizer.texts_to_matrix(X_train)

In [53]:
X_train_token

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [54]:
X_train_token.shape

(13400, 9711)

In [55]:
# unpack both dimensions of the encoded training matrix;
# num_cols (the feature-vector width) is used below as the model's input_dim
num_rows, num_cols = X_train_token.shape

In [56]:
classes = set(df_small['target'])
classes

{'0', '1'}

In [57]:
df_small['target'] = df_small['target'].astype('int')

In [58]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
17249,im ready,0
10205,im a human being ive got some,0
16875,ill say a little a possible,0
16512,hey what is this you got black soap,1
965,he doesnt speak english,0


### Search for the best parameters

### Training the model with fixed parameters

In [59]:
# Create model - 3 Dense layers: 20 ReLU units, then 10 ReLU units (each followed by
# a 0.5 dropout), and a single sigmoid output unit for the binary target
# (question = 1, answer = 0)
model = Sequential()
model.add(Dense(20, input_dim=num_cols, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 20)                194240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                210       
_________________________________________________________________
dropout_3 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 194,461
Trainable params: 194,461
Non-trainable params: 0
_________________________________________________________________


In [60]:
%%time
# Compile model. Plain stochastic gradient descent with momentum (Nesterov
# acceleration is disabled) and binary cross-entropy loss for the sigmoid output.
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 13.7 ms


In [175]:
%%time

# Stop training once val_accuracy has gone 10 epochs without improving, and
# write a checkpoint file named after each epoch's val_accuracy and epoch number.
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=10, verbose=0),
    ModelCheckpoint(filepath='model.{val_accuracy:.2f}-{epoch:02d}.h5'),
]

# Fit on the encoded training texts, holding out 30% of them for validation.
# The checkpoint callback is what saves the model files.
hist = model.fit(
    X_train_token,
    df_small['target'],
    epochs=500,
    validation_split=0.3,
    batch_size=20,
    verbose=1,
    callbacks=callbacks,
)

print("model created")

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
model created
CPU times: user 1min 1s, sys: 7.88 s, total: 1min 9s
Wall time: 34.2 s


### Testing the prototype

In [180]:
# Collect the checkpoint files written during training from the working directory.
# Require the 'model.' prefix and the '.h5' suffix: a plain substring test for
# 'model.' would also match unrelated files such as 'chatbot_model.h5'.
mypath = os.getcwd()
onlyfiles = [
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and f.startswith('model.') and f.endswith('.h5')
]

In [181]:
def _checkpoint_rank(filename):
    """Return (val_accuracy, epoch) parsed from a 'model.<acc>-<epoch>.h5' name.

    Names that do not follow that pattern get (-1.0, -1), so they rank below
    every real checkpoint.
    """
    match = re.fullmatch(r'model\.(\d+\.\d+)-(\d+)\.h5', filename)
    if match is None:
        return (-1.0, -1)
    return (float(match.group(1)), int(match.group(2)))

# Best checkpoint first: highest val_accuracy, ties broken by the latest epoch.
# Sorting on the parsed numbers instead of the raw strings keeps the order right
# once the epoch number reaches three digits ('100' sorts before '99' as text).
onlyfiles.sort(key=_checkpoint_rank, reverse=True)

In [182]:
onlyfiles

['model.0.81-14.h5',
 'model.0.81-13.h5',
 'model.0.81-12.h5',
 'model.0.81-10.h5',
 'model.0.81-09.h5',
 'model.0.81-08.h5',
 'model.0.81-07.h5',
 'model.0.81-06.h5',
 'model.0.81-05.h5',
 'model.0.81-04.h5',
 'model.0.81-02.h5',
 'model.0.81-01.h5',
 'model.0.80-16.h5',
 'model.0.80-15.h5',
 'model.0.80-11.h5',
 'model.0.80-03.h5']

In [184]:
# Take the first entry of the sorted checkpoint list, show its name and load it.
# Note: this rebinds `model`, replacing the in-memory model trained above.
best_checkpoint = onlyfiles[0]
print(best_checkpoint)
model = load_model(best_checkpoint)

model.0.81-14.h5


In [186]:
lemmatizer = WordNetLemmatizer()
def pre_processing_text(corpus):
    """Normalize one raw message into a lowercase, space-separated string of lemmas.

    Steps, in order: strip HTML-like tags, drop every character that is not an
    ASCII letter, a digit or whitespace, collapse runs of spaces, lowercase,
    tokenize, lemmatize each token with the module-level ``lemmatizer``,
    discard punctuation tokens, and join what is left with single spaces.

    Stopwords are deliberately kept: the original author noted that removing
    them made the model worse.

    Args:
        corpus: the raw text; it is converted with ``str()`` before processing.

    Returns:
        str: the cleaned text.
    """
    text = str(corpus)

    # (pattern, replacement) pairs, applied one after the other
    substitutions = (
        (r'<.*?>', ''),               # html tags
        (r'[^a-z A-Z 0-9 \s]', ''),   # non-alphanumeric characters
        (r' +', ' '),                 # duplicated spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.lower()

    # tokenization, lemmatization, then punctuation filtering
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", text)
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    kept = [lemma for lemma in lemmas if lemma not in string.punctuation]

    return ' '.join(kept)

In [187]:
msg_raw = 'I heard you are a good guy. Is it right?'

In [188]:
msg = pre_processing_text(msg_raw)

In [189]:
p = tokenizer.texts_to_matrix([msg])

In [190]:
p.shape

(1, 9711)

In [191]:
res = model.predict(p)

In [192]:
res

array([[0.99999243]], dtype=float32)

## Testing the model
- The model is overfitted

In [203]:
# encode the held-out test set with the tokenizer that was fitted on the training texts
X_test_token = tokenizer.texts_to_matrix(X_test)

In [204]:
y_pred = model.predict(X_test_token)

In [205]:
y_test_int = [int(y) for y in y_test]

In [206]:
f1_score(y_test_int, y_pred.round())

0.6591084771965988