## Treinamento da rede neural Keras para classificação de perguntas (1) e respostas (0)

In [None]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import gensim
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
import math
import random
import bz2
import itertools
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Widen the notebook container so cells use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Opening the pre-processed movie lines

In [None]:
messages = pd.read_csv('./chatdata/movie_lines_pre_processed_keras.tsv', delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [None]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg', 'msg_2', 'msg_pre_processed', 'target']

In [None]:
messages.head()

### Processing for deep learning

In [None]:
#setting the sample data for tests
i = 0
n = 20000

In [None]:
X_train, X_test, y_train, y_test = train_test_split(messages['msg_pre_processed'][i:n].astype(str), messages['target'][i:n].astype(str), test_size=0.33, stratify=messages['target'][i:n], random_state=42)

In [None]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [None]:
df_small['msg_pre_processed'] = X_train

In [None]:
df_small['target'] = y_train

In [None]:
df_small.head()

In [None]:
df_small.shape

In [None]:
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)

In [None]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
X_train

In [None]:
y_train

In [None]:
# encode training data set
X_train_token = tokenizer.texts_to_matrix(X_train)

In [None]:
X_train_token

In [None]:
X_train_token.shape

In [None]:
#set the number of rows of X_train
num_rows, num_cols = X_train_token.shape

In [None]:
classes = set(df_small['target'])
classes

In [None]:
df_small['target'] = df_small['target'].astype('int')

In [None]:
df_small.head()

### Search for the best parameters

### Training the model with fixed parameters

In [None]:
# Feed-forward binary classifier: two hidden ReLU layers (20 and 10 units), each
# followed by 50% dropout, and a single sigmoid output unit
model = Sequential([
    Dense(20, input_dim=num_cols, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
%%time
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='BinaryCrossentropy', optimizer=sgd, metrics=['accuracy'])

In [None]:
%%time

callbacks = [EarlyStopping(monitor='val_accuracy', patience=10, verbose=0),
                ModelCheckpoint(filepath='model.{epoch:02d}-{val_accuracy:.2f}.h5'),
            ]

#fitting and saving the model
hist = model.fit(X_train_token, df_small['target'], epochs=500, validation_split=0.3, batch_size=20, verbose=1, callbacks=callbacks)
model.save('chatbot_model.h5', hist)

print("model created")

### Testing the prototype

In [None]:
from tensorflow.keras.models import load_model

In [None]:
lemmatizer = WordNetLemmatizer()


def pre_processing_text(corpus):
    """Normalize a raw message into a space-separated string of lemmatized tokens.

    Args:
        corpus: The message to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased text with tokens joined by single spaces.
    """
    # Strip HTML tags
    text = re.sub(r'<.*?>', '', str(corpus))

    # Keep only ASCII letters, digits and whitespace
    text = re.sub(r'[^a-z A-Z 0-9 \s]', '', text)

    # Collapse runs of spaces, then lower-case
    text = re.sub(r' +', ' ', text).lower()

    # Tokenize: words (with an optional apostrophe suffix) or stand-alone symbols
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", text)

    # Lemmatize each token, then discard tokens found in string.punctuation
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    kept = [lemma for lemma in lemmas if lemma not in string.punctuation]

    # Stop-word removal is left disabled because it made the model worse
    #stopwords_ = stopwords.words("english")
    #corpus = [t for t in corpus if t not in stopwords_]

    return ' '.join(kept)

In [None]:
# Sample input: a statement followed by a question
msg_raw = 'I heard you are a good guy. Is it right?'

In [None]:
# Clean the raw message with pre_processing_text
msg = pre_processing_text(msg_raw)

In [None]:
# Encode the cleaned message with the fitted tokenizer (a list holding one text)
p = tokenizer.texts_to_matrix([msg])

In [None]:
p.shape

In [None]:
# Model output for the encoded sample message
res = model.predict(p)

In [None]:
res

## Testing the model

In [None]:
y_pred = model.predict(X_test)

In [None]:
model_metrics(y_test, y_pred)