## LSTM



In [30]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.layers.embeddings import Embedding
from sklearn.metrics import roc_auc_score,confusion_matrix, accuracy_score, make_scorer, f1_score,precision_score,recall_score, plot_confusion_matrix
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('rslp')
nltk.download('stopwords')
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from argparse import Namespace
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import RandomizedSearchCV
from componetes_preprocessamento import RemoveStopWords, Cleaner, Tokenizador, Stemmer, Joiner, pega_resultados, salvando_em_arquivo


[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Coleta de Dados

In [None]:
args = Namespace(
    train_split = 0.7,
    random_state = 42,
    vocab_size = 10000,
    embedding_dim = 16,
    max_length = 120,
    batch_size=128,
    num_epochs=5,
    early_stopping_criteria=2,
    dropout_p=0.1,
    model_storage="model_storage/lstm",
)

In [None]:
dataset = pd.read_csv("reviews.csv")
X = dataset["review_comment_message"].copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [None]:
for i in range(0,len(y)):
    if y[i] == -1:
        y[i] = 2
print(y)

[1 1 0 ... 0 1 2]


In [None]:
y_dummy = np_utils.to_categorical(y)
print(y_dummy)
print(y)

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[1 1 0 ... 0 1 2]


## Pré-Processamento

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

In [None]:
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words = args.vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

def preprocess(training_sentences, testing_sentences, max_length, vocab_size, trunc_type='post', oov_tok = "<OOV>"):
    """
    Args
        training_sentences
        training_labels
        testing_sentences
        testing_labels
    Return
        training_sentences
        training_labels
        testing_sentences
        testing_labels 
    """

    stopword = stopwords.words("portuguese")
    stem = RSLPStemmer()
    vectorizer = CountVectorizer()

    def clear(review):
        review = review.lower()
        # remove pula de linha 
        review = re.sub('\n', ' ', review)        
        review = re.sub('\r', ' ', review)

        # remove numero 
        review = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' #numero ', review)

        # remove caracters especiais 
        review = re.sub(r'R\$', ' ', review)
        review = re.sub(r'\W', ' ', review)
        review = re.sub(r'\s+', ' ', review)

        # remove links 
        urls = re.findall('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', review)
        if len(urls) > 0:
            for url in urls:
                for link in url:
                    review = review.replace(link, '')
            review = review.replace(':', '')
            review = review.replace('/', '')
        return review

    training_sentences = training_sentences.apply(lambda review: clear(review))
    testing_sentences = testing_sentences.apply(lambda review: clear(review))
    training_sentences = training_sentences.apply(lambda words_review: [word for word in words_review if word not in stopword])
    testing_sentences = testing_sentences.apply(lambda words_review: [word for word in words_review if word not in stopword])
    training_sentences = training_sentences.apply(lambda words_review: [stem.stem(word) for word in words_review ])
    testing_sentences = testing_sentences.apply(lambda words_review: [stem.stem(word) for word in words_review ])
    training_sentences = training_sentences.apply(lambda words_review: " ".join(words_review))
    testing_sentences = testing_sentences.apply(lambda words_review: " ".join(words_review))
    training_sentences = tokenizer.texts_to_sequences(training_sentences)
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

    training_padded = pad_sequences(training_sentences,maxlen=max_length, truncating=trunc_type)
    testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

    return training_padded, testing_padded

In [None]:
X_train_new, X_test_new = preprocess(X_train, X_test, args.max_length, args.vocab_size)

## Parâmetros Padrão

In [None]:
model = Sequential()
model.add(Embedding(args.vocab_size, args.embedding_dim, input_length=args.max_length))
model.add(Dropout(0.2))
model.add(LSTM(16,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(16))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 dropout (Dropout)           (None, 120, 16)           0         
                                                                 
 lstm (LSTM)                 (None, 120, 16)           2112      
                                                                 
 dropout_1 (Dropout)         (None, 120, 16)           0         
                                                                 
 lstm_1 (LSTM)               (None, 16)                2112      
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 3)                 5

In [None]:
result = model.fit(X_train_new, y_train, batch_size=128, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.evaluate(X_test_new,y_test)



[0.7904999852180481, 0.6591372489929199]

In [None]:
print(predicted_y)

[[0.13933723 0.04754004 0.8131227 ]
 [0.20462166 0.10067131 0.69470704]
 [0.3100253  0.34379622 0.34617847]
 ...
 [0.19093415 0.06603835 0.7430275 ]
 [0.18805502 0.7970381  0.01490686]
 [0.22491296 0.71880877 0.05627837]]


In [None]:
predicted_y = model.predict(X_test_new)
predicted_y_transform = [np.argmax(t) for t in predicted_y]
print(classification_report(y_test, predicted_y_transform))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2918
           1       0.68      0.90      0.78      6137
           2       0.62      0.79      0.69      3208

    accuracy                           0.66     12263
   macro avg       0.43      0.56      0.49     12263
weighted avg       0.50      0.66      0.57     12263



In [38]:
resultados = []

## Gridsearch Accuracy

In [None]:
def createLSTM(activation, neurons):
    model = Sequential()
    model.add(Embedding(args.vocab_size, args.embedding_dim, input_length=args.max_length))
    model.add(Dropout(0.2))
    model.add(LSTM(units=neurons,activation=activation))
    model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
parameters = dict()
parameters["activation"] = ["linear","tanh"]
parameters["neurons"] = [8,16]
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=1)

  


In [None]:
search = GridSearchCV(estimator=model_lstm, param_grid=parameters, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)

In [None]:
result = search.fit(X_train_new, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [28]:
best_model = result.best_estimator_
print(best_model)
print(result.best_params_)
predicted_y_gsac = best_model.predict(X_test_new)

<keras.wrappers.scikit_learn.KerasClassifier object at 0x7fd77e6e5cd0>
{'activation': 'tanh', 'neurons': 16}


In [33]:
cvres = result.cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.6304917379706508 {'activation': 'tanh', 'neurons': 16}
0.629792693909344 {'activation': 'tanh', 'neurons': 8}
0.611931732831537 {'activation': 'linear', 'neurons': 16}
0.6042071976416744 {'activation': 'linear', 'neurons': 8}


In [39]:
predicted_y_gsac = best_model.predict(X_test_new)
print(classification_report(y_test, predicted_y_gsac))
resultado = pega_resultados("lstm", "Grid Search", y_test, predicted_y_gsac, "Acuracia", result.best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2918
           1       0.66      0.88      0.75      6137
           2       0.57      0.72      0.64      3208

    accuracy                           0.63     12263
   macro avg       0.41      0.53      0.46     12263
weighted avg       0.48      0.63      0.54     12263

['lstm', 'Grid Search', 0.6296175487238033, 0.46378319315103894, 0.40967988224660395, 0.5345204218284878, 'Acuracia', {'activation': 'tanh', 'neurons': 16}]


## Gridsearch F1-Score

In [46]:
search_gsf1 = GridSearchCV(estimator=model_lstm, param_grid=parameters, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)

In [47]:
result_gsf1 = search_gsf1.fit(X_train_new, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
cvres = result_gsf1.cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.4636691352625547 {'activation': 'tanh', 'neurons': 16}
0.46025005181015954 {'activation': 'tanh', 'neurons': 8}
0.4352616421072371 {'activation': 'linear', 'neurons': 8}
0.4202837796353524 {'activation': 'linear', 'neurons': 16}


In [49]:
best_model_gsf1 = result_gsf1.best_estimator_
predicted_y_gsf1 = best_model_gsf1.predict(X_test_new)
print(classification_report(y_test, predicted_y_gsf1))
resultado = pega_resultados("lstm", "Grid Search", y_test, predicted_y_gsac, "F1-Score", result_gsf1.best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2918
           1       0.66      0.88      0.75      6137
           2       0.57      0.73      0.64      3208

    accuracy                           0.63     12263
   macro avg       0.41      0.54      0.46     12263
weighted avg       0.48      0.63      0.54     12263

['lstm', 'Grid Search', 0.6296175487238033, 0.46378319315103894, 0.40967988224660395, 0.5345204218284878, 'F1-Score', {'activation': 'tanh', 'neurons': 16}]


## Randomized Search Accuracy

In [None]:
del X_train
del X_test
del dataset

In [None]:
parameters_rs = dict()
parameters_rs["activation"] = ["linear","tanh"]
parameters_rs["neurons"] = [8,16]
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=-1)

  


In [None]:
random_search = RandomizedSearchCV(estimator=model_lstm, param_distributions=parameters_rs, scoring="accuracy", cv=kfold, verbose=-1, refit=True,n_jobs=-1)

In [None]:
result_random = random_search.fit(X_train_new, y_train)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


In [34]:
cvres = result_random.cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.6304917623960973 {'neurons': 16, 'activation': 'tanh'}
0.628289771771072 {'neurons': 8, 'activation': 'tanh'}
0.6146235513573098 {'neurons': 16, 'activation': 'linear'}
0.6111278425418487 {'neurons': 8, 'activation': 'linear'}


In [40]:
best_model_random = result_random.best_estimator_
predicted_y_rsac = best_model_random.predict(X_test_new)
print(classification_report(y_test, predicted_y_rsac))
resultado = pega_resultados("lstm", "Random Search", y_test, predicted_y_gsac, "Acuracia", result_random.best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

           0       0.25      0.00      0.00      2918
           1       0.65      0.90      0.75      6137
           2       0.59      0.68      0.63      3208

    accuracy                           0.63     12263
   macro avg       0.50      0.53      0.46     12263
weighted avg       0.54      0.63      0.54     12263

['lstm', 'Random Search', 0.6296175487238033, 0.46378319315103894, 0.40967988224660395, 0.5345204218284878, 'Acuracia', {'neurons': 16, 'activation': 'tanh'}]


## Randomized Search F1-Score

In [41]:
random_search_f1 = RandomizedSearchCV(estimator=model_lstm, param_distributions=parameters_rs, scoring="f1_macro", cv=kfold, verbose=-1, refit=True,n_jobs=-1)

In [42]:
result_rsf1 = random_search_f1.fit(X_train_new, y_train)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


In [44]:
cvres = result_rsf1.cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.463889197679724 {'neurons': 16, 'activation': 'tanh'}
0.46219077136016545 {'neurons': 8, 'activation': 'tanh'}
0.4207311603986293 {'neurons': 8, 'activation': 'linear'}
0.3993150638573066 {'neurons': 16, 'activation': 'linear'}


In [None]:
best_model_random = result_rsf1.best_estimator_
print(result_rsf1.best_params_)
predicted_y_rs = best_model_random.predict(X_test_new)
print(classification_report(y_test, predicted_y_rs))

In [45]:
best_model_rsf1 = result_rsf1.best_estimator_
predicted_y_rsf1 = best_model_rsf1 .predict(X_test_new)
print(classification_report(y_test, predicted_y_rsf1))
resultado = pega_resultados("lstm", "Random Search", y_test, predicted_y_gsac, "F1-Score",result_rsf1.best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

           0       0.67      0.00      0.00      2918
           1       0.64      0.92      0.75      6137
           2       0.60      0.64      0.62      3208

    accuracy                           0.63     12263
   macro avg       0.64      0.52      0.46     12263
weighted avg       0.63      0.63      0.54     12263

['lstm', 'Random Search', 0.6296175487238033, 0.46378319315103894, 0.40967988224660395, 0.5345204218284878, 'F1-Score', {'neurons': 16, 'activation': 'tanh'}]


In [51]:
salvando_em_arquivo("LSTM_resultados.csv", resultados)