In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.layers.embeddings import Embedding
from sklearn.metrics import roc_auc_score,confusion_matrix, accuracy_score, make_scorer, f1_score,precision_score,recall_score, plot_confusion_matrix
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('rslp')
nltk.download('stopwords')
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from argparse import Namespace
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import RandomizedSearchCV


[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
args = Namespace(
    train_split = 0.7,
    random_state = 42,
    vocab_size = 10000,
    embedding_dim = 16,
    max_length = 120,
    batch_size=128,
    num_epochs=5,
    early_stopping_criteria=2,
    dropout_p=0.1,
    model_storage="model_storage/lstm",
)

In [3]:
dataset = pd.read_csv("datasets/reviews.csv")
X = dataset["review_comment_message"].copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [4]:
for i in range(0,len(y)):
    if y[i] == -1:
        y[i] = 2
print(y)

[1 1 0 ... 0 1 2]


In [5]:
y_dummy = np_utils.to_categorical(y)
print(y_dummy)
print(y)

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[1 1 0 ... 0 1 2]


## Pré-Processamento

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

In [7]:
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words = args.vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

def preprocess(training_sentences, testing_sentences, max_length, vocab_size, trunc_type='post', oov_tok = "<OOV>"):
    """
    Args
        training_sentences
        training_labels
        testing_sentences
        testing_labels
    Return
        training_sentences
        training_labels
        testing_sentences
        testing_labels 
    """

    stopword = stopwords.words("portuguese")
    stem = RSLPStemmer()
    vectorizer = CountVectorizer()

    def clear(review):
        review = review.lower()
        # remove pula de linha 
        review = re.sub('\n', ' ', review)        
        review = re.sub('\r', ' ', review)

        # remove numero 
        review = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' #numero ', review)

        # remove caracters especiais 
        review = re.sub(r'R\$', ' ', review)
        review = re.sub(r'\W', ' ', review)
        review = re.sub(r'\s+', ' ', review)

        # remove links 
        urls = re.findall('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', review)
        if len(urls) > 0:
            for url in urls:
                for link in url:
                    review = review.replace(link, '')
            review = review.replace(':', '')
            review = review.replace('/', '')
        return review

    training_sentences = training_sentences.apply(lambda review: clear(review))
    testing_sentences = testing_sentences.apply(lambda review: clear(review))
    training_sentences = training_sentences.apply(lambda words_review: [word for word in words_review if word not in stopword])
    testing_sentences = testing_sentences.apply(lambda words_review: [word for word in words_review if word not in stopword])
    # training_sentences = training_sentences.apply(lambda review: word_tokenize(review))
    # testing_sentences = testing_sentences.apply(lambda review: word_tokenize(review))
    training_sentences = training_sentences.apply(lambda words_review: [stem.stem(word) for word in words_review ])
    testing_sentences = testing_sentences.apply(lambda words_review: [stem.stem(word) for word in words_review ])
    training_sentences = training_sentences.apply(lambda words_review: " ".join(words_review))
    testing_sentences = testing_sentences.apply(lambda words_review: " ".join(words_review))
    training_sentences = tokenizer.texts_to_sequences(training_sentences)
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

    training_padded = pad_sequences(training_sentences,maxlen=max_length, truncating=trunc_type)
    testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

    # training_padded = vectorizer.fit_transform(training_sentences)
    # testing_padded = vectorizer.fit_transform(testing_sequences)

    return training_padded, testing_padded

In [8]:
X_train_new, X_test_new = preprocess(X_train, X_test, args.max_length, args.vocab_size)

In [9]:
# def create_model():
#     input = tf.keras.Input(shape=(args.max_length))
#     x = tf.keras.layers.Embedding(args.vocab_size, args.embedding_dim, input_length=args.max_length)(input)

#     x = tf.keras.layers.LSTM(16, return_sequences=True)(x)
#     x = tf.keras.layers.LSTM(16, return_sequences=True)(x)
#     x = tf.keras.layers.LSTM(16)(x)

#     x = tf.keras.layers.Dropout(0.2)(x)

#     x = tf.keras.layers.Dense(32, activation='relu')(x)

#     output = tf.keras.layers.Dense(3, activation='softmax')(x)

#     return tf.keras.Model(input, output)

## Parâmetros Padrão

In [10]:
model = Sequential()
model.add(Embedding(args.vocab_size, args.embedding_dim, input_length=args.max_length))
model.add(Dropout(0.2))
model.add(LSTM(16,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(16))
model.add(Dropout(0.2))
# model.add(Dense(32,activation="linear"))
# model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 dropout (Dropout)           (None, 120, 16)           0         
                                                                 
 lstm (LSTM)                 (None, 120, 16)           2112      
                                                                 
 dropout_1 (Dropout)         (None, 120, 16)           0         
                                                                 
 lstm_1 (LSTM)               (None, 16)                2112      
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 3)                 5

In [10]:
result = model.fit(X_train_new, y_train, batch_size=128, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
model.evaluate(X_test_new,y_test)



[0.7904999852180481, 0.6591372489929199]

In [26]:
print(predicted_y)

[[0.13933723 0.04754004 0.8131227 ]
 [0.20462166 0.10067131 0.69470704]
 [0.3100253  0.34379622 0.34617847]
 ...
 [0.19093415 0.06603835 0.7430275 ]
 [0.18805502 0.7970381  0.01490686]
 [0.22491296 0.71880877 0.05627837]]


In [12]:
predicted_y = model.predict(X_test_new)
predicted_y_transform = [np.argmax(t) for t in predicted_y]
# y_test_transform = [np.argmax(t) for t in y_test]
print(classification_report(y_test, predicted_y_transform))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2918
           1       0.68      0.90      0.78      6137
           2       0.62      0.79      0.69      3208

    accuracy                           0.66     12263
   macro avg       0.43      0.56      0.49     12263
weighted avg       0.50      0.66      0.57     12263



In [64]:
# earlyStoppingCallback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=args.early_stopping_criteria)

# model = create_model()
# model.compile(
#   loss = tf.keras.losses.CategoricalCrossentropy(),
#   optimizer= tf.keras.optimizers.Adam(
#     learning_rate=0.0001),
#   metrics=['accuracy']
# )

# history = model.fit(
#     X_train_new,
#     np.array(y_train), 
#     epochs=15,
#     batch_size=args.batch_size,
#     callbacks= [earlyStoppingCallback],
# )

In [65]:
# model.evaluate(X_test_new, np.array(y_test))

## Gridsearch Accuracy

In [10]:
def createLSTM(activation, neurons):
    model = Sequential()
    model.add(Embedding(args.vocab_size, args.embedding_dim, input_length=args.max_length))
    model.add(Dropout(0.2))
    model.add(LSTM(units=neurons,activation=activation))
    model.add(Dropout(0.2))
    # model.add(LSTM(units=neurons,activation=activation))
    # model.add(Dropout(0.2))
    # model.add(Dense(units=neurons,activation=activation))
    # model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.fit(X_train_new, y_train, batch_size=128, epochs=5)

    return model

In [11]:
parameters = dict()
# parameters["epochs"] = [5,10]
parameters["activation"] = ["linear","relu"]
parameters["neurons"] = [16,32]
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=1)

  model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=1)


In [13]:
search = GridSearchCV(estimator=model_lstm, param_grid=parameters, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-4)

In [14]:
result = search.fit(X_train_new, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
best_model = result.best_estimator_
predicted_y_gs = best_model.predict(X_test_new)

In [16]:
print(predicted_y_gs)
print(y_test)


[2 2 2 ... 2 1 2]
[0 2 2 ... 0 1 1]


In [17]:
# predicted_y_gs_transform = [np.argmax(t) for t in predicted_y_gs]
# y_test_transform = [np.argmax(t) for t in y_test]
print(classification_report(y_test, predicted_y_gs))

              precision    recall  f1-score   support

           0       0.58      0.00      0.00      2918
           1       0.68      0.84      0.75      6137
           2       0.53      0.78      0.64      3208

    accuracy                           0.62     12263
   macro avg       0.60      0.54      0.46     12263
weighted avg       0.62      0.62      0.54     12263



## Gridsearch F1-Score

In [12]:
search_gsf1 = GridSearchCV(estimator=model_lstm, param_grid=parameters, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-4)

In [13]:
result_gsf1 = search_gsf1.fit(X_train_new, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
best_model_gsf1 = result_gsf1.best_estimator_
predicted_y_gsf1 = best_model_gsf1.predict(X_test_new)

In [15]:
print(classification_report(y_test, predicted_y_gsf1))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2918
           1       0.63      0.90      0.74      6137
           2       0.57      0.64      0.60      3208

    accuracy                           0.62     12263
   macro avg       0.40      0.51      0.45     12263
weighted avg       0.47      0.62      0.53     12263



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Randomized Search Accuracy

In [11]:
del X_train
del X_test
del dataset

In [12]:
parameters_rs = dict()
# parameters_rs["epochs"] = [5,10]
parameters_rs["activation"] = ["linear","relu"]
parameters_rs["neurons"] = [8,16]
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=1)

  model_lstm = KerasClassifier(build_fn=createLSTM, epochs=5, batch_size=128, verbose=1)


In [13]:
random_search = RandomizedSearchCV(estimator=model_lstm, param_distributions=parameters_rs, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-4)

In [14]:
result_random = random_search.fit(X_train_new, y_train)



Fitting 10 folds for each of 4 candidates, totalling 40 fits


In [None]:
best_model_random = result_random.best_estimator_
predicted_y_rs = best_model_random.predict(X_test_new)
print(classification_report(y_test, predicted_y_rs))