# Parte 2 - Etapa de Pre-procesado de texto

In [1]:
import pandas as pd
import numpy as np
import re
import os
import joblib

from bs4 import BeautifulSoup 
import nltk
nltk.download("stopwords")  
from nltk.corpus import stopwords

from nltk.stem.porter import *
stemmer = PorterStemmer()

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from keras.models import Sequential
from keras.preprocessing import sequence
import sklearn.preprocessing as pr
from keras.layers import Embedding, LSTM, Dense, Dropout, GRUV2, SimpleRNN
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve

[nltk_data] Downloading package stopwords to /home/jose/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocesado(path, size):
    
    '''carga de datos'''
    df = pd.read_json(path, lines=True, 
                        compression='gzip')[:size][['reviewText', 'overall']]
    
    df.overall = [1 if int(row) > 2 else 0 for row in df.overall] 
    
    '''Balanceo de etiquetas'''
    label_1, label_0 = df['overall'].value_counts()

    df = pd.concat([df[df.overall == 1].sample(label_0 * 1),
                    df[df.overall == 0]],
                   axis=0)    
    
    X_train, X_test, y_train, y_test = train_test_split(
        df.reviewText,
        df.overall,   
        test_size=0.3,
        random_state=42,
        shuffle=True
    )  
    
    len_vocabulary = set()
    
    def review_to_words(review):
        """Convert a raw review string into a sequence of words."""
        text = BeautifulSoup(review, "html5lib").get_text()
        text = re.sub(r"[^a-zA-Z0-9]", " ", review.lower())
        words = text.split()
        words = [w for w in words if len(w) > 3 and w not in stopwords.words("english")]
        words = [PorterStemmer().stem(w) for w in words] 
        for w in words:
            len_vocabulary.add(w)
        return words
    
    words_train = list(map(review_to_words, list(X_train)))
    words_test = list(map(review_to_words, X_test)) 
    
    vectorizer = CountVectorizer(max_features = len(len_vocabulary),
             preprocessor=lambda x: x, tokenizer=lambda x: x)  # already preprocessed   
    
    '''features para GradientBoostingClassifier'''
    features_train_gradient = vectorizer.fit_transform(words_train).toarray()
    features_test_gradient = vectorizer.transform(words_test).toarray()
    
    #normalizamos
    features_train_gradient = pr.normalize(features_train_gradient, axis=1)
    features_test_gradient = pr.normalize(features_test_gradient, axis=1)    
    
    '''Vocabulario'''
    vocabulary = vectorizer.vocabulary_
    
    '''features para redes'''    
    def features(words):
        features = []
        for sentence in words:            
            words = []
            for word in sentence:
                try:
                    words.append(vocabulary[word])
                except:
                    pass
            features.append(words)
        return features
    
    features_train = features(words_train)
    features_test = features(words_test)      
    
    maxlen = max(len(max(features_train)),len(max(features_test)))
    features_train = sequence.pad_sequences(features_train, maxlen = maxlen)    
    features_test = sequence.pad_sequences(features_test, maxlen = maxlen)
    
    return X_train,\
           X_test,\
           features_train,\
           features_test,\
           np.array(y_train),\
           np.array(y_test),\
           features_train_gradient,\
           features_test_gradient,\
           list(y_train),\
           list(y_test),\
           vocabulary,\
           maxlen

In [3]:
#!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Amazon_Instant_Video_5.json.gz
X_train,\
X_test,\
features_train,\
features_test,\
labels_train,\
labels_test,\
features_train_gradient,\
features_test_gradient,\
labels_train_gradient,\
labels_test_gradient,\
vocabulary,\
maxlen = preprocesado('reviews_Amazon_Instant_Video_5.json.gz',37126)     

# Parte 3 -  Etapa de entrenamiento y testeo de un modelo de análisis de sentimiento

## Machine Learning

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

n_estimators = 150

def classify_gboost(X_train, X_test, y_train, y_test):        
    clf = GradientBoostingClassifier(n_estimators = 150,
                                     learning_rate = 0.5,
                                     max_depth = 1, 
                                     random_state = 42)

    clf.fit(X_train, y_train)
    
    print("[{}] Accuracy: train = {}, test = {}".format(
            clf.__class__.__name__,
            clf.score(X_train, y_train),
            clf.score(X_test, y_test)))
    
    return clf


clf2 = classify_gboost(features_train_gradient,
                       features_test_gradient,
                       labels_train_gradient,
                       labels_test_gradient)

[GradientBoostingClassifier] Accuracy: train = 0.8463521015067407, test = 0.7816836262719704


## Deep Learning

In [5]:
def crear_model(mod, emb_size, vocabulary_size, max_words):
    embedding_size = emb_size
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_size, input_length = max_words))
    model.add(mod(100))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())
    return model

def entreno(batch_size, num_epochs, X_train, labels_train, model):
    
    X_valid, y_valid = X_train[:batch_size], labels_train[:batch_size]  # first batch_size samples
    X_train2, y_train2 = X_train[batch_size:], labels_train[batch_size:]  # rest for training

    model.fit(X_train2, y_train2,
              validation_data=(X_valid, y_valid),
              batch_size=batch_size, epochs=num_epochs)
    return model

def evaluacion(model, X_test, labels_test):        
    print("Test accuracy:", model.evaluate(X_test, labels_test, verbose=0)[1]) 

In [6]:
for mod in [LSTM, GRUV2, SimpleRNN]:
    print("\nNuevo modelo\n")
    model = crear_model(mod, 32, len(vocabulary), maxlen)    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("\nComienza entrenamiento\n")
    entreno(64, 3, features_train, labels_train, model)
    print("\nComienza evaluación\n")
    evaluacion(model, features_test, labels_test)


Nuevo modelo

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 57, 32)            499840    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 553,141
Trainable params: 553,141
Non-trainable params: 0
_________________________________________________________________
None

Comienza entrenamiento

Epoch 1/3
Epoch 2/3
Epoch 3/3

Comienza evaluación

Test accuracy: 0.7946345806121826

Nuevo modelo

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 57, 32)            499840    
______________

## RNN + word2vec Embedding

In [7]:
import gensim
import multiprocessing as mp

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
)
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [8]:
def pipeline_word2vec(X_train, X_test):
    # WORD2VEC
    W2V_SIZE = 300
    W2V_WINDOW = 7
    # 32
    W2V_EPOCH = 5
    W2V_MIN_COUNT = 1
    # KERAS
    SEQUENCE_LENGTH = 500
    
    def generate_tokenizer(train_df):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(train_df)
        vocab_size = len(tokenizer.word_index) + 1
        print(f"Total words: {vocab_size}")
        return tokenizer, vocab_size
    
    def generate_word2vec(train_df):
        documents = [_text.split() for _text in train_df.review]
        w2v_model = gensim.models.word2vec.Word2Vec(
            vector_size=W2V_SIZE,
            window=W2V_WINDOW,
            min_count=W2V_MIN_COUNT,
            workers=mp.cpu_count(),
        )
        w2v_model.build_vocab(documents)

        words = list(w2v_model.wv.index_to_key)
        vocab_size = len(words)
        print(f"Vocab size: {vocab_size}")
        w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

        return w2v_model
    
    def generate_embedding(word2vec_model, vocab_size, tokenizer):
        embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
        for word, i in tokenizer.word_index.items():
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
        return Embedding(
            vocab_size,
            W2V_SIZE,
            weights=[embedding_matrix],
            input_length=SEQUENCE_LENGTH,
            trainable=False,
        )
    
    X_train_words = pd.DataFrame(list(X_train), columns=["review"])
    X_test_words = pd.DataFrame(list(X_test), columns=["review"])
    
    tokenizer, vocab = generate_tokenizer(X_train_words.review)
    
    word2vec_model = generate_word2vec(X_train_words)
    
    max_words = 500

    X_train_words = sequence.pad_sequences(
        tokenizer.texts_to_sequences(X_train_words.review), maxlen=max_words)
    X_test_words = sequence.pad_sequences(
        tokenizer.texts_to_sequences(X_test_words.review), maxlen=max_words)
    
    embedding_layer = generate_embedding(word2vec_model, vocab, tokenizer)
   
    return embedding_layer, X_train_words, X_test_words

In [9]:
embedding_layer, X_train_words, X_test_words = pipeline_word2vec(X_train, X_test)

Total words: 25649
Vocab size: 56278


In [10]:
model_custom = Sequential()
model_custom.add(embedding_layer)
model_custom.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_custom.add(Dense(1, activation="sigmoid"))

In [11]:
model_custom.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
batch_size = 64
num_epochs = 5

X_train_words_valid, y_valid = X_train_words[:batch_size], labels_train[:batch_size]  # first batch_size samples
X_train_words2, y_train2 = X_train_words[batch_size:], labels_train[batch_size:]  # rest for training

model_custom.fit(X_train_words2, y_train2,
          validation_data=(X_train_words_valid, y_valid),
          batch_size=batch_size, epochs=num_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f481a9d81c0>

In [41]:
evaluacion(model_custom, X_test_words, labels_test)

Test accuracy: 0.7132284641265869


# Parte 4 - Reporte de métricas y conclusiones

### LSTM, GRUV2
##### Vemos que en este caso el mejor modelo ha sido GRUV2 con:
 * val_accuracy: 0.8125
 * Test accuracy: 0.7987974286079407.

#### Sin embargo no mejora el resultado obtenido con la regresion logística en el notebook anterios.

In [37]:
df = pd.DataFrame(columns = ['c_params', 'train', 'test'])

for c in [0.01, 0.02, 0.03, 0.04, 0.05, 0.25, 0.5, 1, 10, 100, 1000]:
    
    lr = LogisticRegression(C = c, solver = 'lbfgs', max_iter = 500)
    lr.fit(features_train_gradient, labels_train_gradient)
    
    df = df.append({'c_params' : c,
                    'train' : accuracy_score(labels_train_gradient, 
                                             lr.predict(features_train_gradient)),
                    'test' : accuracy_score(labels_test_gradient,
                                            lr.predict(features_test_gradient))},
                   ignore_index = True)

print(df.sort_values('test').iloc[-1])

c_params    10.000000
train        0.952815
test         0.806198
Name: 8, dtype: float64


In [39]:
lr = LogisticRegression(C = 10, solver = 'lbfgs', max_iter = 500)
lr.fit(features_train_gradient, labels_train_gradient)
    
print(f'Confussion matrix:\n{confusion_matrix(labels_test_gradient, lr.predict(features_test_gradient))}')
print(f'\nClassification report:\n{classification_report(labels_test_gradient, lr.predict(features_test_gradient))}')
print(f'Accuracy score:{accuracy_score(labels_test_gradient, lr.predict(features_test_gradient))}')

Confussion matrix:
[[874 196]
 [223 869]]

Classification report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1070
           1       0.82      0.80      0.81      1092

    accuracy                           0.81      2162
   macro avg       0.81      0.81      0.81      2162
weighted avg       0.81      0.81      0.81      2162

Accuracy score:0.8061979648473635


#### Se observa que tampoco mejora el resultado. Además vemos que existe overfitting
 * train        0.952815
 * test         0.806198