# 3. Etapa de entrenamiento y testeo de un modelo de análisis de sentimiento

In [33]:
from datasets import load_dataset
import pandas as pd
import nltk
import numpy as np
import matplotlib as plt
import multiprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, TreebankWordTokenizer, RegexpTokenizer
from nltk import ngrams
from nltk.probability import FreqDist
from collections import Counter
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from time import time
from stop_words import get_stop_words
import unicodedata
import re

from sklearn.model_selection import train_test_split # Modelado
from sklearn.pipeline import Pipeline # Modelado
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Modelado
from sklearn.feature_selection import chi2 # Reporte
from sklearn.linear_model import LogisticRegression # Reporte
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve # Reporte

import matplotlib.pyplot as plt
import joblib



In [34]:
# Importamos los datos
path = r'C:\Users\CARLES\1.CARLES\FORMACIONES\BOOTCAMP KEEPCODING\10. NLP'

clean_df = pd.read_csv(f'{path}\\reduced_df_clean_nlp.csv')

# Modelado

# Separamos en conjunto de train y test

Creamos los conjuntos de entrenamiento (75% del total) y test (25%).

In [35]:
# Como mas adelante vemos que los Nan estan dando error, los vamos a eliminar
clean_df = clean_df.dropna(subset=['processedReview'])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    clean_df['processedReview'],
    clean_df['is_negative_sentiment'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True
)

X_train.to_csv(r'C:\Users\CARLES\1.CARLES\FORMACIONES\BOOTCAMP KEEPCODING\10. NLP\X_train.csv', index = False)
X_test.to_csv(r'C:\Users\CARLES\1.CARLES\FORMACIONES\BOOTCAMP KEEPCODING\10. NLP\X_test.csv', index = False)
y_train.to_csv(r'C:\Users\CARLES\1.CARLES\FORMACIONES\BOOTCAMP KEEPCODING\10. NLP\y_train.csv', index = False)
y_test.to_csv(r'C:\Users\CARLES\1.CARLES\FORMACIONES\BOOTCAMP KEEPCODING\10. NLP\y_test.csv', index = False)


In [37]:
X_train.iloc[:10]

343241                                   like fast delivery
145064             worth rip hair dollar store quality hair
299900                                 great fun play price
460812                    great year old mickey mouse party
280141    bought pen tried different one loved concept w...
64064     product seems dry coming leave clean smooth li...
459291                                nice sturdy wife love
353847    excellent dry use applying la prairie serum ce...
411650    lightweight simply designed end curved allow u...
38306     good epilator still pas multiple time get ever...
Name: processedReview, dtype: object

In [38]:
y_train.iloc[:10]

343241    0
145064    1
299900    0
460812    0
280141    0
64064     1
459291    0
353847    0
411650    0
38306     0
Name: is_negative_sentiment, dtype: int64

Extracción de features

In [39]:
cv = TfidfVectorizer(
    max_df=0.95,
    min_df=3,
    max_features=10000,
    strip_accents='ascii',
    ngram_range=(1, 1)
)
cv.fit(X_train)

# Nombre de archivo donde se guardará el TfidfVectorizer
archivo_cv = 'cv_tfidf_vectorizer.pkl'

# Guardar el TfidfVectorizer
joblib.dump(cv, archivo_cv)

['cv_tfidf_vectorizer.pkl']

El TfidfVectorizer se configura con max_df=0.95 y min_df=3 para eliminar términos muy comunes y muy raros, reduciendo ruido y dimensionalidad. 
Se limitan las características a 10,000 palabras (max_features=10000) para mantener un buen equilibrio entre información y manejabilidad. 
La opción strip_accents='ascii' normaliza el texto eliminando acentos. 
ngram_range=(1, 1) se usa para enfocarse en unigramas

In [40]:
print(list(cv.vocabulary_.items())[:20])

[('like', 4926), ('fast', 3237), ('delivery', 2227), ('worth', 9901), ('rip', 7303), ('hair', 3968), ('dollar', 2564), ('store', 8425), ('quality', 6787), ('great', 3892), ('fun', 3637), ('play', 6383), ('price', 6600), ('year', 9948), ('old', 5885), ('mickey', 5384), ('mouse', 5566), ('party', 6122), ('bought', 955), ('pen', 6193)]


In [41]:
print(len(cv.vocabulary_))

10000


# Entrenamiento

In [42]:
# Transformación de Características de Texto con TF-IDF
X_train_ = cv.transform(X_train)
X_test_ = cv.transform(X_test)

Regresion Logistica

In [43]:
c_params = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 1000, 10000]

train_acc = list()
test_acc = list()
for c in c_params:
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=500)
    lr.fit(X_train_, y_train)

    train_predict = lr.predict(X_train_)
    test_predict = lr.predict(X_test_)

    print ("Accuracy for C={}: {}".format(c, accuracy_score(y_test, test_predict)))

    train_acc.append(accuracy_score(y_train, train_predict))
    test_acc.append(accuracy_score(y_test, test_predict))

# Nombre de archivo donde se guardará el modelo
archivo_c_params = 'c_params.pkl'
archivo_modelo = 'logistic_regression_model_nlp.pkl'
archivo_predicciones = 'predicciones_test_lr.pkl'
archivo_train_acc = 'train_acc_lr.pkl'
archivo_test_acc = 'test_acc_lr.pkl'

# Guardar el modelo y las predicciones
joblib.dump(c_params, archivo_c_params)
joblib.dump(lr, archivo_modelo)
joblib.dump(test_predict, archivo_predicciones)
joblib.dump(train_acc, archivo_train_acc)
joblib.dump(test_acc, archivo_test_acc)


Accuracy for C=0.01: 0.7986858931215507
Accuracy for C=0.05: 0.8287033921119935
Accuracy for C=0.25: 0.8383783147126127
Accuracy for C=0.5: 0.8398758244716651
Accuracy for C=1: 0.8401113878045497


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=10: 0.8383951406649617


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=100: 0.8367546103109436


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=1000: 0.8364349172163145
Accuracy for C=10000: 0.8366957194777225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['test_acc_lr.pkl']

In [44]:
# Metricas sobre los resultados

print('Confussion matrix:\n{}'.format(confusion_matrix(y_test, test_predict)))
print('\nClassification report:\n{}'.format(classification_report(y_test, test_predict)))
print('Accuracy score:{}'.format(accuracy_score(y_test, test_predict)))

Confussion matrix:
[[75188  7726]
 [11685 24265]]

Classification report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89     82914
           1       0.76      0.67      0.71     35950

    accuracy                           0.84    118864
   macro avg       0.81      0.79      0.80    118864
weighted avg       0.83      0.84      0.83    118864

Accuracy score:0.8366957194777225


REDES NEURONALES

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Separar los datos en características (X) y etiquetas (y)
X = clean_df['processedReview']
y = clean_df['is_negative_sentiment']

max_words = 1000  # Número máximo de palabras a considerar
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Padding para hacer todas las secuencias del mismo tamaño
maxlen = 20  # Longitud máxima de las secuencias
X_padded = pad_sequences(X_sequences, maxlen=maxlen)

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.metrics import Precision, Recall

# Crear el modelo de red neuronal
model = Sequential()

# Capa de Embedding
embedding_dim = 16
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen))

# Capa LSTM
model.add(LSTM(units=100))

# Capas densas
model.add(Dense(units=1, activation='sigmoid'))  # Salida lineal para regresión

# Compilar el modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', Precision(), Recall()])

# Resumen del modelo
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 16)            16000     
                                                                 
 lstm_1 (LSTM)               (None, 100)               46800     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 62901 (245.71 KB)
Trainable params: 62901 (245.71 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [47]:
# Entrenar el modelo
epochs = 3
batch_size = 64

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]  # first batch_size samples
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]  # rest for training

model.fit(X_train2, y_train2,
          validation_data=(X_valid, y_valid),
          batch_size=batch_size, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2a4d818fdc0>

In [48]:
# Evaluar el modelo en el conjunto de prueba
scores = model.evaluate(X_test, y_test, verbose=0)  # Devuelve la pérdida y otras métricas especificadas en model.compile()

# Obtener los nombres de las métricas
metrics_names = model.metrics_names

# Imprimir cada métrica con su nombre correspondiente
for name, score in zip(metrics_names, scores):
    print(f"{name}: {score}")

loss: 0.3746207058429718
accuracy: 0.8264924883842468
precision_1: 0.7607714533805847
recall_1: 0.621552050113678


Guardamos el modelo de Redes Neuronales

In [49]:
import os
from tensorflow.keras.models import load_model

# Definir el directorio de almacenamiento del modelo
cache_dir = './models'
os.makedirs(cache_dir, exist_ok=True)
model_file = "lstm_model_nlp.keras"  # Formato nativo de Keras

# Guardar el modelo en el formato nativo de Keras
model.save(os.path.join(cache_dir, model_file))