# 3 Entrenamiento y test de modelos

In [1]:
import pandas as pd
import numpy as np

### 3.0 Copio los archivos del Drive


In [2]:
from google.colab import drive

# Mount Google Drive at the path below (a forced re-mount is requested)
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [3]:
# Copy the preprocessed dataset from Google Drive into /content

!cp "/content/drive/My Drive/Video_Games_Preprocessed.csv" "/content/Video_Games_Preprocessed.csv"

# List /content to verify that the file was copied correctly
!ls -l "/content/"

total 19596
drwx------ 5 root root     4096 Mar 28 15:40 drive
drwxr-xr-x 1 root root     4096 Mar 26 13:28 sample_data
-rw------- 1 root root 20054410 Mar 28 15:40 Video_Games_Preprocessed.csv


### 3.1 Carga de los datos generados en el notebook anterior

In [4]:
# Location of the preprocessed reviews CSV inside /content
ruta_archivo_csv = "/content/Video_Games_Preprocessed.csv"

# Read the CSV into a DataFrame and print its first rows as a quick check
df = pd.read_csv(filepath_or_buffer=ruta_archivo_csv)
print(df.head(n=5))

   overall                                         reviewText  sentiment  \
0      4.0  I had to learn the hard way after ordering thi...          1   
1      4.0  I would recommend this learning game for anyon...          1   
2      5.0  Choose your career which sets your money for t...          1   
3      5.0  It took a few hours to get this up and running...          1   
4      5.0  I oredered this for a daughter who is now 33 a...          1   

                                              tokens  
0  ['learn', 'hard', 'way', 'ordering', 'macbook'...  
1  ['would', 'recommend', 'learning', 'game', 'an...  
2  ['choose', 'career', 'set', 'money', 'trip', '...  
3  ['took', 'hour', 'get', 'running', 'window', '...  
4  ['oredered', 'daughter', 'wanted', 'play', 'or...  


### 3.2 División en train y test | Preparación de los datos

In [5]:
# Split the reviews into train and test sets, then turn the raw text into
# numeric bag-of-words feature vectors that the models can process.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Replace NaN reviews with an empty string to avoid errors downstream
df['reviewText'] = df['reviewText'].fillna(value='')

# Features (review text) and labels (sentiment)
X, y = df['reviewText'], df['sentiment']

# Hold out 20% of the rows as the test set; the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag of Words: the vectorizer is fitted on the training texts only and the
# fitted vectorizer is then reused to encode the test texts
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


### 3.3 Seleccion y entrenamiento de modelos simples
Selecciono los siguientes modelos:
*   *Regresión logística* por razones fundamentales y por los beneficios prácticos que el modelo ofrece. Podemos destacar que es un modelo basado en probabilidad, flexible, con buen rendimiento en espacios de características grandes, robusto y eficiente en costos de cómputo.
*   *Naive Bayes* por varias razones estratégicas y prácticas relacionadas con las propiedades de este modelo. Entre ellas podemos destacar su simplicidad y eficiencia, que está basado en el teorema de Bayes, su buen rendimiento en datos de texto y que es rápido de entrenar.




In [6]:
# Importar librerias necesarias

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [7]:
# Feature scaling

# with_mean=False is required because the inputs are sparse matrices.
# The scaler is fitted on the training matrix only and then applied to both sets.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train_vectorized)
X_train_scaled = scaler.transform(X_train_vectorized)
X_test_scaled = scaler.transform(X_test_vectorized)


In [None]:
# Train a logistic regression classifier on the scaled training features.
# The 'saga' solver shuffles the data while optimizing, so random_state is
# fixed (same seed, 42, as the train/test split) to make the fit reproducible.

model_lr = LogisticRegression(solver='saga', max_iter=100000, random_state=42)
model_lr.fit(X_train_scaled, y_train)

In [None]:
# Train a Naive Bayes model (MultinomialNB) on the scaled training features.
# NOTE(review): the input is the StandardScaler output rather than the raw counts — confirm this is intended.

model_nb = MultinomialNB()
model_nb.fit(X_train_scaled, y_train)

In [None]:
# Save both trained models for later use.
# The file names are relative, so the files are written to the current working directory.

from joblib import dump

dump(model_lr, 'model_lr.pkl')
dump(model_nb, 'model_nb.pkl')


['model_nb.pkl']

In [None]:
# Copy the saved model files from /content to Google Drive

!cp "/content/model_lr.pkl" "/content/drive/My Drive/model_lr.pkl"
!cp "/content/model_nb.pkl" "/content/drive/My Drive/model_nb.pkl"

In [None]:
# Persist the processed feature matrices and the test labels

import numpy as np
from scipy.sparse import save_npz, load_npz

# The scaled train/test matrices are written as sparse .npz files
for npz_path, sparse_features in (
    ('/content/X_train_scaled.npz', X_train_scaled),
    ('/content/X_test_scaled.npz', X_test_scaled),
):
    save_npz(npz_path, sparse_features)

# The test labels are written as a .npy file
np.save('/content/y_test.npy', y_test)


In [None]:
# Copy the saved feature matrices and test labels from /content to Google Drive
!cp "/content/X_train_scaled.npz" "/content/drive/My Drive/X_train_scaled.npz"
!cp "/content/X_test_scaled.npz" "/content/drive/My Drive/X_test_scaled.npz"
!cp "/content/y_test.npy" "/content/drive/My Drive/y_test.npy"

### 3.4 Evaluación de los Modelos

In [6]:
# Preprocessing for the RNN and LSTM models

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenization: the tokenizer is fitted on the training texts and keeps only
# the 500 most common words
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(X_train)

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

# Padding: the target length is the longest training sequence
# (consider capping it if that turns out to be very large)
max_length = max(map(len, X_train_tokens))

# Both sets are padded at the end (padding='post') to that same length
X_train_pad, X_test_pad = (
    pad_sequences(token_seqs, maxlen=max_length, padding='post')
    for token_seqs in (X_train_tokens, X_test_tokens)
)

In [10]:
# Save X_test_pad to a .npy file for later use

np.save('/content/X_test_pad.npy', X_test_pad)

In [11]:
# Copy the saved X_test_pad file from /content to Google Drive for later use

!cp "/content/X_test_pad.npy" "/content/drive/My Drive/X_test_pad.npy"

In [21]:
# Build and train a simple RNN classifier
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential

# Stack: embedding over the 500-word vocabulary -> SimpleRNN -> sigmoid output unit
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=500, output_dim=64, input_length=max_length))
model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(1, activation='sigmoid'))

model_rnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_rnn.summary()

# Training: 5 epochs; the padded test set is passed as validation data
model_rnn.fit(
    X_train_pad,
    y_train,
    validation_data=(X_test_pad, y_test),
    epochs=5,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2351, 64)          32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 40321 (157.50 KB)
Trainable params: 40321 (157.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c15ac376050>

In [23]:
# Save the trained RNN model to /content
model_rnn.save('/content/model_rnn.keras')

In [24]:
# Copy the saved RNN model file from /content to Google Drive
!cp "/content/model_rnn.keras" "/content/drive/My Drive/model_rnn.keras"

In [None]:
# Build and train an LSTM classifier

from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential

# Stack: embedding over the 500-word vocabulary -> LSTM -> sigmoid output unit
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=500, output_dim=64, input_length=max_length))
model_lstm.add(LSTM(64))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_lstm.summary()

# Training: 10 epochs; the padded test set is passed as validation data
model_lstm.fit(
    X_train_pad,
    y_train,
    validation_data=(X_test_pad, y_test),
    epochs=10,
)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2351, 64)          32000     
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 65089 (254.25 KB)
Trainable params: 65089 (254.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x792d892d5e70>

In [None]:
# Save the trained LSTM model to /content
# NOTE(review): this file uses the '.h5' extension while the RNN model is saved as '.keras' — confirm the difference is intentional.
model_lstm.save('/content/model_lstm.h5')

In [None]:
# Copy the saved LSTM model file from /content to Google Drive
!cp "/content/model_lstm.h5" "/content/drive/My Drive/model_lstm.h5"