# Excercise 5
## NLP with Pytorch 🔥

In [1]:
!pip install tensorflow



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Use keras framework to solve the below exercises.


In [4]:
import numpy as np
import keras
import pandas as pd
import matplotlib.pyplot as plt

## 5.1 Predict rating of a movie using Keras

**Exercise:** Use keras framework to predict rating.

In [5]:
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

In [6]:
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [7]:
plots

Unnamed: 0,plot
3107,most is the story of a single father who takes...
900,a serial killer decides to teach the secrets o...
6724,"in sweden , a female blackmailer with a disfi..."
4704,"in a friday afternoon in new york , the presi..."
2582,"in los angeles , the editor of a publishing h..."
...,...
8417,""" our marriage , their wedding . "" it ' s l..."
1592,"the wandering barbarian , conan , alongside ..."
1723,"like a tale spun by scheherazade , kismet fol..."
7605,"mrs . brisby , a widowed mouse , lives in a..."


In [8]:
y

Unnamed: 0,rating
3107,1
900,0
6724,1
4704,1
2582,1
...,...
8417,0
1592,0
1723,0
7605,1


## Data Precosessing

- Remove stopwords
- Lowercase
- split the text in words
- pad_sequences

In [26]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import nltk
from nltk.corpus import stopwords

# Descargar las stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Cargar y preprocesar los datos

plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [28]:

# Preprocesamiento para eliminar stopwords y convertir a minúsculas
def preprocess_text(text):
    text = text.lower()  # Convertir a minúsculas
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Eliminar caracteres no alfabéticos
    tokens = text.split()  # Tokenizar el texto
    tokens = [word for word in tokens if word not in stop_words]  # Eliminar stopwords
    return ' '.join(tokens)

# Aplicar el preprocesamiento a las tramas
plots_clean = plots.apply(preprocess_text)


In [60]:
# Usar TfidfVectorizer con el texto preprocesado
vectorizer = TfidfVectorizer(max_features=1350)  # Puedes ajustar max_features según tu necesidad
X = vectorizer.fit_transform(plots_clean).toarray()

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Build Model

Create a neural network to predict the rating of a movie, calculate the testing set accuracy.

In [61]:
# Definir un Dataset personalizado para PyTorch
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Crear DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [62]:
# Definir el modelo de red neuronal
class TextClassificationModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)

In [63]:
# Parámetros del modelo
input_dim = X.shape[1]  # Número de características del vectorizador
hidden_dim = 128
output_dim = 1

# Inicializar el modelo, la función de pérdida y el optimizador
model = TextClassificationModel(input_dim, hidden_dim, output_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# Entrenamiento del modelo
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Evaluación en el conjunto de prueba
model.eval()
all_labels = []
all_predictions = []
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        predicted = (outputs.squeeze() > 0.5).float()
        all_labels.extend(labels.numpy())
        all_predictions.extend(outputs.squeeze().numpy())

# Calcular métricas
accuracy = accuracy_score(all_labels, (np.array(all_predictions) > 0.5).astype(int))
precision = precision_score(all_labels, (np.array(all_predictions) > 0.5).astype(int), average='weighted')
recall = recall_score(all_labels, (np.array(all_predictions) > 0.5).astype(int), average='weighted')
f1 = f1_score(all_labels, (np.array(all_predictions) > 0.5).astype(int), average='weighted')
auc = roc_auc_score(all_labels, all_predictions)

# Resultados
results = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
    "auc": auc
}

print(results)


Epoch [1/100], Loss: 0.6951
Epoch [2/100], Loss: 0.6941
Epoch [3/100], Loss: 0.6932
Epoch [4/100], Loss: 0.6923
Epoch [5/100], Loss: 0.6913
Epoch [6/100], Loss: 0.6902
Epoch [7/100], Loss: 0.6891
Epoch [8/100], Loss: 0.6879
Epoch [9/100], Loss: 0.6866
Epoch [10/100], Loss: 0.6852
Epoch [11/100], Loss: 0.6838
Epoch [12/100], Loss: 0.6823
Epoch [13/100], Loss: 0.6806
Epoch [14/100], Loss: 0.6789
Epoch [15/100], Loss: 0.6770
Epoch [16/100], Loss: 0.6751
Epoch [17/100], Loss: 0.6730
Epoch [18/100], Loss: 0.6709
Epoch [19/100], Loss: 0.6686
Epoch [20/100], Loss: 0.6663
Epoch [21/100], Loss: 0.6639
Epoch [22/100], Loss: 0.6614
Epoch [23/100], Loss: 0.6588
Epoch [24/100], Loss: 0.6561
Epoch [25/100], Loss: 0.6534
Epoch [26/100], Loss: 0.6507
Epoch [27/100], Loss: 0.6479
Epoch [28/100], Loss: 0.6452
Epoch [29/100], Loss: 0.6424
Epoch [30/100], Loss: 0.6396
Epoch [31/100], Loss: 0.6367
Epoch [32/100], Loss: 0.6339
Epoch [33/100], Loss: 0.6310
Epoch [34/100], Loss: 0.6282
Epoch [35/100], Loss: 0