## MODEL TESTING


In [2]:
import pandas as pd
import os

In [3]:
os.chdir("/mnt/c/Users/diego/OneDrive/Documentos/Data science projects/nlp 2/petsentiment_analysis")

In [None]:
data = pd.read_parquet("data/processed/train_data2.parquet")
data.head()

Unnamed: 0,rating,text,parent_asin,user_id,timestamp,Sentiment_target,word_count,processed_text
0,2.0,When I bought this I thought it would be effec...,B0C69JDLGY,AFASI57RME5JYIZMK2DOSMXVNFCA,2023-01-04 18:24:55.647000+00:00,0,96,when i bought this i thought it would be effec...
1,4.0,Says it holds 35 lbs but my 31 lb bag of food ...,B09CJVFHBD,AEIHJFJYIR4GRY6UZK65LJU2NUVA,2023-03-15 03:19:27.473000+00:00,1,15,says it holds 35 lbs but my 31 lb bag of food ...
2,2.0,I was hopeful that this would solve the trick ...,B07KRPLQ8P,AEYB5YOWHB7V72J3EG4IF6LVZL4A,2023-01-25 17:50:59.419000+00:00,0,42,i was hopeful that this would solve the trick ...
3,5.0,"Had to have this, even before I have the kitty...",B0BKKZZJX8,AFOZLZHEHJQDFX5XAFD7A4JWNIWA,2023-02-25 22:43:19.764000+00:00,1,23,had to have this even before i have the kitty ...
4,5.0,Perfect for my elderly cat who has an occasion...,B09T7P3Q7G,AHXPT5Q4HABHQ7P24Y7SRJISFJKQ,2023-06-25 00:02:14.844000+00:00,1,34,perfect for my elderly cat who has an occasion...


In [None]:
data.shape

(176800, 8)

In [None]:
data["text"].values[0]

"When I bought this I thought it would be effective in cutting my dog's nails. She already hates her paws being touched but when I pulled this out she would start to cry. I only managed to use it once because it was so difficult to do anything it with it. I couldn't see if where I was cutting was the space I wanted. I would have to maneuver myself in different directions because this thing was so big and blocked space. I'm assuming this is for bigger dogs but for medium to small, not recommended."

In [None]:
data["processed_text"].values[0]

'when i bought this i thought it would be effective in cutting my dogs nails she already hates her paws being touched but when i pulled this out she would start to cry i only managed to use it once because it was so difficult to do anything it with it i could nt see if where i was cutting was the space i wanted i would have to maneuver myself in different directions because this thing was so big and blocked space i m assuming this is for bigger dogs but for medium to small not recommended'

In [None]:
data["rating"].value_counts()

rating
2.0    45364
1.0    45306
5.0    43642
4.0    42488
Name: count, dtype: int64

In [None]:
import torch
import umap
import plotly.express as px
import pandas as pd
import re
import joblib
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from tqdm import tqdm

# Load model and tokenizer
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
save_dir = "models"

# Load dataset and take a sample of 5000 data points
df_sample = data.sample(n=10000, random_state=42).reset_index(drop=True)

# Function to clean text (without lemmatization or stopwords)

# Parameters for efficient processing
BATCH_SIZE = 6
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to obtain embeddings in batches
def get_embeddings(texts, batch_size=16):
    all_embeddings = []
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch_texts = texts[i:i + batch_size].tolist()
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Extract [CLS] token
            all_embeddings.extend(batch_embeddings)
    return all_embeddings

# Obtain embeddings with batch processing
embeddings = get_embeddings(df_sample["processed_text"], BATCH_SIZE)

# Save embeddings in a CSV file
df_embeddings = pd.DataFrame(embeddings)
df_embeddings["Sentiment_target"] = df_sample["Sentiment_target"]
df_embeddings.to_csv("embeddings.csv", index=False)

# Normalize embeddings
scaled_embeddings = StandardScaler().fit_transform(embeddings)

# Apply PCA before UMAP
pca = PCA(n_components=50, random_state=42)
pca_embeddings = pca.fit_transform(scaled_embeddings)

# Reduce dimensionality with UMAP
reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
low_dim_embeddings = reducer.fit_transform(pca_embeddings)

# Create DataFrame for visualization
df_umap = pd.DataFrame(low_dim_embeddings, columns=["Dim 1", "Dim 2"])
df_umap["Sentiment_target"] = df_sample["Sentiment_target"].astype(str)

# Visualization with Plotly
fig = px.scatter(df_umap, x="Dim 1", y="Dim 2", color="Sentiment_target",
                 title="Embedding Visualization with BERT/RoBERTa (PCA + UMAP)",
                 template="plotly_dark", opacity=0.7)
fig.update_traces(marker=dict(size=4))
fig.show()

# Train models with the generated embeddings
X = scaled_embeddings
y = df_sample["Sentiment_target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, os.path.join(save_dir, "xgboost_model.pkl"))

# Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
joblib.dump(nb_model, os.path.join(save_dir, "naive_bayes_model.pkl"))

# Model evaluation
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nResults for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

# Evaluate XGBoost
evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# Evaluate Naive Bayes
evaluate_model(nb_model, X_test, y_test, "Naive Bayes")

print("Models successfully evaluated.")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Procesando lotes: 100%|██████████| 1667/1667 [00:45<00:00, 36.71it/s]

'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




Parameters: { "use_label_encoder" } are not used.





Resultados de XGBoost:
Accuracy: 0.8375
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      1019
           1       0.85      0.81      0.83       981

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000


Resultados de Naive Bayes:
Accuracy: 0.8010
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      1019
           1       0.82      0.77      0.79       981

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000

Modelos evaluados con éxito.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import re
import joblib
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

# Load dataset and take a sample
df_sample = data.sample(n=50000, random_state=42).reset_index(drop=True)

# Function to obtain embeddings (optimized)
def get_embeddings(texts, batch_size=16):
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch = texts[i:i+batch_size].tolist()
            inputs = tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=128, 
                return_tensors="pt"
            ).to(device)
            
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(embeddings)
    return np.concatenate(all_embeddings, axis=0)

# Obtain embeddings
embeddings = get_embeddings(df_sample["processed_text"], batch_size=8)

# Improved normalization
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

# Train-test split (with stratify)
X_train, X_test, y_train, y_test = train_test_split(
    scaled_embeddings, 
    df_sample["Sentiment_target"], 
    test_size=0.2, 
    random_state=42,
    stratify=df_sample["Sentiment_target"]  # Better class balance
)

# Conversion to tensors (optimized)
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train.values, dtype=torch.long)
)
test_dataset = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test.values, dtype=torch.long)  # Key correction here
)

# DataLoader for more efficient training
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Improved model
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.BatchNorm1d(hidden_dim//2),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(hidden_dim//2, output_dim)
        )
        
    def forward(self, x):
        return self.net(x)

# Improved parameters
input_dim = X_train.shape[1]
hidden_dim = 256
output_dim = len(df_sample["Sentiment_target"].unique())

model_nn = SentimentClassifier(input_dim, hidden_dim, output_dim).to(device)

# Loss function with class balance (if imbalance exists)
class_weights = torch.tensor(
    len(y_train) / (output_dim * np.bincount(y_train)),
    dtype=torch.float32
).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Improved optimizer
optimizer = optim.AdamW(model_nn.parameters(), lr=0.0005, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

# Optimized training
epochs = 40
best_accuracy = 0

for epoch in range(epochs):
    model_nn.train()
    total_loss = 0
    correct = 0
    
    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model_nn(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        nn.utils.clip_grad_norm_(model_nn.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        
        total_loss += loss.item()
        correct += (outputs.argmax(dim=1) == y_batch).sum().item()
    
    # Validation
    model_nn.eval()
    val_correct = 0
    with torch.no_grad():
        for X_val, y_val in test_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            outputs = model_nn(X_val)
            val_correct += (outputs.argmax(dim=1) == y_val).sum().item()
    
    train_acc = correct / len(train_dataset)
    val_acc = val_correct / len(test_dataset)
    scheduler.step(total_loss)
    
    # Save best model
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model_nn.state_dict(), "best_model.pth")
    
    print(f"Epoch {epoch+1}: Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

# Load best model
model_nn.load_state_dict(torch.load("best_model.pth"))

# Final evaluation
model_nn.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_val, y_val in test_loader:
        X_val = X_val.to(device)
        outputs = model_nn(X_val)
        y_pred.extend(outputs.argmax(dim=1).cpu().numpy())
        y_true.extend(y_val.cpu().numpy())

print(f"\nBest Test Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(classification_report(y_true, y_pred))

# Save final model
model_path = os.path.join("models","FNN_model.pth")
torch.save(model_nn.state_dict(), model_path)
print("Training successfully completed!")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Procesando lotes: 100%|██████████| 6250/6250 [04:25<00:00, 23.54it/s]
Epoch 1/40: 100%|██████████| 625/625 [00:02<00:00, 284.90it/s]


Epoch 1: Loss: 213.1518 | Train Acc: 0.8525 | Val Acc: 0.8732


Epoch 2/40: 100%|██████████| 625/625 [00:02<00:00, 283.04it/s]


Epoch 2: Loss: 187.9404 | Train Acc: 0.8722 | Val Acc: 0.8810


Epoch 3/40: 100%|██████████| 625/625 [00:02<00:00, 296.08it/s]


Epoch 3: Loss: 177.8846 | Train Acc: 0.8801 | Val Acc: 0.8824


Epoch 4/40: 100%|██████████| 625/625 [00:02<00:00, 277.31it/s]


Epoch 4: Loss: 172.5096 | Train Acc: 0.8832 | Val Acc: 0.8799


Epoch 5/40: 100%|██████████| 625/625 [00:02<00:00, 286.34it/s]


Epoch 5: Loss: 167.6784 | Train Acc: 0.8864 | Val Acc: 0.8845


Epoch 6/40: 100%|██████████| 625/625 [00:02<00:00, 296.62it/s]


Epoch 6: Loss: 160.3158 | Train Acc: 0.8920 | Val Acc: 0.8810


Epoch 7/40: 100%|██████████| 625/625 [00:01<00:00, 317.22it/s]


Epoch 7: Loss: 155.4456 | Train Acc: 0.8943 | Val Acc: 0.8853


Epoch 8/40: 100%|██████████| 625/625 [00:01<00:00, 324.06it/s]


Epoch 8: Loss: 150.0330 | Train Acc: 0.8982 | Val Acc: 0.8846


Epoch 9/40: 100%|██████████| 625/625 [00:01<00:00, 323.22it/s]


Epoch 9: Loss: 146.1300 | Train Acc: 0.9010 | Val Acc: 0.8825


Epoch 10/40: 100%|██████████| 625/625 [00:02<00:00, 295.56it/s]


Epoch 10: Loss: 142.0450 | Train Acc: 0.9028 | Val Acc: 0.8834


Epoch 11/40: 100%|██████████| 625/625 [00:02<00:00, 283.15it/s]


Epoch 11: Loss: 135.9961 | Train Acc: 0.9087 | Val Acc: 0.8823


Epoch 12/40: 100%|██████████| 625/625 [00:02<00:00, 264.87it/s]


Epoch 12: Loss: 132.4573 | Train Acc: 0.9093 | Val Acc: 0.8813


Epoch 13/40: 100%|██████████| 625/625 [00:02<00:00, 277.47it/s]


Epoch 13: Loss: 126.9263 | Train Acc: 0.9143 | Val Acc: 0.8862


Epoch 14/40: 100%|██████████| 625/625 [00:02<00:00, 254.53it/s]


Epoch 14: Loss: 122.0574 | Train Acc: 0.9181 | Val Acc: 0.8822


Epoch 15/40: 100%|██████████| 625/625 [00:03<00:00, 164.98it/s]


Epoch 15: Loss: 120.3851 | Train Acc: 0.9181 | Val Acc: 0.8849


Epoch 16/40: 100%|██████████| 625/625 [00:02<00:00, 269.31it/s]


Epoch 16: Loss: 115.6987 | Train Acc: 0.9235 | Val Acc: 0.8787


Epoch 17/40: 100%|██████████| 625/625 [00:02<00:00, 265.72it/s]


Epoch 17: Loss: 112.2411 | Train Acc: 0.9251 | Val Acc: 0.8799


Epoch 18/40: 100%|██████████| 625/625 [00:02<00:00, 277.85it/s]


Epoch 18: Loss: 106.8382 | Train Acc: 0.9284 | Val Acc: 0.8787


Epoch 19/40: 100%|██████████| 625/625 [00:02<00:00, 277.62it/s]


Epoch 19: Loss: 104.5501 | Train Acc: 0.9298 | Val Acc: 0.8778


Epoch 20/40: 100%|██████████| 625/625 [00:02<00:00, 278.54it/s]


Epoch 20: Loss: 103.0370 | Train Acc: 0.9306 | Val Acc: 0.8767


Epoch 21/40: 100%|██████████| 625/625 [00:02<00:00, 275.66it/s]


Epoch 21: Loss: 98.6830 | Train Acc: 0.9346 | Val Acc: 0.8774


Epoch 22/40: 100%|██████████| 625/625 [00:02<00:00, 289.58it/s]


Epoch 22: Loss: 96.4640 | Train Acc: 0.9361 | Val Acc: 0.8785


Epoch 23/40: 100%|██████████| 625/625 [00:02<00:00, 282.34it/s]


Epoch 23: Loss: 91.4801 | Train Acc: 0.9391 | Val Acc: 0.8798


Epoch 24/40: 100%|██████████| 625/625 [00:01<00:00, 322.25it/s]


Epoch 24: Loss: 89.1991 | Train Acc: 0.9427 | Val Acc: 0.8782


Epoch 25/40: 100%|██████████| 625/625 [00:01<00:00, 314.71it/s]


Epoch 25: Loss: 87.6035 | Train Acc: 0.9430 | Val Acc: 0.8776


Epoch 26/40: 100%|██████████| 625/625 [00:01<00:00, 321.97it/s]


Epoch 26: Loss: 83.1492 | Train Acc: 0.9461 | Val Acc: 0.8743


Epoch 27/40: 100%|██████████| 625/625 [00:02<00:00, 307.62it/s]


Epoch 27: Loss: 81.3364 | Train Acc: 0.9479 | Val Acc: 0.8765


Epoch 28/40: 100%|██████████| 625/625 [00:02<00:00, 271.32it/s]


Epoch 28: Loss: 81.3636 | Train Acc: 0.9471 | Val Acc: 0.8764


Epoch 29/40: 100%|██████████| 625/625 [00:03<00:00, 165.22it/s]


Epoch 29: Loss: 79.0231 | Train Acc: 0.9499 | Val Acc: 0.8742


Epoch 30/40: 100%|██████████| 625/625 [00:02<00:00, 247.28it/s]


Epoch 30: Loss: 76.3578 | Train Acc: 0.9512 | Val Acc: 0.8738


Epoch 31/40: 100%|██████████| 625/625 [00:02<00:00, 286.93it/s]


Epoch 31: Loss: 74.5117 | Train Acc: 0.9519 | Val Acc: 0.8759


Epoch 32/40: 100%|██████████| 625/625 [00:02<00:00, 252.40it/s]


Epoch 32: Loss: 72.3272 | Train Acc: 0.9536 | Val Acc: 0.8765


Epoch 33/40: 100%|██████████| 625/625 [00:02<00:00, 281.59it/s]


Epoch 33: Loss: 71.2203 | Train Acc: 0.9544 | Val Acc: 0.8766


Epoch 34/40: 100%|██████████| 625/625 [00:02<00:00, 296.24it/s]


Epoch 34: Loss: 66.8128 | Train Acc: 0.9567 | Val Acc: 0.8730


Epoch 35/40: 100%|██████████| 625/625 [00:02<00:00, 310.66it/s]


Epoch 35: Loss: 68.2576 | Train Acc: 0.9576 | Val Acc: 0.8782


Epoch 36/40: 100%|██████████| 625/625 [00:01<00:00, 333.97it/s]


Epoch 36: Loss: 64.9374 | Train Acc: 0.9597 | Val Acc: 0.8718


Epoch 37/40: 100%|██████████| 625/625 [00:01<00:00, 343.53it/s]


Epoch 37: Loss: 64.3268 | Train Acc: 0.9583 | Val Acc: 0.8738


Epoch 38/40: 100%|██████████| 625/625 [00:01<00:00, 334.12it/s]


Epoch 38: Loss: 62.5316 | Train Acc: 0.9609 | Val Acc: 0.8759


Epoch 39/40: 100%|██████████| 625/625 [00:01<00:00, 326.50it/s]


Epoch 39: Loss: 61.8753 | Train Acc: 0.9617 | Val Acc: 0.8741


Epoch 40/40: 100%|██████████| 625/625 [00:01<00:00, 318.93it/s]


Epoch 40: Loss: 60.3108 | Train Acc: 0.9623 | Val Acc: 0.8735

Mejor Accuracy en test: 0.8862
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      5154
           1       0.89      0.87      0.88      4846

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Entrenamiento finalizado con éxito!


In [None]:
### FastText model ###
import torch
import umap
import plotly.express as px
import pandas as pd
import re
import joblib
import fasttext
import random
from sklearn.model_selection import train_test_split

# Load dataset and take a sample
df_sample = data.sample(n=50000, random_state=42).reset_index(drop=True)

# Split into train and test
train_data, test_data = train_test_split(df_sample, test_size=0.2, random_state=42)

# Save data in text files for FastText
def save_fasttext_format(df, filename):
    with open(filename, "w", encoding="utf-8") as f:
        for text, label in zip(df["processed_text"], df["Sentiment_target"]):
            f.write(f"__label__{label} {text}\n")

save_fasttext_format(train_data, "train.txt")
save_fasttext_format(test_data, "test.txt")

# Function to evaluate FastText models
def evaluate_fasttext(model, test_file):
    results = model.test(test_file)
    return results[1]  # Accuracy

# Define hyperparameter search space
param_grid = {
    "epoch": [10, 20, 30, 40, 50], 
    "lr": [0.1, 0.3, 0.5, 0.7, 1.0], 
    "wordNgrams": [1, 2, 3],
    "dim": [50, 100, 150, 200, 300],
    "loss": ["softmax", "hs", "ns"]
}

# Random search
num_iterations = 20
best_model = None
best_acc = 0
best_params = {}

for i in range(num_iterations):
    print(f"\n🔹 Iteration {i+1}/{num_iterations}")

    # Select random parameters
    params = {k: random.choice(v) for k, v in param_grid.items()}
    print(f"Testing parameters: {params}")

    # Train model with selected parameters
    model = fasttext.train_supervised(
        input="train.txt",
        epoch=params["epoch"],
        lr=params["lr"],
        wordNgrams=params["wordNgrams"],
        dim=params["dim"],
        loss=params["loss"]
    )

    # Evaluate model
    acc = evaluate_fasttext(model, "test.txt")
    print(f"Obtained accuracy: {acc:.4f}")

    # Save the best model
    if acc > best_acc:
        best_acc = acc
        best_model = model
        best_params = params

# Save the best model found
if best_model:
    model_path = os.path.join("models","fasttex_model.bin")
    best_model.save_model(model_path)
    print("\n✅ Best model saved with parameters:")
    print(best_params)
    print(f"🎯 Best accuracy: {best_acc:.4f}")



🔹 Iteración 1/20
Probando parámetros: {'epoch': 10, 'lr': 0.3, 'wordNgrams': 2, 'dim': 150, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  960804 lr:  0.000000 avg.loss:  0.148208 ETA:   0h 0m 0s 86.2% words/sec/thread: 1012236 lr:  0.041396 avg.loss:  0.167263 ETA:   0h 0m 0s


Accuracy obtenido: 0.8644

🔹 Iteración 2/20
Probando parámetros: {'epoch': 10, 'lr': 0.5, 'wordNgrams': 3, 'dim': 100, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  754427 lr:  0.000000 avg.loss:  0.086380 ETA:   0h 0m 0s


Accuracy obtenido: 0.8739

🔹 Iteración 3/20
Probando parámetros: {'epoch': 20, 'lr': 0.7, 'wordNgrams': 1, 'dim': 200, 'loss': 'softmax'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1627233 lr:  0.000000 avg.loss:  0.297775 ETA:   0h 0m 0s


Accuracy obtenido: 0.8476

🔹 Iteración 4/20
Probando parámetros: {'epoch': 30, 'lr': 0.7, 'wordNgrams': 1, 'dim': 200, 'loss': 'softmax'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1663811 lr: -0.000010 avg.loss:  0.275461 ETA:   0h 0m 0s

Accuracy obtenido: 0.8445

🔹 Iteración 5/20
Probando parámetros: {'epoch': 50, 'lr': 0.3, 'wordNgrams': 2, 'dim': 200, 'loss': 'ns'}


Progress: 100.0% words/sec/thread: 1663708 lr:  0.000000 avg.loss:  0.275461 ETA:   0h 0m 0s
Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  800866 lr: -0.000005 avg.loss:  0.161625 ETA:   0h 0m 0s

Accuracy obtenido: 0.8610

🔹 Iteración 6/20
Probando parámetros: {'epoch': 30, 'lr': 0.5, 'wordNgrams': 3, 'dim': 100, 'loss': 'hs'}


Progress: 100.0% words/sec/thread:  800835 lr:  0.000000 avg.loss:  0.161625 ETA:   0h 0m 0s
Read 1M words1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  813266 lr:  0.000000 avg.loss:  0.033212 ETA:   0h 0m 0s


Accuracy obtenido: 0.8697

🔹 Iteración 7/20
Probando parámetros: {'epoch': 10, 'lr': 1.0, 'wordNgrams': 3, 'dim': 50, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1057056 lr:  0.000000 avg.loss:  0.367673 ETA:   0h 0m 0s
Read 1M words

Accuracy obtenido: 0.8694

🔹 Iteración 8/20
Probando parámetros: {'epoch': 20, 'lr': 0.5, 'wordNgrams': 2, 'dim': 50, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1510445 lr:  0.000000 avg.loss:  0.353554 ETA:   0h 0m 0s


Accuracy obtenido: 0.8606

🔹 Iteración 9/20
Probando parámetros: {'epoch': 30, 'lr': 0.3, 'wordNgrams': 2, 'dim': 200, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  834873 lr:  0.000000 avg.loss:  0.247607 ETA:   0h 0m 0s


Accuracy obtenido: 0.8594

🔹 Iteración 10/20
Probando parámetros: {'epoch': 20, 'lr': 0.3, 'wordNgrams': 3, 'dim': 150, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  571392 lr:  0.000000 avg.loss:  0.051879 ETA:   0h 0m 0s
Read 1M words

Accuracy obtenido: 0.8747

🔹 Iteración 11/20
Probando parámetros: {'epoch': 40, 'lr': 0.5, 'wordNgrams': 2, 'dim': 200, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  797920 lr:  0.000000 avg.loss:  0.215926 ETA:   0h 0m 0s100.0% words/sec/thread:  797939 lr: -0.000009 avg.loss:  0.215926 ETA:   0h 0m 0s


Accuracy obtenido: 0.8589

🔹 Iteración 12/20
Probando parámetros: {'epoch': 20, 'lr': 1.0, 'wordNgrams': 1, 'dim': 200, 'loss': 'softmax'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1510908 lr:  0.000000 avg.loss:  0.305064 ETA:   0h 0m 0s


Accuracy obtenido: 0.8482

🔹 Iteración 13/20
Probando parámetros: {'epoch': 50, 'lr': 0.5, 'wordNgrams': 2, 'dim': 100, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1322146 lr:  0.000000 avg.loss:  0.038426 ETA:   0h 0m 0s
Read 1M words

Accuracy obtenido: 0.8591

🔹 Iteración 14/20
Probando parámetros: {'epoch': 50, 'lr': 0.5, 'wordNgrams': 2, 'dim': 200, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread:  896279 lr:  0.000000 avg.loss:  0.038994 ETA:   0h 0m 0s 70.4% words/sec/thread:  908617 lr:  0.147780 avg.loss:  0.054657 ETA:   0h 0m 1s
Read 1M words
Number of words:  29132
Number of labels: 2


Accuracy obtenido: 0.8584

🔹 Iteración 15/20
Probando parámetros: {'epoch': 10, 'lr': 0.5, 'wordNgrams': 3, 'dim': 50, 'loss': 'hs'}


Progress: 100.0% words/sec/thread: 1057002 lr:  0.000000 avg.loss:  0.090972 ETA:   0h 0m 0s1233035 lr:  0.324696 avg.loss:  0.240322 ETA:   0h 0m 0s


Accuracy obtenido: 0.8726

🔹 Iteración 16/20
Probando parámetros: {'epoch': 50, 'lr': 0.1, 'wordNgrams': 1, 'dim': 150, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1392209 lr:  0.000000 avg.loss:  1.105953 ETA:   0h 0m 0s 23.9% words/sec/thread: 1404997 lr:  0.076091 avg.loss:  1.403685 ETA:   0h 0m 2s


Accuracy obtenido: 0.8402

🔹 Iteración 17/20
Probando parámetros: {'epoch': 30, 'lr': 1.0, 'wordNgrams': 1, 'dim': 200, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1511075 lr:  0.000000 avg.loss:  0.285611 ETA:   0h 0m 0s


Accuracy obtenido: 0.8444

🔹 Iteración 18/20
Probando parámetros: {'epoch': 40, 'lr': 0.5, 'wordNgrams': 1, 'dim': 150, 'loss': 'hs'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1627740 lr:  0.000000 avg.loss:  0.268499 ETA:   0h 0m 0s


Accuracy obtenido: 0.8396

🔹 Iteración 19/20
Probando parámetros: {'epoch': 10, 'lr': 1.0, 'wordNgrams': 1, 'dim': 150, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1321559 lr:  0.000000 avg.loss:  1.459242 ETA:   0h 0m 0s
Read 1M words

Accuracy obtenido: 0.8505

🔹 Iteración 20/20
Probando parámetros: {'epoch': 30, 'lr': 0.5, 'wordNgrams': 2, 'dim': 100, 'loss': 'ns'}


Read 1M words
Number of words:  29132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1220451 lr:  0.000000 avg.loss:  0.244853 ETA:   0h 0m 0s100.0% words/sec/thread: 1220536 lr: -0.000008 avg.loss:  0.244853 ETA:   0h 0m 0s


Accuracy obtenido: 0.8618

✅ Mejor modelo guardado con parámetros:
{'epoch': 20, 'lr': 0.3, 'wordNgrams': 3, 'dim': 150, 'loss': 'hs'}
🎯 Mejor accuracy: 0.8747


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset, ClassLabel
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

df_sample = data.sample(n=100000, random_state=42).reset_index(drop=True)

le = LabelEncoder()
df_sample["labels"] = le.fit_transform(df_sample["Sentiment_target"])

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=df_sample["labels"].unique(),
    y=df_sample["labels"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(le.classes_)
).to(device)

def tokenize_function(examples):
    return tokenizer(
        examples["processed_text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

dataset = Dataset.from_pandas(df_sample[["processed_text", "labels"]])
dataset = dataset.map(tokenize_function, batched=True)

dataset = dataset.cast_column("labels", ClassLabel(num_classes=2))

dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="labels", seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    logging_steps=50,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "macro_f1": f1_score(labels, predictions, average="macro")
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

y_pred = trainer.predict(test_dataset).predictions.argmax(axis=1)
y_true = test_dataset["labels"]

print(f"\nAccuracy Final: {accuracy_score(y_true, y_pred):.4f}")
print(classification_report(y_true, y_pred, target_names=le.classes_))

model_name = "sentiment_transformer_model"
model_path = os.path.join("models", model_name)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 100000/100000 [00:07<00:00, 13420.42 examples/s]
Casting the dataset: 100%|██████████| 100000/100000 [00:00<00:00, 532522.12 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.2326,0.240279,0.90635,0.906114
2,0.2286,0.231773,0.911,0.91095
3,0.1562,0.249118,0.9135,0.913206
4,0.1937,0.24035,0.91555,0.915464
5,0.1458,0.256451,0.91465,0.914502



Accuracy Final: 0.9155


TypeError: object of type 'numpy.int8' has no len()