In [1]:
from transformers import DistilBertTokenizerFast, DistilBertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn.functional as F
import numpy as np
import pandas as pd
from math import ceil
from numpy import load

In [2]:
label_to_id = {"personal_care": 0, "book": 1, "home": 2}
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [10]:
def load_emb_data(file_name):
    # load dict of arrays
    dict_data = load(f"../../data/processed/{file_name}.npz")
    # extract the first array
    return dict_data['arr_0']

# load dict of arrays
# feature_bert_vec = load_emb_data("feature_bert_vec")
# feature_index = load_emb_data("feature_index")
# feature_label = load_emb_data("feature_label")

In [25]:
title_bert_vec = load_emb_data("title_bert_vec")
title_index = load_emb_data("title_index")
title_label = load_emb_data("title_label")

In [26]:
# Assuming you have `inputs` and `targets` lists
inputs = title_bert_vec
targets = title_label

# Convert to PyTorch tensors
inputs = torch.tensor(inputs, dtype=torch.float32)
targets = torch.tensor(targets, dtype=torch.long)

# Create dataset
dataset = TensorDataset(inputs, targets)

# Split dataset into training and testing sets (80% train, 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model
class ClassificationModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(ClassificationModel, self).__init__()
        self.dense1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(0.3)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dense2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # x = x.view(x.size(0), -1)
        x = F.relu(self.dense1(x))
        x = self.dropout(x)
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]
        out = self.dense2(lstm_out)
        return out

# Hyperparameters
input_size = 768
hidden_size = 128
num_classes = 3
num_epochs = 20
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = ClassificationModel(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation loop
model.eval()
all_targets = []
all_predictions = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_targets.extend(targets.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')


Epoch [1/20], Loss: 0.0138
Epoch [2/20], Loss: 0.1809
Epoch [3/20], Loss: 0.1968
Epoch [4/20], Loss: 0.0117
Epoch [5/20], Loss: 0.0326
Epoch [6/20], Loss: 0.0172
Epoch [7/20], Loss: 0.1063
Epoch [8/20], Loss: 0.0163
Epoch [9/20], Loss: 0.0295
Epoch [10/20], Loss: 0.0826
Epoch [11/20], Loss: 0.1208
Epoch [12/20], Loss: 0.0115
Epoch [13/20], Loss: 0.1715
Epoch [14/20], Loss: 0.2943
Epoch [15/20], Loss: 0.1552
Epoch [16/20], Loss: 0.0595
Epoch [17/20], Loss: 0.2522
Epoch [18/20], Loss: 0.0605
Epoch [19/20], Loss: 0.0119
Epoch [20/20], Loss: 0.0108
Accuracy: 0.9790
F1 Score: 0.9790


In [27]:
from sklearn.metrics import confusion_matrix


confusion_matrix(all_targets, all_predictions)

array([[193391,    808,   5181],
       [   319, 199062,    834],
       [  4070,   1402, 194884]], dtype=int64)

In [28]:
from sklearn.metrics import classification_report


print(classification_report(all_targets, all_predictions))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97    199380
           1       0.99      0.99      0.99    200215
           2       0.97      0.97      0.97    200356

    accuracy                           0.98    599951
   macro avg       0.98      0.98      0.98    599951
weighted avg       0.98      0.98      0.98    599951



In [29]:
model_path = "../../src/models/nn_bert_feature.pth"

# Save the model's state dictionary
torch.save(model.state_dict(), model_path)