In [1]:
from transformers import DistilBertTokenizerFast, DistilBertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn.functional as F
import numpy as np
import pandas as pd 
from math import ceil
from numpy import load

In [2]:
def load_emb_data(file_name):
    # load dict of arrays
    dict_data = load(f"../../data/processed/{file_name}.npz")
    # extract the first array
    return dict_data['arr_0']

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
feature_vec = load_emb_data("combine_feature")
title_vec = load_emb_data("combine_title")
index = load_emb_data("combine_index")
label = load_emb_data("combine_label")

In [48]:
class ParallelLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, lstm_hidden_size, num_classes, num_lstm_layers=3, dropout_prob=0.3):
        super(ParallelLSTMClassifier, self).__init__()
        
        # Define the layers for the first path
        self.path1_fc1 = nn.Linear(input_size, hidden_size)
        self.path1_dropout1 = nn.Dropout(dropout_prob)
        self.path1_fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
        self.path1_dropout2 = nn.Dropout(dropout_prob)
        
        # Define the layers for the second path
        self.path2_fc1 = nn.Linear(input_size, hidden_size)
        self.path2_dropout1 = nn.Dropout(dropout_prob)
        self.path2_fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
        self.path2_dropout2 = nn.Dropout(dropout_prob)
        
        # Define the LSTM layer
        # self.lstm_layers = nn.ModuleList()
        # self.dropout_layers = nn.ModuleList()
        # for _ in range(num_lstm_layers):
        #     self.lstm_layers.append(nn.LSTM(hidden_size * 2, hidden_size, batch_first=True))
        #     self.dropout_layers.append(nn.Dropout(0.3))
        
        # self.dense2 = nn.Linear(hidden_size, num_classes)

        # self.lstm = nn.LSTM(hidden_size * 2, lstm_hidden_size, num_layers=3, dropout=0.3, batch_first=True)
        self.lstm = nn.LSTM(hidden_size, lstm_hidden_size, num_layers=3, dropout=0.3, batch_first=True)
        
        # Define the final dense layer
        self.final_fc = nn.Linear(lstm_hidden_size, num_classes)
        
    def forward(self, x):
        # Split the input into two subarrays
        x1, x2 = x[:, 0, :], x[:, 1, :]
        
        # First path
        x1 = self.path1_fc1(x1)
        x1 = self.path1_dropout1(x1)
        x1 = torch.relu(x1)
        x1 = self.path1_fc2(x1)
        x1 = self.path1_dropout2(x1)
        x1 = torch.relu(x1)
        
        # Second path
        x2 = self.path2_fc1(x2)
        x2 = self.path2_dropout1(x2)
        x2 = torch.relu(x2)
        x2 = self.path2_fc2(x2)
        x2 = self.path2_dropout2(x2)
        x2 = torch.relu(x2)
        
        # Combine the outputs of the two paths
        combined = torch.cat((x1, x2), dim=1)
        combined = combined.unsqueeze(1)  # Add sequence length dimension

        lstm_out, _ = self.lstm(combined)
        lstm_out = lstm_out[:, -1, :]  # Get the output of the last LSTM cell
        
        # Final dense layer
        out = self.final_fc(lstm_out)

        # # Combine the outputs of the two paths
        # x = torch.cat((x1, x2), dim=1)
        # x = x.unsqueeze(1)  # Add sequence length dimension
        
        # # LSTM layer
        # for lstm, dropout in zip(self.lstm_layers, self.dropout_layers):
        #     x, _ = lstm(x)
        #     x = dropout(x)
        # x = x[:, -1, :]  # Get the output of the last LSTM cell
        # # out = self.final_fc(x)

        return out

In [49]:
# Create dataset
batch_size = 32
inputs = torch.tensor(np.stack((title_vec, feature_vec), axis=1), dtype=torch.float32)
targets = torch.tensor(label, dtype=torch.long)

dataset = TensorDataset(inputs, targets)

# Split dataset into training and testing sets (80% train, 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [50]:
# Hyperparameters
input_size = 768
hidden_size = 512
lstm_hidden_size = 256
num_classes = 3
dropout_prob = 0.3

# Initialize the model
model = ParallelLSTMClassifier(input_size, hidden_size, lstm_hidden_size, 
                               num_classes, dropout_prob).to(device)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 75
# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation loop
model.eval()
all_targets = []
all_predictions = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_targets.extend(targets.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Epoch [1/75], Loss: 0.3513
Epoch [2/75], Loss: 0.0557
Epoch [3/75], Loss: 0.2428
Epoch [4/75], Loss: 0.5779
Epoch [5/75], Loss: 0.3174
Epoch [6/75], Loss: 0.1347
Epoch [7/75], Loss: 0.2390
Epoch [8/75], Loss: 0.4163
Epoch [9/75], Loss: 0.2016
Epoch [10/75], Loss: 0.1599
Epoch [11/75], Loss: 0.2639
Epoch [12/75], Loss: 0.6175
Epoch [13/75], Loss: 0.0996
Epoch [14/75], Loss: 0.1604
Epoch [15/75], Loss: 0.1087
Epoch [16/75], Loss: 0.3207
Epoch [17/75], Loss: 0.1346
Epoch [18/75], Loss: 0.1590
Epoch [19/75], Loss: 0.0266
Epoch [20/75], Loss: 0.4836
Epoch [21/75], Loss: 0.2472
Epoch [22/75], Loss: 0.1887
Epoch [23/75], Loss: 0.1093
Epoch [24/75], Loss: 0.2671
Epoch [25/75], Loss: 0.0468
Epoch [26/75], Loss: 0.1075
Epoch [27/75], Loss: 0.1826
Epoch [28/75], Loss: 0.2489
Epoch [29/75], Loss: 0.0337
Epoch [30/75], Loss: 0.0512
Epoch [31/75], Loss: 0.1299
Epoch [32/75], Loss: 0.3195
Epoch [33/75], Loss: 0.0520
Epoch [34/75], Loss: 0.1287
Epoch [35/75], Loss: 0.0357
Epoch [36/75], Loss: 0.0779
E

In [52]:
from sklearn.metrics import confusion_matrix


confusion_matrix(all_targets, all_predictions)

array([[2177,   47,  283],
       [  23, 2377,   64],
       [ 402,  101, 2026]], dtype=int64)

In [53]:
from sklearn.metrics import classification_report


print(classification_report(all_targets, all_predictions))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      2507
           1       0.94      0.96      0.95      2464
           2       0.85      0.80      0.83      2529

    accuracy                           0.88      7500
   macro avg       0.88      0.88      0.88      7500
weighted avg       0.88      0.88      0.88      7500



In [29]:
# model_path = "../../src/models/nn_bert_feature.pth"

# # Save the model's state dictionary
# torch.save(model.state_dict(), model_path)

# Load trained model

In [6]:
# Load the model state dictionary
model = ClassificationModel(input_size, hidden_size, num_classes)
model.load_state_dict(torch.load("../../src/models/nn_bert_title.pth"))
model.to(device)

ClassificationModel(
  (dense1): Linear(in_features=768, out_features=128, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(128, 128, batch_first=True)
  (dense2): Linear(in_features=128, out_features=3, bias=True)
)

In [7]:
inputs = title_bert_vec
targets = title_label

# Convert to PyTorch tensors
inputs = torch.tensor(inputs, dtype=torch.float32)
targets = torch.tensor(targets, dtype=torch.long)

# Create dataset
dataset = TensorDataset(inputs, targets)

# Create data loaders
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Evaluation loop
model.eval()
all_targets = []
all_predictions = []

with torch.no_grad():
    for batch_inputs, batch_targets in data_loader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
        outputs = model(batch_inputs)
        _, predicted = torch.max(outputs, 1)
        all_targets.extend(batch_targets.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(all_targets, all_predictions)
f1 = f1_score(all_targets, all_predictions, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.9809
F1 Score: 0.9809
