In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local data_loader / train / eval modules imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [17]:
import torch
import kagglehub
import pandas as pd
from data_loader import CustomDataLoader
from train import TrainingLoop
from eval import Evaluate

## Download Dataset

In [18]:
# Fetch the Kaggle dataset through kagglehub and keep the returned local directory;
# the CSV-reading cell builds its file paths from `path`.
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")
print(f"Path to dataset files: {path}")

Path to dataset files: /home/adityadev/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


## Read CSV files into DataFrames

In [19]:
# header=None: the first line of each file is read as data, so the columns get
# integer labels (0..3 in the preview below) instead of names.
csv_options = {"header": None}
train_df = pd.read_csv(f"{path}/twitter_training.csv", **csv_options)
val_df = pd.read_csv(f"{path}/twitter_validation.csv", **csv_options)

In [20]:
# Preview the first rows. In the rendered output, column 2 holds the sentiment
# label and column 3 the tweet text; these are the indices used for the loader below.
train_df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


## Load data into CustomDataLoader

In [21]:
# Integer column labels in the header-less frames: 3 -> input text, 2 -> label.
text_col, label_col = 3, 2
batch_size = 8  # batch size shared by the training and validation loaders

# Hand both frames, the column indices and the batch size to the project's loader wrapper.
custom_loader = CustomDataLoader(
    train_df,
    val_df,
    text_col,
    label_col,
    batch_size,
)

# Both loaders are requested with shuffling enabled.
train_loader = custom_loader.get_train_loader(shuffle=True)
val_loader = custom_loader.get_val_loader(shuffle=True)

## Define Model Params

In [22]:
# Model hyperparameters. Vocabulary size, block size and the number of output
# classes are read from the training dataset rather than hard-coded.
train_dataset = train_loader.dataset
model_params = dict(
    vocab_size=train_dataset.vocab_size,
    num_embeddings=64,
    block_size=train_dataset.block_size,
    num_heads=4,
    num_layers=4,
    output_classes=len(train_dataset.labels_lookup_dict),  # one class per known label
    dropout=0.2,
    # Use the GPU when torch reports one, otherwise fall back to the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

## Define Training Params

In [23]:
# Training hyperparameters, passed to TrainingLoop together with model_params.
# The recorded training log reports losses at epoch 0 and epoch 10, which matches
# eval_interval=10. eval_iters is presumably the number of batches used per loss
# estimate -- TODO confirm in train.py.
train_params = dict(
    num_epochs=100,
    eval_interval=10,
    eval_iters=10,
    learning_rate=0.01,
)

## Training Loop

In [24]:
# Seed torch's global RNG before training so repeated runs are more repeatable.
# NOTE(review): this only covers randomness drawn from torch's global generator;
# confirm how CustomDataLoader shuffles and whether train.py seeds anything itself.
SEED = 42
torch.manual_seed(SEED)

# Directory that receives the checkpoints written during training.
save_models_path = "./models_v1"

# Resume from the best checkpoint inside the same directory this run writes to,
# instead of a hard-coded absolute path under one user's home directory.
# NOTE(review): on a fresh checkout this file does not exist yet; confirm how
# TrainingLoop.train behaves when the resume checkpoint is missing.
resume_path = f"{save_models_path}/best_model.pth"

TrainingLoop(model_params, train_params).train(
    train_loader,
    val_loader,
    save_models_path,
    resume_path=resume_path,
)

  checkpoint = torch.load(load_path, map_location=self.device)  # Ensure checkpoint is loaded to the correct device
[32m2024-11-23 20:34:58.091[0m | [1mINFO    [0m | [36mtrain[0m:[36mload_checkpoint[0m:[36m91[0m - [1mCheckpoint loaded from: /home/adityadev/GPTDecoder/models_v1/best_model.pth, resuming from epoch 0[0m
[32m2024-11-23 20:34:58.801[0m | [1mINFO    [0m | [36mtrain[0m:[36mtrain[0m:[36m130[0m - [1mFor epoch 0: Train loss-> 1.3554059267044067 | Val loss-> 1.3709877729415894[0m
[32m2024-11-23 20:34:59.095[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_checkpoint[0m:[36m72[0m - [1mCheckpoint saved: ./models_v1/checkpoint_epoch_0.pth[0m
[32m2024-11-23 20:35:00.911[0m | [1mINFO    [0m | [36mtrain[0m:[36mtrain[0m:[36m130[0m - [1mFor epoch 10: Train loss-> 1.420730435848236 | Val loss-> 1.3868310570716857[0m
[32m2024-11-23 20:35:01.212[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_checkpoint[0m:[36m72[0m - [1mCheckpoint saved: ./

## Evaluate the best model (loaded from checkpoint)

In [25]:
# Rebuild the validation loader without shuffling so evaluation walks the
# validation set in a fixed order.
val_loader = custom_loader.get_val_loader(shuffle=False)

# Relative path matching the "./models_v1" directory used by the training cell,
# replacing a hard-coded absolute path that only exists on one machine.
# NOTE(review): assumes training stores best_model.pth in that directory -- confirm in train.py.
best_model_path = "./models_v1/best_model.pth"

# Lookup taken from the training dataset; presumably maps class ids back to
# label names for the report -- confirm in data_loader.py.
label_mapping = train_loader.dataset.reverse_labels_lookup_dict

report = Evaluate(model_params, best_model_path).evaluate(val_loader, label_mapping)

  checkpoint = torch.load(self.best_model_path, map_location=self.device)
[32m2024-11-23 20:35:51.928[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /home/adityadev/GPTDecoder/models_v1/best_model.pth[0m
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Print the evaluation report returned by Evaluate.evaluate; the output below is a
# per-class precision / recall / F1 table plus overall accuracy.
print(report)

              precision    recall  f1-score   support

  Irrelevant     0.0000    0.0000    0.0000       172
    Negative     0.0000    0.0000    0.0000       266
     Neutral     0.3067    0.2561    0.2792       285
    Positive     0.2754    0.7545    0.4035       277

    accuracy                         0.2820      1000
   macro avg     0.1455    0.2527    0.1707      1000
weighted avg     0.1637    0.2820    0.1913      1000

