In [720]:
# Record the interpreter, user, host and working directory this kernel runs in.
# NOTE(review): the captured output of these commands exposes a user name, host
# name and home-directory layout — consider clearing it before sharing the notebook.
!which python
!whoami 
!hostname
!pwd

/home/farshed.abdukhakimov/miniconda3/envs/main/bin/python
farshed.abdukhakimov
srv-01
/home/farshed.abdukhakimov/projects/twin-polyak/experiments


In [721]:
%load_ext autoreload
%autoreload 2

import os
import datetime
from collections import defaultdict
import random

import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler

import torchvision
from torchvision.transforms import v2
import torchvision.models as pt_models

import numpy as np
import pandas as pd
import sklearn
import scipy 
import matplotlib.pyplot as plt

import sps
import sls
from pt_methods import *
import models

# import utils

import lightning as L
import torchmetrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from dotenv import load_dotenv
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [722]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [723]:
# Training split of the Emotions corpus, located under $DATASETS_DIR.
# The environment lookup uses single quotes on purpose: reusing the enclosing
# double quote inside an f-string replacement field is only accepted by
# Python 3.12 and newer, and is a SyntaxError on earlier interpreters.
path = f"{os.getenv('DATASETS_DIR')}/Emotions/train.txt"

# One "<text>;<label>" record per line, no header row.
data = pd.read_csv(path, sep=';', header=None, names=['text', 'label'], engine='python')

# Distinct label strings, in order of first appearance in the file.
labels = data['label'].unique()

# Forward and inverse lookup tables between label strings and integer class ids.
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(label2id)

label2id, id2label, num_labels

({'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5},
 {0: 'sadness', 1: 'anger', 2: 'love', 3: 'surprise', 4: 'fear', 5: 'joy'},
 6)

In [724]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Encode one sample sentence, padded/truncated to a fixed length of 128 ids,
# and return PyTorch tensors together with the attention mask.
text = "I love Machine Learning! Tokenization is awesome."
encoding_options = dict(
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_attention_mask=True,
    return_tensors='pt',
)
encoded_text = tokenizer(text, **encoding_options)
print(encoded_text)

# Drop the leading batch axis, then map the ids back to their token strings.
sample_ids = encoded_text.input_ids.squeeze(0)
tokens = tokenizer.convert_ids_to_tokens(sample_ids)
print(tokens)

{'input_ids': tensor([[  101,  1045,  2293,  3698,  4083,   999, 19204,  3989,  2003, 12476,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [733]:
class EmotionsDataset(Dataset):
    """Text/label pairs of one Emotions split, tokenized on access.

    The split file holds one ``<text>;<label>`` record per line and no header.

    Args:
        path: Location of the split file.
        label2id: Optional mapping from label string to class id. When omitted,
            ids are assigned to the *sorted* distinct labels found in ``path``.
            Sorting makes the mapping independent of the order in which labels
            first appear, so separately constructed splits that contain the
            same label set agree on every id (deriving ids from first
            appearance gives each split its own, possibly different, mapping).

    Attributes set by ``__init__``:
        data: DataFrame with columns ``text`` (str) and ``label`` (class id).
        label2id / id2label: The mapping in use and its inverse; use these to
            decode predictions made on this dataset.
        tokenizer: The ``distilbert-base-uncased`` tokenizer.

    Raises:
        ValueError: If a row has no label, or a label is absent from the mapping.
    """

    # Fixed sequence length every sample is padded/truncated to.
    MAX_LENGTH = 128
    TOKENIZER_NAME = "distilbert-base-uncased"

    def __init__(self, path: str, label2id=None):
        super().__init__()
        self.data = pd.read_csv(path, sep=';', header=None, names=['text', 'label'], engine='python')

        # A row without a ';' separator is parsed with a missing label.
        if self.data['label'].isna().any():
            raise ValueError(f"{path}: found rows without a ';'-separated label")

        self.label2id = self._build_label_map(self.data['label'], label2id)
        self.id2label = {idx: label for label, idx in self.label2id.items()}

        # Replace the label strings by their integer class ids.
        self.data['label'] = self._encode_labels(self.data['label'], self.label2id)

        self.tokenizer = AutoTokenizer.from_pretrained(self.TOKENIZER_NAME)

    @staticmethod
    def _build_label_map(labels, label2id=None):
        """Return a copy of ``label2id`` if given, else ids over the sorted distinct labels."""
        if label2id is not None:
            return dict(label2id)
        return {label: idx for idx, label in enumerate(sorted(set(labels)))}

    @staticmethod
    def _encode_labels(labels, label2id):
        """Translate label strings to class ids, rejecting labels missing from the mapping."""
        unknown = sorted({str(label) for label in labels if label not in label2id})
        if unknown:
            raise ValueError(f"labels missing from label2id: {unknown}")
        return [label2id[label] for label in labels]

    def __len__(self):
        """Number of records in the split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one record."""
        # Scalar access avoids materialising a whole row Series per sample.
        text = self.data['text'].iat[index]
        label = torch.tensor(int(self.data['label'].iat[index]))

        tokens = self.tokenizer(text,
                                add_special_tokens=True,
                                padding='max_length',
                                truncation=True,
                                max_length=self.MAX_LENGTH,
                                return_attention_mask=True,
                                return_tensors='pt')

        # The tokenizer output carries a leading axis of size 1; remove it so
        # the DataLoader can stack samples into a batch itself.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': label,
        }
    
    
class EmotionsDataModule(L.LightningDataModule):
    """Lightning data module serving the train/val/test splits of the Emotions corpus.

    Args:
        batch_size: Batch size used by every dataloader.
        data_dir: Directory holding ``train.txt``, ``val.txt`` and ``test.txt``.
            Defaults to ``$DATASETS_DIR/Emotions``.
        num_workers: Worker processes per dataloader.

    Raises:
        FileNotFoundError: If ``data_dir`` is omitted and ``DATASETS_DIR`` is
            not set (otherwise the path would silently become ``None/Emotions``).
    """

    def __init__(self, batch_size: int = 32, data_dir=None, num_workers: int = 2):
        super().__init__()

        if data_dir is None:
            datasets_root = os.getenv('DATASETS_DIR')
            if datasets_root is None:
                raise FileNotFoundError(
                    "DATASETS_DIR is not set; export it or pass data_dir explicitly"
                )
            data_dir = f"{datasets_root}/Emotions"

        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage: str):
        """Build the datasets needed for ``stage``.

        Handled stages: ``'fit'`` (train + val), ``'validate'`` (val),
        ``'test'`` and ``'predict'`` (both use the test split).
        """
        if stage == 'fit':
            self.train_dataset = self._load_split('train')
        # 'validate' is handled too, so validating does not depend on an
        # earlier setup('fit') call having created val_dataset.
        if stage in ('fit', 'validate'):
            self.val_dataset = self._load_split('val')
        if stage in ('test', 'predict'):
            self.test_dataset = self._load_split('test')

    def _load_split(self, split: str):
        """Load ``<data_dir>/<split>.txt`` as an ``EmotionsDataset``."""
        return EmotionsDataset(f"{self.data_dir}/{split}.txt")

    def _make_loader(self, dataset, shuffle: bool):
        """Wrap ``dataset`` in a DataLoader with this module's batch size and workers."""
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation split."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Sequential loader over the test split."""
        return self._make_loader(self.test_dataset, shuffle=False)

    def predict_dataloader(self):
        """Sequential loader over the test split, matching what setup('predict') prepares."""
        return self._make_loader(self.test_dataset, shuffle=False)


In [735]:
from torchmetrics.classification import MulticlassAccuracy

class EmotionsClassifier(L.LightningModule):
    """DistilBERT sequence classifier whose base encoder is frozen.

    Args:
        num_labels: Number of target classes.
        config: Hyper-parameters. Keys read by this class:
            ``'lr'`` - AdamW learning rate;
            ``'warmup'`` and ``'train_size'`` - their product is the number of
            optimizer steps over which the learning rate is warmed up linearly.
    """

    def __init__(self, num_labels, config: dict):
        super().__init__()
        self.save_hyperparameters()
        self.config = config

        self.model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

        # Freeze the base encoder, leaving trainable only parameters whose name
        # contains 'pooler'. Parameters outside base_model are not touched.
        # NOTE(review): if the DistilBERT base model exposes no 'pooler'
        # parameters, the whole encoder stays frozen — confirm this is intended.
        for name, param in self.model.base_model.named_parameters():
            param.requires_grad = 'pooler' in name

        self.valid_acc = MulticlassAccuracy(num_classes=num_labels)
        self.test_acc = self.valid_acc.clone()

    def training_step(self, batch, batch_idx):
        """Return the model loss for one batch; log it and the current learning rate."""
        output = self.model(**batch)
        loss = output.loss
        self.log("train_loss", loss, on_epoch=True, prog_bar=True)
        self.log("step_size", self.optimizers().param_groups[0]["lr"], on_step=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Update and log validation accuracy and loss for one batch."""
        output = self.model(**batch)
        self.valid_acc(output.logits, batch["labels"])
        self.log('valid_acc', self.valid_acc, on_epoch=True, prog_bar=True)
        self.log('valid_loss', output.loss, on_epoch=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update and log test accuracy for one batch."""
        logits = self.model(**batch).logits
        self.test_acc(logits, batch["labels"])
        self.log('test_acc', self.test_acc, on_epoch=True)

    def configure_optimizers(self):
        """Create AdamW with a linear learning-rate warm-up that advances every step.

        The warm-up length is expressed in optimizer steps, so the scheduler is
        registered with ``interval='step'``. Returning it in the plain list
        form makes Lightning advance it once per epoch, which stretches the
        warm-up over that many *epochs* and keeps the learning rate near zero.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.config['lr'])

        # LinearLR expects a whole number of iterations; at least one is needed
        # for the rate to ever leave its tiny starting value.
        warmup_steps = max(1, int(self.config['warmup'] * self.config['train_size']))
        scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer,
            start_factor=1e-8,
            total_iters=warmup_steps,
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            },
        }
        


In [117]:
import wandb

# Authenticate with Weights & Biases and stop here if the login did not succeed.
wandb_login_ok = wandb.login()
assert wandb_login_ok == True

In [None]:
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.loggers import WandbLogger

# Seed every RNG (dataloader workers included) before anything stochastic is built.
seed_everything(0, workers=True)

# Created here but not handed to the trainer below, which is given logger=None.
wandb_logger = WandbLogger(project="TwinPolyak")

# The training split must exist before its dataloader length can be read.
data_module = EmotionsDataModule(batch_size=256)
data_module.setup('fit')

# 'train_size' is the number of batches in one pass over the training loader.
config = dict(
    batch_size=data_module.batch_size,
    lr=1e-5,
    warmup=0.1,
    train_size=len(data_module.train_dataloader()),
)

model = EmotionsClassifier(num_labels=num_labels, config=config)

trainer = Trainer(max_epochs=100, logger=None)

# Baseline validation pass, then training, then evaluation on the test split.
trainer.validate(model=model, datamodule=data_module)
trainer.fit(model=model, datamodule=data_module)
trainer.test(model=model, datamodule=data_module)

Seed set to 0


In [32]:
# Close the active Weights & Biases run; the run summary is printed as this cell's output.
wandb.finish()

0,1
epoch,▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
train_loss_epoch,▃█▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▄▅▂▄▄▃▃▃▄▄▆▅▆▅▃▄▂▂▄▅▂▄▂▂▇▂▄▄▄▄▅▄▄▃▄▅▁▇
trainer/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇████
valid_acc,█▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,13.0
train_loss_epoch,1.57687
train_loss_step,1.4944
trainer/global_step,6899.0
valid_acc,0.16667
