In [1]:
import pandas as pd
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig, EarlyStoppingCallback, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers.data.processors.utils import InputFeatures
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
import numpy as np
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("processed.csv")
train, test = train_test_split(data, test_size=0.3, random_state=42, stratify=data['emotion'] )
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify=train['emotion'] )

emotion = pd.unique(data['emotion'])
label_map = { v:i for i, v in enumerate(emotion) }

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Ti


In [4]:
class ClassificationDataset(Dataset):
    def __init__(self, text, target, label_map):
        super(ClassificationDataset).__init__()
        """
        Args:
        text (List[str]): List of the training text
        target (List[str]): List of the training labels
        tokenizer_name (str): The tokenizer name (same as model_name).
        max_len (int): Maximum sentence length
        label_map (Dict[str,int]): A dictionary that maps the class labels to integer
        """
        self.text = text
        self.target = target
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.max_len = 64
        self.label_map = label_map
      

    def __len__(self):
        return len(self.text)

    def __getitem__(self,item):
        text = str(self.text[item])
        text = " ".join(text.split())
        
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )      
        return InputFeatures(**inputs,label=self.label_map[self.target[item]])

In [5]:
train_dataset = ClassificationDataset(
    train["text"].to_list(),
    train["emotion"].to_list(),
    label_map
)
val_dataset = ClassificationDataset(
    val["text"].to_list(),
    val["emotion"].to_list(),
    label_map
)
test_dataset = ClassificationDataset(
    test["text"].to_list(),
    test["emotion"].to_list(),
    label_map
)

In [6]:
import torch.nn.functional as F

def softmax_focal_loss_with_regularization(inputs, targets, model, alpha = None, gamma = 2, reduction = "none", reg_type: str = "l2", reg_weight: float = 1e-3):
    # Compute the cross-entropy loss
    ce_loss = F.cross_entropy(inputs, targets, reduction="none")
    p = torch.exp(-ce_loss)
    loss = ce_loss * ((1 - p) ** gamma)

    # Apply the alpha weighting
    if alpha is not None:
        batch_size, num_classes = inputs.size()
        class_mask = torch.zeros((batch_size, num_classes), device=inputs.device)
        class_mask.scatter_(1, targets.unsqueeze(1), 1.)
        alpha_t = torch.sum(alpha.to(inputs.device) * class_mask.to(inputs.device), dim=1)
        loss = alpha_t * loss

    # Compute the regularization term
    reg_loss = 0
    for param in model.parameters():
        if reg_type == "l1":
            reg_loss += torch.sum(torch.abs(param))
        elif reg_type == "l2":
            reg_loss += torch.sum(param ** 2)
        else:
            raise ValueError(f"Invalid value for 'reg_type': {reg_type}")

    # Combine the focal loss and the regularization term
    loss = loss + reg_weight * reg_loss

    # Apply the reduction
    if reduction == "none":
        pass
    elif reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()
    else:
        raise ValueError(f"Invalid Value for arg 'reduction': '{reduction} \\n Supported reduction modes: 'none', 'mean', 'sum'")
    
    return loss

In [7]:
class CustomSequenceClassificationModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(self.distilbert.config.dim, 128)
        self.dropout = nn.Dropout(0.2)
        self.classifier1 = nn.Linear(128, 64)
        self.classifier2 = nn.Linear(64, 5)
        print(self.distilbert.config.dim)
        print(self.distilbert.config.seq_classif_dropout)

    def forward(self, input_ids, attention_mask=None, labels=None):
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = distilbert_output[0]
        out = hidden_state[:, 0]
        out = self.pre_classifier(out)
        out = nn.ReLU()(out)
        out = self.dropout(out)
        out = self.classifier1(out)
        out = nn.ReLU()(out)
        out = self.dropout(out)
        logits = self.classifier2(out)
        #weights = torch.tensor([0.00045, 0.00321, 0.00321, 0.00321, 0.00321]).to(device)
        #loss_fct = nn.CrossEntropyLoss()
        #loss = loss_fct(logits.view(-1, 5), labels.view(-1))
        
        '''
        reg_weight = 1e-3
        l1_reg = 0
        for param in self.parameters():
            l1_reg += torch.sum(torch.abs(param))
        loss = loss + reg_weight * l1_reg
        '''
        alpha = torch.tensor([0.1, 0.4, 0.4, 0.4, 0.4])
        loss = softmax_focal_loss_with_regularization(logits.view(-1, 5), labels.view(-1), self, alpha=alpha)
        

        return (loss.mean(), logits)
    
model = CustomSequenceClassificationModel()
#num_labels = 5
#model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768
0.2


In [8]:
def compute_metrics(p): 
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)
    macro_f1 = f1_score(p.label_ids,preds,average='macro')
    precision = precision_score(p.label_ids,preds,average='macro', zero_division=1)
    recall = recall_score(p.label_ids,preds,average='macro')
    acc = accuracy_score(p.label_ids,preds)
    return {       
      'macro_f1' : macro_f1,
      'Accuracy': acc,
      'Precision': precision,
      'Recall' : recall,
}

In [9]:
training_args = TrainingArguments( 
    output_dir= "./train",    
    adam_epsilon = 1e-8,
    learning_rate = 2e-5,
    per_device_train_batch_size = 16, # up to 64 on 16GB with max len of 128
    per_device_eval_batch_size = 128,
    gradient_accumulation_steps = 2, # use this to scale batch size without needing more memory
    num_train_epochs= 20,
    warmup_ratio = 0,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True, # this allows to automatically get the best model at the end based on whatever metric we want
    metric_for_best_model = 'macro_f1',
    greater_is_better = True,
    seed = 42,
    report_to="none",
    weight_decay=0.0001
  )

In [10]:
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3, # Number of epochs with no improvement
    early_stopping_threshold=0.01, # Minimum improvement in validation loss to be considered an improvement
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[early_stopping_callback]
)

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Precision,Recall
1,No log,138.091721,0.544685,0.780331,0.695613,0.643453
2,151.021300,113.102364,0.741838,0.847426,0.729934,0.760184
3,117.083900,93.797806,0.679491,0.77451,0.617326,0.800596
4,92.951900,78.750313,0.749697,0.850797,0.728458,0.781123
5,75.151000,66.823456,0.713177,0.801777,0.667806,0.795789
6,75.151000,57.391476,0.732338,0.837929,0.694384,0.782415
7,61.811900,49.855453,0.7164,0.823529,0.668208,0.787047
8,51.741800,43.741138,0.706648,0.806066,0.663172,0.779706
9,43.964300,38.758881,0.730939,0.847426,0.698727,0.775579
10,37.938000,34.696865,0.71625,0.823836,0.665388,0.787312


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\caffe2\serialize\inline_container.cc:337] . unexpected pos 211570112 vs 211570000