In [94]:
import pandas as pd
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers.data.processors.utils import InputFeatures
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
import numpy as np

In [95]:
# Load the preprocessed corpus; the cells below read its "text" and "emotion" columns.
data = pd.read_csv("processed.csv")

# Hold out 30% of the rows for testing, then carve 20% of the remainder off as
# a validation set (roughly 56% train / 14% val / 30% test overall).
# Both splits are stratified on the label and use a fixed seed.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data["emotion"],
)
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train["emotion"],
)

# Map each distinct emotion label to an integer id, in order of first appearance.
emotion = data["emotion"].unique()
label_map = {name: idx for idx, name in enumerate(emotion)}

In [96]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3070


In [97]:
class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per access for classification.

    Each item is returned as a ``transformers`` ``InputFeatures`` built from
    the tokenizer's encoded fields for that text plus its integer class label.
    """

    def __init__(self, text, target, label_map,
                 tokenizer_name="distilbert-base-uncased", max_len=64):
        """
        Args:
            text (List[str]): List of the input texts.
            target (List[str]): List of the class labels, aligned with ``text``.
            label_map (Dict[str, int]): A dictionary that maps the class labels
                to integers.
            tokenizer_name (str): Name passed to
                ``DistilBertTokenizer.from_pretrained`` (same as the model name).
            max_len (int): Length every example is padded/truncated to.

        Raises:
            ValueError: If ``text`` and ``target`` have different lengths.
        """
        # The previous ``super(ClassificationDataset).__init__()`` built an
        # unbound super object and never reached the base-class initializer.
        super().__init__()

        # Fail early on misaligned inputs, before the tokenizer is loaded.
        if len(text) != len(target):
            raise ValueError(
                "text and target must have the same length, got "
                f"{len(text)} and {len(target)}"
            )

        self.text = text
        self.target = target
        self.tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the ``item``-th text and pair it with its integer label."""
        text = str(self.text[item])
        # Collapse any run of whitespace (including newlines) to a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return InputFeatures(**inputs, label=self.label_map[self.target[item]])

In [98]:
# Wrap each split in a ClassificationDataset; all three share the same label map.
train_dataset, val_dataset, test_dataset = (
    ClassificationDataset(
        split["text"].to_list(),
        split["emotion"].to_list(),
        label_map,
    )
    for split in (train, val, test)
)

In [99]:
# Load pretrained DistilBERT with a classification head sized to the number
# of distinct emotion labels.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    return_dict=True,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [100]:
def compute_metrics(p):
    """Compute macro-averaged classification metrics for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true integer labels).

    Returns:
        dict: ``macro_f1``, ``Accuracy``, ``Precision`` and ``Recall``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    assert len(y_pred) == len(y_true)

    macro = {'average': 'macro'}
    f1 = f1_score(y_true, y_pred, **macro)
    prec = precision_score(y_true, y_pred, **macro)
    rec = recall_score(y_true, y_pred, **macro)
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {}
    metrics['macro_f1'] = f1
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = prec
    metrics['Recall'] = rec
    return metrics

In [101]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="./train",
    seed=42,
    # Optimisation schedule.
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0,
    num_train_epochs=6,
    # Batching: 32 examples x 2 accumulation steps = 64 examples per optimizer
    # step on each device.
    per_device_train_batch_size=32,  # up to 64 on 16GB with max len of 128
    gradient_accumulation_steps=2,  # use this to scale batch size without needing more memory
    per_device_eval_batch_size=128,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest 'macro_f1' (the key returned by compute_metrics).
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
)

In [102]:
# Bind the model, the training configuration, the train/validation datasets
# and the metric function into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [10]:
# Fine-tune the model. Evaluation runs once per epoch and, because
# load_best_model_at_end=True, the checkpoint with the best 'macro_f1' is
# restored when training finishes. Left as the cell's last expression so the
# returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Precision,Recall
1,No log,0.401548,0.734436,0.875306,0.796456,0.689396
2,No log,0.366139,0.765653,0.877451,0.808697,0.730388
3,0.448000,0.371782,0.774365,0.875919,0.773293,0.776125
4,0.448000,0.40054,0.768074,0.870711,0.762385,0.777989
5,0.185400,0.411523,0.762159,0.868566,0.758742,0.766629
6,0.185400,0.423151,0.764689,0.870098,0.762846,0.768018


TrainOutput(global_step=1224, training_loss=0.2827819141687131, metrics={'train_runtime': 139.14, 'train_samples_per_second': 562.915, 'train_steps_per_second': 8.797, 'total_flos': 1296991452833280.0, 'train_loss': 0.2827819141687131, 'epoch': 6.0})

In [None]:
# Data Augmentation section

In [110]:
from textattack.transformations import WordSwapEmbedding, WordSwapRandomCharacterDeletion, WordSwapQWERTY, CompositeTransformation, WordSwapContract, WordSwapExtend
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.augmentation import Augmenter

In [125]:
# Set up transformations to apply to the text
# WordSwapQWERTY swaps letters with their QWERTY-keyboard neighbors
# WordSwapContract contracts word pairs (e.g. "I am" -> "I'm")
# WordSwapExtend expands contractions (e.g. "I'm" -> "I am")
# WordSwapEmbedding replaces words with neighbors in word-embedding space
# (WordSwapRandomCharacterDeletion is imported but not part of this composite.)
transformations = CompositeTransformation([WordSwapQWERTY(), WordSwapContract(), WordSwapExtend(), WordSwapEmbedding()])

# Set up constraints to avoid modifying the same word multiple times and to prevent the replacement of stopwords
constraints = [RepeatModification(), StopwordModification()]
# Create an augmenter instance with the chosen transformation and constraints:
# perturb about 20% of the words and produce one augmented text per input.
augmenter = Augmenter(transformation=transformations, constraints=constraints, pct_words_to_swap=0.2, transformations_per_example=1)


# additional parameters
# NOTE(review): these are set as attributes after construction; confirm the
# installed textattack version reads these exact attribute names.
augmenter.enable_advanced_metrics = False # Set to True to get more metrics
augmenter.fast_augment = True
# Set to True to get more augmented examples
augmenter.high_yield = False

# Example: augment() returns a list of augmented strings for one input string.
s = "This university is the best institution known to mankind, I don't want to graduate."
s_res = augmenter.augment(s)
print(s_res)


["This university is the best entity known to mankind, I don't want to superior."]


In [108]:
text = train_dataset.text

# Build one augmented text per training sample. When augmentation fails or
# yields no candidate, keep the original text so that texts stay aligned
# one-to-one with train_dataset.target.
augmented_text_list = []
num_fallbacks = 0
for original in text:
    try:
        augmented_text = augmenter.augment(original)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt can still stop this long-running loop.
        augmented_text = []

    if augmented_text:
        augmented_text_list.append(augmented_text[0])
    else:
        # Covers both a raised error and an empty result list, which would
        # otherwise fail on the [0] lookup.
        num_fallbacks += 1
        augmented_text_list.append(original)

# Surface how often the fallback was used instead of hiding it.
print(f"Augmentation fell back to the original text for {num_fallbacks} of {len(text)} samples.")

assert len(augmented_text_list) == len(train_dataset.target)

augmented_dataset = ClassificationDataset(
    augmented_text_list,
    train_dataset.target,
    label_map
)


    