In [1]:
import os

import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import (
    TrainingArguments, Trainer
)
from transformers import DataCollatorWithPadding
from transformers import set_seed


# Load the pretrained model and tokenizer from the Hugging Face Hub

In [2]:
model_name = "distilbert/distilbert-base-uncased"

# Seed every RNG before the model is built: the classification head is not part
# of the pretrained checkpoint and is freshly initialised, so without a seed
# each kernel restart starts training from different head weights.
SEED = 42
set_seed(SEED)

# num_labels=7 matches the seven MELD emotion classes listed in `class_labels`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)
# use_fast=False selects the pure-Python tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Display the architecture as the cell output.
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Building the tokenized pyarrow Dataset object for the MELD dataset using the helper functions below

In [3]:


# Root folder holding train.csv / dev.csv / test.csv. Set the MELD_ROOT
# environment variable to point at a different copy of the dataset; the
# fallback is the location originally used for this notebook.
dataset_root_path = os.environ.get(
    "MELD_ROOT", "/media/cv/Extreme Pro1/MELD.Raw/MELD.Raw"
)

# Emotion classes in alphabetical order; the position in this list is the
# integer id used as the training label.
class_labels = sorted(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'])
label2id = {label: idx for idx, label in enumerate(class_labels)}
id2label = dict(enumerate(class_labels))


def filename(row):
    """Return the clip identifier ``dia<Dialogue_ID>_utt<Utterance_ID>`` for one row.

    ``row`` is any mapping-like object exposing the ``Dialogue_ID`` and
    ``Utterance_ID`` keys; both values are converted with ``str``.
    """
    dialogue_part = 'dia' + str(row['Dialogue_ID'])
    utterance_part = 'utt' + str(row['Utterance_ID'])
    return dialogue_part + '_' + utterance_part
    
def load_df(path, label_map=None):
    """Read one MELD split CSV and return a frame with text, label and filename.

    Args:
        path: Location of the CSV. It must provide the ``Utterance``,
            ``Emotion``, ``Dialogue_ID`` and ``Utterance_ID`` columns.
        label_map: Optional ``{emotion name: integer id}`` mapping. When
            omitted, the module-level ``label2id`` is used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` (the utterance),
        ``label`` (integer class id) and ``filename``
        (``dia<Dialogue_ID>_utt<Utterance_ID>``), in that order.

    Raises:
        KeyError: If the ``Emotion`` column contains a value that is not a key
            of the label mapping, or a required column is missing.
    """
    mapping = label2id if label_map is None else label_map
    raw = pd.read_csv(path)

    # Validate up front so a bad label is reported by name instead of
    # surfacing as a bare KeyError from deep inside a pandas call.
    emotions = raw['Emotion']
    unknown = [value for value in emotions.unique() if value not in mapping]
    if unknown:
        raise KeyError(
            f"Unknown emotion label(s) in {path}: {sorted(map(str, unknown))}"
        )

    # Column-wise string concatenation builds the clip identifier without a
    # Python-level call per row, and also copes with a header-only CSV.
    clip_name = (
        'dia' + raw['Dialogue_ID'].astype(str)
        + '_utt' + raw['Utterance_ID'].astype(str)
    )

    return pd.DataFrame({
        'text': raw['Utterance'],
        'label': emotions.map(mapping).astype('int64'),
        'filename': clip_name,
    })

def get_dataset(dataset_root_path):
    """Load the three MELD splits as ``datasets.Dataset`` objects.

    Reads ``train.csv``, ``test.csv`` and ``dev.csv`` from
    ``dataset_root_path`` through ``load_df`` and converts each frame with
    ``Dataset.from_pandas``.

    Returns:
        A ``(train, test, dev)`` tuple. Note the order: test comes before dev.
    """
    splits = {}
    for split_name in ('train', 'test', 'dev'):
        csv_path = os.path.join(dataset_root_path, f'{split_name}.csv')
        splits[split_name] = Dataset.from_pandas(load_df(csv_path))
    return (splits['train'], splits['test'], splits['dev'])

def get_tokenized_dataset(dataset_root_path, tokenizer, max_length=None):
    """Load the MELD splits and add tokenizer outputs to each of them.

    Args:
        dataset_root_path: Folder passed on to ``get_dataset``.
        tokenizer: Callable tokenizer applied to batches of the ``text`` column.
        max_length: Optional length every sequence is padded or truncated to.
            ``None`` (the default) lets the tokenizer use its own default
            maximum length, which is the original behaviour. A smaller value
            avoids padding short utterances out to that default maximum.

    Returns:
        A ``(train, test, dev)`` tuple of tokenized datasets, in the same order
        as ``get_dataset``.
    """
    def tokenize(batch):
        # Fixed-length padding keeps every example the same size, so batches
        # can be assembled without a padding collator.
        return tokenizer(
            batch['text'],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return tuple(
        split.map(tokenize, batched=True)
        for split in get_dataset(dataset_root_path)
    )


train, test, dev = get_tokenized_dataset(dataset_root_path, tokenizer)

Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

## Defining the hyperparameters and training arguments for the Hugging Face Trainer

In [5]:


# Tunable hyperparameters for the fine-tuning run.
save_path = './bert-base-finetuned-meld'
batch_size = 16
lr = 5e-5
num_epochs = 11

training_args = TrainingArguments(
    output_dir=save_path,
    # Keep the extra dataset columns (`text`, `filename`) instead of letting
    # the Trainer drop them.
    # NOTE(review): this relies on the collator ignoring string columns —
    # confirm before switching to a padding collator.
    remove_unused_columns=False,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # Must match the key returned by the `compute_metrics` callable handed to
    # the Trainer.
    metric_for_best_model="f1_score",
    # Let the Trainer derive the step count from the epoch count. The previous
    # `max_steps=(len(train) // batch_size) * num_epochs` floored away the last
    # partial batch of every epoch and ignored the number of devices, so
    # training stopped slightly before `num_epochs` full passes.
    num_train_epochs=num_epochs,
    save_total_limit=3,
)

## Creating the Hugging Face Trainer with custom accuracy and weighted F1 metrics for evaluation

In [8]:

def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    ``pred.predictions`` holds the per-class scores and ``pred.label_ids`` the
    reference labels; the predicted class is the argmax over the last axis.
    """
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted),
        'f1_score': f1_score(pred.label_ids, predicted, average='weighted'),
    }


# No data_collator is passed, so the Trainer's default one is used; the inputs
# were already padded to a fixed length during tokenization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=dev,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Run fine-tuning. With the arguments above, the dev split is evaluated and a
# checkpoint is written at the end of every epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,1.2741,1.41067,0.508566,0.400113
2,1.2049,1.22021,0.601443,0.570873
3,1.1313,1.221747,0.596934,0.546669
4,0.9501,1.264851,0.581605,0.534915
5,0.757,1.345549,0.588819,0.560679
6,0.5652,1.395394,0.574391,0.554237
7,0.4375,1.594224,0.561767,0.548553
8,0.4217,1.760334,0.550947,0.54733
9,0.2419,1.879075,0.556357,0.54627
10,0.1723,2.096506,0.546438,0.536662




KeyboardInterrupt: 

In [11]:
# Report loss, accuracy and weighted F1 on the held-out test split.
# NOTE(review): the training cell above ended with KeyboardInterrupt, so the
# `load_best_model_at_end` step presumably never ran and these numbers would
# then describe the last in-memory weights rather than the best dev-F1
# checkpoint — confirm, or reload the best checkpoint before evaluating.
trainer.evaluate(test)



{'eval_loss': 2.4425811767578125,
 'eval_accuracy': 0.5823754789272031,
 'eval_f1_score': 0.5832735709957536}