In [2]:
import torch 
# Report whether PyTorch can see a CUDA device (prints True or False).
print(torch.cuda.is_available())

True


In [3]:
from datasets import load_dataset
from warnings import filterwarnings 

# NOTE(review): this ignores every warning category for the rest of the
# session, so deprecation notices from the libraries used below are hidden too.
filterwarnings("ignore")

# Load the "emotion" dataset; displaying it lists the splits and their columns.
emotion_dataset = load_dataset("emotion")
emotion_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
# Inspect the first example of the training split.
emotion_dataset["train"][0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [5]:
# Convert the training split to a pandas DataFrame for exploration.
emotion_df = emotion_dataset["train"].to_pandas()
emotion_df

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [6]:
# Column schema of the training split; the "label" entry is used below to
# translate integer ids into class names.
features = emotion_dataset["train"].features 
features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [7]:
# Look up the class name for label id 0.
features["label"].int2str(0)

'sadness'

In [8]:
# Build the id -> name mapping from the dataset's own label names instead of a
# hard-coded class count, so it stays correct if the label set ever changes.
id2label = dict(enumerate(features["label"].names))
id2label

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [9]:
# Reverse lookup: class name -> integer id.
label2id = dict(zip(id2label.values(), id2label.keys()))
label2id

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [10]:
# Share of training examples per label id, ordered by id.
emotion_df["label"].value_counts(normalize=True).sort_index()

label
0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: proportion, dtype: float64

#### Tokenize all the things

In [11]:
from transformers import AutoTokenizer 

# Pretrained checkpoint name; used to load the tokenizer here and the
# classification model later in the notebook.
model_ckpt = "microsoft/MiniLM-L12-H384-uncased"

# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [12]:
# Tokenize a single example. Index the row first so only one text is read,
# rather than pulling the whole "text" column just to take its first element.
tokenizer(emotion_dataset["train"][0]["text"])

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [13]:
def tokenize_text(examples):
    """Tokenize the ``text`` field of ``examples``.

    Sequences are truncated to at most 512 tokens. Returns whatever the
    notebook-level ``tokenizer`` produces for ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [14]:
# Apply tokenize_text to every split; batched=True hands the function groups
# of examples at a time. The tokenizer outputs are added as new columns.
emotion_dataset = emotion_dataset.map(tokenize_text, batched = True)
emotion_dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

#### Dealing with imbalanced classes

In [15]:
# Weight each class by (1 - its relative frequency) so rarer classes count more.
label_counts = emotion_df["label"].value_counts().sort_index()
label_freqs = label_counts / len(emotion_df)
class_weights = (1 - label_freqs).values
class_weights

array([0.708375 , 0.664875 , 0.9185   , 0.8650625, 0.8789375, 0.96425  ])

In [16]:
import torch

# Prefer the GPU when one is available, but fall back to CPU so the notebook
# still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# as_tensor accepts both the NumPy array from the previous cell and an
# already-converted tensor, so re-running this cell does not raise.
class_weights = torch.as_tensor(class_weights, dtype=torch.float32, device=device)
class_weights

tensor([0.7084, 0.6649, 0.9185, 0.8651, 0.8789, 0.9643], device='cuda:0')

In [17]:
# Rename "label" -> "labels" (the key compute_loss reads from the batch).
# Guarded so that re-running the cell after the rename is a no-op instead of
# an error about a missing "label" column.
if "label" in emotion_dataset["train"].column_names:
    emotion_dataset = emotion_dataset.rename_column("label", "labels")

In [24]:
from torch import nn 
import torch 
from transformers import Trainer 

class WeightedLossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights come from the notebook-level ``class_weights``
    tensor defined in an earlier cell.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Mapping of batch tensors; must contain ``"labels"``.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Run the forward pass without labels so the model does not also compute
        # its own unweighted loss, which would be discarded anyway. A filtered
        # copy is used so the caller's batch is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits instead of relying
        # on wherever the global tensor happened to be placed.
        weights = class_weights.to(logits.device)
        loss_func = nn.CrossEntropyLoss(weight=weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss


#### Putting it all together

In [19]:
from transformers import AutoModelForSequenceClassification 

# Take the number of labels from the mapping rather than hard-coding it, so the
# classification head always matches id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Return the weighted F1 score for a set of predictions.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis to get
    predicted class ids, which are scored against ``pred.label_ids``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    score = f1_score(pred.label_ids, predicted_ids, average="weighted")
    return {"f1": score}


#### Training

In [21]:
from transformers import TrainingArguments 

batch_size = 64
# Number of full batches in one pass over the training split, so the training
# loss is logged about once per epoch.
logging_steps = len(emotion_dataset["train"]) // batch_size

output_dir = "transformer-training"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_eval_batch_size=batch_size,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the current name.
    eval_strategy="epoch",
    logging_steps=logging_steps,
    fp16=True,  # mixed-precision training
)

In [25]:
# Train on the "train" split and evaluate on "validation" with the weighted loss.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotion_dataset["train"],
    eval_dataset=emotion_dataset["validation"],
    processing_class=tokenizer,
)

In [26]:
# Fine-tune the model; the table below reports training loss, validation loss
# and F1 after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.4163,1.041749,0.595663
2,0.8917,0.682104,0.853281
3,0.617,0.507585,0.892664
4,0.4723,0.426393,0.905262
5,0.4041,0.39174,0.913972


TrainOutput(global_step=1250, training_loss=0.7602637145996094, metrics={'train_runtime': 49.5196, 'train_samples_per_second': 1615.521, 'train_steps_per_second': 25.243, 'total_flos': 581870908894464.0, 'train_loss': 0.7602637145996094, 'epoch': 5.0})

In [27]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
trained_model_dir = "transformer-training/trained_model"
trainer.save_model(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)

('transformer-training/trained_model\\tokenizer_config.json',
 'transformer-training/trained_model\\special_tokens_map.json',
 'transformer-training/trained_model\\vocab.txt',
 'transformer-training/trained_model\\added_tokens.json',
 'transformer-training/trained_model\\tokenizer.json')

In [29]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned model and tokenizer from the saved directory
saved_model_dir = "transformer-training/trained_model"
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

# Wrap both in a text-classification pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Smoke-test the pipeline on a single sentence
sample_text = "I am very happy with my experience!"
result = classifier(sample_text)
print(result)


Device set to use cuda:0


[{'label': 'joy', 'score': 0.9064372181892395}]
