<a href="https://colab.research.google.com/github/gupta24789/hugging-face/blob/main/05_emotion_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multi-Class Classification

In [None]:
# Start from a clean slate: delete the checkpoint directory ("checkpoints_logs") and
# logging directory ("logs") that the training cell writes to, plus "mlruns"
# (presumably an MLflow tracking directory left by earlier runs -- confirm).
!rm -rf checkpoints_logs logs mlruns

In [None]:
import os

# Process-wide environment flags, set before the ML libraries are imported below.
# NOTE(review): CUDA_VISIBLE_DEVICES="1" exposes only the GPU with index 1. On a
# single-GPU host (e.g. a default Colab runtime) that would hide the only GPU --
# confirm this matches the machine the notebook is run on.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': "1",
    'TOKENIZERS_PARALLELISM': "0",
})

In [None]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

## Load data

In [None]:
# Load the raw dataset by its repository id. Each row holds a single `text`
# string; the label is still embedded in that string at this point.
dataset = load_dataset("sg247/multiclass-classification")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2000
    })
})

## Transform data

In [None]:
def split_text_and_label(row):
    """Split a raw ``"<sentence>;<label>"`` row into separate fields.

    Args:
        row: one dataset row (a dict). Raw rows carry the sentence and its
            label together in ``row['text']``, joined by ``';'``.

    Returns:
        dict with ``"text"`` (the sentence) and ``"labels"`` (the label name).

    Raises:
        ValueError: if a raw row contains no ``';'`` separator.
    """
    # The row was already split (this cell was re-run on the mapped dataset):
    # pass it through instead of failing on the now-missing ';'.
    if 'labels' in row:
        return {"text": row['text'], "labels": row['labels']}
    # Split once, from the right: the label is whatever follows the LAST ';',
    # so a ';' inside the sentence itself cannot corrupt the label.
    text, label = row['text'].rsplit(';', 1)
    return {"text": text, "labels": label}

dataset = dataset.map(split_text_and_label)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2000
    })
})

In [None]:
# Peek at the first training example to check the text / label fields.
dataset['train'][0]

{'text': 'i didnt feel humiliated', 'labels': 'sadness'}

In [None]:
# Inspect the column types of the training split.
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='string', id=None)}

In [None]:
# Distinct label names, kept in the order in which they first appear in the
# training split (dict keys preserve insertion order).
train_label_column = dataset['train']['labels']
unique_labels = list(dict.fromkeys(train_label_column))
unique_labels

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']

In [None]:
## Define tokenizer
# `model_name` is the pretrained checkpoint id; the same name is used again
# when the classification model is loaded, so tokenizer and model stay in sync.
model_name = 'albert-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Two-way lookup between integer class ids and label names.
# Ids follow the position of each name in `unique_labels`.
id2label = {class_id: name for class_id, name in enumerate(unique_labels)}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
def tokenized_text(row, max_length=50):
    """Tokenize one dataset row and attach its integer class id.

    Args:
        row: mapping with a ``'text'`` string and a ``'labels'`` label name.
        max_length: sequence length passed to the tokenizer; with
            ``padding='max_length'`` and ``truncation=True`` every example is
            padded or cut to this length. Defaults to 50, the value that was
            previously hard-coded here.

    Returns:
        The tokenizer's encoding of ``row['text']`` with an added ``'labels'``
        entry holding ``label2id[row['labels']]``.

    Raises:
        KeyError: if the row's label name is not a key of ``label2id``.
    """
    encoded_text = tokenizer(
        row['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    encoded_text['labels'] = label2id[row['labels']]
    return encoded_text

In [None]:
# Encode every row of both splits, then drop the raw string column so only
# the label and tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenized_text).remove_columns("text")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

## Build Model

In [None]:
## CONFIG
# Hyperparameters, grouped here so they are easy to find and tune.
EPOCHS = 5
LEARNING_RATE = 1e-5
TRAIN_BATCH_SIZE, EVAL_BATCH_SIZE = 128, 64

# The 'test' split doubles as the evaluation set during training.
train_dataset, eval_dataset = tokenized_dataset['train'], tokenized_dataset['test']

# A single combined metric object reporting F1, precision and recall together.
metric = evaluate.combine(['f1', 'precision', 'recall'])

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` with weighted averaging, using the
        arg-max over the last axis of the logits as the predicted class ids.
    """
    scores, references = eval_pred
    # The highest-scoring entry along the last axis is the predicted class id.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )

## Reproducibility
# The classification head is newly (randomly) initialised by from_pretrained(),
# and Trainer only seeds the RNGs inside its constructor -- i.e. after the model
# below has already been created. Seed explicitly first so that re-running the
# notebook starts from the same initial weights.
SEED = 42
set_seed(SEED)

## Training argument
# Evaluate and checkpoint once per epoch; checkpoints go to "checkpoints_logs"
# and training logs to "logs".
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
args = TrainingArguments(
    output_dir="checkpoints_logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    use_cpu=False,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    weight_decay=.01,
    warmup_steps=100,
    logging_dir="logs",
    logging_steps=100,
    run_name=None,
    seed=SEED,
)

## Model
# One output per label; the id <-> name mappings are stored with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

## Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

## Model training
trainer.train()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,1.5733,1.105911,0.460237,0.388837,0.5845
2,1.002,0.506952,0.84812,0.860331,0.8575
3,0.5472,0.326872,0.902739,0.907503,0.904
4,0.2546,0.275435,0.913067,0.918216,0.9145
5,0.207,0.255573,0.914568,0.916676,0.915


  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory checkpoints_logs/checkpoint-125 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory checkpoints_logs/checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=625, training_loss=0.6353464401245117, metrics={'train_runtime': 258.4605, 'train_samples_per_second': 309.525, 'train_steps_per_second': 2.418, 'total_flos': 186777744000000.0, 'train_loss': 0.6353464401245117, 'epoch': 5.0})

## Inference

In [None]:
# Switch to evaluation mode and move the model to the CPU for the
# inference cells that follow.
model.eval()
model = model.to("cpu")

In [None]:
def predict_emotion(text):
    """Return the label name the fine-tuned model predicts for ``text``.

    Args:
        text: the sentence to classify.

    Returns:
        The entry of ``id2label`` for the highest-scoring class.
    """
    # Truncate so over-long input cannot exceed the tokenizer's maximum length,
    # and place the tensors on whatever device the model currently lives on.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    logits = model(**inputs)['logits'].detach().cpu().numpy()
    # int() turns the NumPy scalar into a plain Python key for the dict lookup.
    return id2label[int(np.argmax(logits, axis=-1)[0])]

text = "I love you"
predict_emotion(text)

'joy'

In [None]:
# Classify a single sentence: tokenize, run the model, take the top class.
text = "how dare you"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
logits = outputs['logits'].detach().numpy()
# Index of the largest logit in the first (only) row of the batch.
preds = logits.argmax(axis=-1)[0]
id2label[preds]

'fear'