# Downloads

In [1]:
! pip install datasets transformers[torch] -Uqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [2]:
import torch
import huggingface_hub
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import sklearn.metrics as metrics

## Hugging Face Model Upload

In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# How to Fine-Tune Transformers

## 1. Load Data

In [4]:
# Let's load data from huggingface hub
data = load_dataset(
    path = "emotion"
)

print(data)

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [5]:
data["train"].to_pandas().head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [6]:
print(data["train"].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


## 2. Load Tokenizer & Encode Data

In [7]:
model_checkpoint = "distilbert-base-uncased"

# Initialize tokenizer from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path = model_checkpoint
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Encode Data
def encode_data(batch):
    return tokenizer(
        text = batch["text"],
        padding = True,
        truncation = True
    )

encoded_data = data.map(
    function = encode_data,
    batched = True,
    batch_size = None
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
encoded_data["train"].to_pandas().head()

Unnamed: 0,text,label,input_ids,attention_mask
0,i didnt feel humiliated,0,"[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,i can go from feeling so hopeless to so damned...,0,"[101, 1045, 2064, 2175, 2013, 3110, 2061, 2062...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,im grabbing a minute to post i feel greedy wrong,3,"[101, 10047, 9775, 1037, 3371, 2000, 2695, 104...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,i am ever feeling nostalgic about the fireplac...,2,"[101, 1045, 2572, 2412, 3110, 16839, 9080, 128...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,i am feeling grouchy,3,"[101, 1045, 2572, 3110, 24665, 7140, 11714, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. Load Model

In [10]:
labels = data["train"].features["label"].names
print(labels)

num_labels = len(labels)
print(num_labels)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
6


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = model_checkpoint,
    num_labels = num_labels
).to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Prepare Compute Metrics

In [12]:
from transformers import EvalPrediction
# Create compute metrics function
def compute_metrics(pred:EvalPrediction):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = metrics.accuracy_score(
        y_true = labels,
        y_pred = preds
    )

    f1 = metrics.f1_score(
        y_true = labels,
        y_pred = preds,
        average = "weighted"
    )

    return {
        "accuracy_score" : acc,
        "f1_score" : f1,
    }

## 5. Prepare TrainingArguments

In [13]:
# Let's prepare training arguments
training_args = TrainingArguments(
    output_dir = f"{model_checkpoint}-emotion-ai",
    evaluation_strategy = "epoch",
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    learning_rate = 2e-5,
    num_train_epochs = 2,
    push_to_hub = True,
    weight_decay = 0.01,
    disable_tqdm = False,
    log_level = "error",
    logging_steps = len(encoded_data["train"]) // 64
)

## 6. Fine-Tune with Trainer

In [16]:
# Let's prepare Trainer for Fine-Tuning
trainer = Trainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = encoded_data["train"],
    eval_dataset = encoded_data["validation"]
)

print(trainer)

<transformers.trainer.Trainer object at 0x79540998b1f0>


In [17]:
# Let's fine-tune
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy Score,F1 Score
1,0.8119,0.311978,0.9095,0.908839
2,0.2464,0.2142,0.9265,0.926536


TrainOutput(global_step=500, training_loss=0.5291275634765625, metrics={'train_runtime': 236.9861, 'train_samples_per_second': 135.029, 'train_steps_per_second': 2.11, 'total_flos': 720342861696000.0, 'train_loss': 0.5291275634765625, 'epoch': 2.0})

In [18]:
from transformers import pipeline

In [19]:
model = pipeline(
    task = "text-classification",
    model = "/content/distilbert-base-uncased-emotion-ai"
)

In [22]:
prediction = model("I saw a movie today and it was really good.", top_k = 6)
prediction

[{'label': 'LABEL_1', 'score': 0.9569684863090515},
 {'label': 'LABEL_0', 'score': 0.013363274745643139},
 {'label': 'LABEL_2', 'score': 0.007976988330483437},
 {'label': 'LABEL_3', 'score': 0.007941574789583683},
 {'label': 'LABEL_5', 'score': 0.0075563229620456696},
 {'label': 'LABEL_4', 'score': 0.006193348206579685}]

In [23]:
dir(prediction)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']