## Preparing Working Environment

In [None]:
! pip install transformers datasets peft evaluate -q

In [3]:
! pip install python-dotenv -q

In [5]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
import os
from huggingface_hub import login

login(token=os.getenv("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Emoji-reaction-coach-with-lora"
os.environ["WANDB_NOTES"] = "Fine tune model with low rank adaptation for an emoji reaction coach"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use only one GPU

## Download and inspect the dataset

In [3]:
from datasets import load_dataset

ds = load_dataset("tweet_eval", "emoji")

README.md:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.61M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.05M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/282k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [5]:
ds["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 45000
})

## Tokenise and Rename the Label Column

In [7]:
from transformers import AutoTokenizer

checkpoint = "FacebookAI/roberta-base"
tok = AutoTokenizer.from_pretrained(checkpoint)

def tokenise(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=128)

ds_tok = ds.map(tokenise, batched=True)
ds_tok = ds_tok.rename_column("label", "labels") # hf classification models often expect the key "labels" (plural)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
ds_tok

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

## Attach a LoRA adapter and train

In [117]:
from evaluate import load
import torch
import numpy as np

acc_metric = load("accuracy")
f1_metric = load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    metrics = {}
    # accuracy
    metrics.update(acc_metric.compute(predictions=preds, references=labels))
    # macro average across 20 emoji classes
    metrics.update(f1_metric.compute(predictions=preds, references=labels, average="macro"))
    # top-3 accuracy
    top3 = np.any(np.argsort(logits, axis=-1)[:, -3:] == labels[:, None], axis=1)
    metrics["top3_accuracy"] = top3.mean().item()
    
    return metrics

Macro-F1 helps with Datasets with skewed frequency (tweet_eval / emoji is highly skewed: “😂” appears ~9× more than “😩”).

Top-k accuracy shows k candidates for the ground truth label.

In [131]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [132]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=20
)
print_trainable_parameters(model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 124661012 || all params: 124661012 || trainable%: 100.00


In [133]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        "query", "value", "key"
    ],
    lora_dropout=0.05,
    bias="none",
    modules_to_save=["classifier"]
)

lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 2375444 || all params: 127036456 || trainable%: 1.87


In [134]:
from uuid import uuid4
from transformers import DataCollatorWithPadding

model_name = checkpoint.split("/")[-1]
model_id = f"{model_name}-with-tweet-eval-emoji-full"

training_args = TrainingArguments(
    output_dir=model_id,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,
    num_train_epochs=4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    max_grad_norm=0.5,
    label_smoothing_factor=0.1,
    save_total_limit=3,
    logging_steps=30,
    fp16=True,
    push_to_hub=False,
    run_name = f"emoji-{uuid4().hex[:8]}",
    report_to="wandb",
    label_names=["labels"],
)
data_collator = DataCollatorWithPadding(tok, pad_to_multiple_of=8)

In [135]:
from transformers import Trainer
from sklearn.utils.class_weight import compute_class_weight

labels = np.array(ds_tok["train"]["labels"])
weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(weights, dtype=torch.float)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = torch.nn.functional.cross_entropy(
            logits, labels, weight=class_weights.to(logits.device)
        )
        return (loss, outputs) if return_outputs else loss


In [136]:
import wandb

trainer = WeightedTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Top3 Accuracy
1,2.1401,2.417112,0.2224,0.220445,0.4152
2,2.1157,2.372503,0.2412,0.2463,0.4376
3,2.0143,2.351388,0.2508,0.255062,0.4448
4,1.9346,2.357327,0.2594,0.263203,0.45


0,1
eval/accuracy,▁▅▆█
eval/f1,▁▅▇█
eval/loss,█▃▁▂
eval/runtime,▃█▅▁
eval/samples_per_second,▆▁▄█
eval/steps_per_second,▆▁▄█
eval/top3_accuracy,▁▆▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▂▅▄▆▆▄▃▆▄▄▅▄▅▆█▆▅▅▆▄▆▆▅▇▆▆▆▇▅▇▆▆▇██▆█▆█

0,1
eval/accuracy,0.2594
eval/f1,0.2632
eval/loss,2.35733
eval/runtime,10.4631
eval/samples_per_second,477.869
eval/steps_per_second,1.911
eval/top3_accuracy,0.45
total_flos,1.217029238784e+16
train/epoch,4.0
train/global_step,1408.0


In [124]:
import torch
import gc

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

gc.collect()

10058

In [139]:
metrics = trainer.evaluate(ds_tok["test"])
print(metrics)

# save weights and tokenizer
lora_model.save_pretrained(model_id)
tok.save_pretrained(model_id)

{'eval_loss': 1.9591624736785889, 'eval_accuracy': 0.4286, 'eval_f1': 0.33462876199193464, 'eval_top3_accuracy': 0.65028, 'eval_runtime': 104.9624, 'eval_samples_per_second': 476.361, 'eval_steps_per_second': 1.867, 'epoch': 4.0}


('roberta-base-with-tweet-eval-emoji-full/tokenizer_config.json',
 'roberta-base-with-tweet-eval-emoji-full/special_tokens_map.json',
 'roberta-base-with-tweet-eval-emoji-full/vocab.json',
 'roberta-base-with-tweet-eval-emoji-full/merges.txt',
 'roberta-base-with-tweet-eval-emoji-full/added_tokens.json',
 'roberta-base-with-tweet-eval-emoji-full/tokenizer.json')

In [None]:
wandb.finish()

In [140]:
!ls -lh {model_id}

total 14M
-rw-r--r-- 1 root root  870 Jun 23 18:44 adapter_config.json
-rw-r--r-- 1 root root 9.1M Jun 23 18:44 adapter_model.safetensors
drwxr-xr-x 2 root root 4.0K Jun 23 18:32 checkpoint-1056
drwxr-xr-x 2 root root 4.0K Jun 23 16:59 checkpoint-1408
drwxr-xr-x 2 root root 4.0K Jun 23 17:03 checkpoint-2112
-rw-r--r-- 1 root root 446K Jun 23 18:44 merges.txt
-rw-r--r-- 1 root root 5.0K Jun 23 18:44 README.md
-rw-r--r-- 1 root root  280 Jun 23 18:44 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Jun 23 18:44 tokenizer_config.json
-rw-r--r-- 1 root root 3.4M Jun 23 18:44 tokenizer.json
-rw-r--r-- 1 root root 780K Jun 23 18:44 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [141]:
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=20
)
inference_model = PeftModel.from_pretrained(model, model_id).eval()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [142]:
id2label = {
    0: "❤", 1: "😍", 2: "😂", 3: "💕", 4: "🔥",
    5: "😊", 6: "😎", 7: "✨", 8: "💙", 9: "😘",
    10: "📷", 11: "🇺🇸", 12: "☀", 13: "💜", 14: "😉",
    15: "💯", 16: "😁", 17: "🎄", 18: "📸", 19: "😜"
}

In [152]:
from transformers import pipeline

pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tok,
    return_all_scores=True,          # return all 20 logits
    function_to_apply="softmax"      # turn logits -> probabilities
)

def emojify(text, k=3):
    probs = pipe(text)[0]
    top = sorted(probs, key=lambda x: x["score"], reverse=True)[:k]
    return " ".join(id2label[int(d["label"].split("_")[-1])] for d in top)


print(emojify("Sunny day"))

Device set to use cuda:0


☀ 💙 😎


In [153]:
repo_id = "codinglabsong/roberta-base-tweet-emoji-lora"

trainer.create_model_card(   # adds automatic metrics, dataset info, tags
    model_name=repo_id.split("/")[-1],
    language="en",
    license="apache-2.0",
    tags=["twitter", "emoji", "lora", "roberta-base"],
)

trainer.push_to_hub(
    repo_id,                        # full path, e.g. "your-hf-username/…"
    finetuned_from="roberta-base",
    tasks="text-classification",
    dataset="tweet_eval/emoji",
)

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/9.51M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/codinglabsong/roberta-base-with-tweet-eval-emoji-full/commit/76696d41cb9ae04b75b3324ac2c921f5b055ae1c', commit_message='codinglabsong/roberta-base-tweet-emoji-lora', commit_description='', oid='76696d41cb9ae04b75b3324ac2c921f5b055ae1c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/codinglabsong/roberta-base-with-tweet-eval-emoji-full', endpoint='https://huggingface.co', repo_type='model', repo_id='codinglabsong/roberta-base-with-tweet-eval-emoji-full'), pr_revision=None, pr_num=None)

In [159]:

adapter_id = "codinglabsong/roberta-base-tweet-emoji-lora"
base_id    = "roberta-base"

# 1) tokenizer
tok = AutoTokenizer.from_pretrained(adapter_id)

# 2) base backbone (suppress that first warning)
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_id,
    num_labels=20,
    ignore_mismatched_sizes=True,   # hides “not initialized” notice
)

# 3) load LoRA
model = PeftModel.from_pretrained(base_model, adapter_id)

# OPTIONAL: merge so the pipeline sees a plain Roberta model
model = model.merge_and_unload()    # now type(model) == RobertaForSequenceClassification
model.eval()

# 4) build pipeline (no “unsupported” warning anymore)
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tok,
    return_all_scores=True,
    function_to_apply="softmax",
)

def emojify(text, k=3):
    scores = pipe(text)[0]
    top = sorted(scores, key=lambda x: x["score"], reverse=True)[:k]
    return " ".join(id2label[int(s["label"].split("_")[-1])] for s in top)

print(emojify("Sunny day"))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


☀ 💙 😎


| Area                              | What’s happening in your notebook                 | Why it hurts                                                                                            | How to fix                                                                                                                   |
| --------------------------------- | ------------------------------------------------- | ------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| **Evaluation flag typo**          | `eval_strategy="epoch"`                           | HF ignores this; no mid-epoch evaluation or early stopping signals.                                     | Rename to `evaluation_strategy="epoch"` (or `"steps"`).                                                                      |
| **Learning-rate & batch size**    | `lr=9 e-4`, `batch=256`                           | Very high LR + very large batch ⇒ under-fitting & unstable loss (your val loss 2.57 vs baseline ≈ 1.6). | Try **2 e-4 – 3 e-4** with batch 32 (or keep 256 but add `gradient_accumulation_steps` so the *effective* batch is smaller). |
| **LoRA only, small base model**   | LoRA on DeBERTa-v3-small                          | You froze most weights; for 20-way classification, the extra capacity matters.                          | **Full fine-tune** the base model first (or use DeBERTa-v3-base / RoBERTa-base). Once that is solid, experiment with LoRA.   |
| **LoRA target list**              | `["query_proj","value_proj", … ]` (no `key_proj`) | Missing keys ⇒ only part of self-attention adapts.                                                      | Add `"key_proj"` (and verify exact layer names with `print(model)`); or let 🤗 PEFT auto-match by regex.                     |
| **Max sequence length**           | `max_length=70`                                   | \~25 % of tweets in this set are longer; you’re truncating signal.                                      | Increase to **128** (Twitter limit is 280 chars; 128 covers > 97 % after tokenisation).                                      |
| **Class imbalance**               | Plain CE loss                                     | Macro-F1 punishes you for rare emojis (some appear < 1 % in train).                                     | Pass **class weights** to loss (or use **focal loss**). HF `Trainer` lets you supply a custom `loss_fn`.                     |
| **Weight decay & regularisation** | Not set                                           | Over-fitting the dominant classes, poor generalisation.                                                 | Add `weight_decay=0.01` and/or `label_smoothing_factor=0.1`.                                                                 |
| **Dynamic padding**               | Padding to 70 tokens everywhere wastes compute.   | Fewer real tokens per batch ⇒ lower throughput.                                                         | Use `DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)` for mixed-precision efficiency.                               |
