## Preparing Working Environment

In [1]:
! pip install transformers datasets peft evaluate -q

In [23]:
! pip install python-dotenv numpy -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
# Load environment variables from a .env file so the cells below can read
# HUGGINGFACE_TOKEN and WANDB_API_KEY via os.getenv().
from dotenv import load_dotenv
# Left as the last expression so the cell displays the return value.
load_dotenv()

True

In [4]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token read from the
# environment (os.getenv returns None when the variable is unset).
login(token=os.getenv("HUGGINGFACE_TOKEN"))

# The W&B key must already be in the process environment. Re-assigning
# os.environ["WANDB_API_KEY"] from os.getenv() is a no-op when the key is set
# and raises an opaque TypeError (os.environ only accepts str) when it is not,
# so check for it explicitly and fail with an actionable message instead.
if os.getenv("WANDB_API_KEY") is None:
    raise RuntimeError(
        "WANDB_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

# Weights & Biases run metadata.
os.environ["WANDB_PROJECT"] = "Emoji-reaction-coach-with-lora"
os.environ["WANDB_NOTES"] = "Fine tune model with low rank adaptation for an emoji reaction coach"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use only one GPU

## Download and inspect the dataset

In [17]:
# load_emoji_dataset is a project-local helper (src/data); its implementation
# is not shown in this notebook.
from src.data import load_emoji_dataset

# `ds` is reused by the inspection and tokenisation cells below.
ds = load_emoji_dataset()

In [18]:
# Display the splits with their features and row counts.
# NOTE(review): the recorded output shows a test split (50,000 rows) larger
# than the train split (45,000 rows) — confirm this is intended.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [19]:
# Display the features and row count of the training split only.
ds["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 45000
})

## Tokenise the Text and Rename the Label Column

In [21]:
# tokenize_and_format is a project-local helper (src/data); its implementation
# is not shown in this notebook.
from src.data import tokenize_and_format

# Returns two values: the processed dataset and a second object bound to `tok`
# (presumably the tokenizer for the given checkpoint — TODO confirm).
# Per the recorded outputs, the result has `input_ids` and `attention_mask`
# columns and a `labels` column in place of `label`.
ds_tok, tok = tokenize_and_format(ds, checkpoint="FacebookAI/roberta-base")

Map: 100%|██████████| 45000/45000 [00:07<00:00, 6022.16 examples/s]
Map: 100%|██████████| 50000/50000 [00:06<00:00, 7145.92 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8115.74 examples/s]


In [22]:
# Display the processed splits to check the resulting columns and row counts.
ds_tok

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})