# Showcase of a PEFT technique
- PEFT Technique: LoRA
- Model: "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
- Evaluation approach: Determine the name of the Pokemon from its text description
- Fine-tuning dataset: "JoaoFassina/pokemon_anotated"

In [39]:
! pip install -q "datasets==2.15.0"
! pip install transformers

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.0/435.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading tokenizers-0.21.0-cp39

In [34]:
from datasets import load_dataset

# Load the "train" split, then carve out a 20% test set with a seeded shuffle.
original_dataset = (
    load_dataset("JoaoFassina/pokemon_anotated", split="train")
    .train_test_split(test_size=0.2, shuffle=True, seed=27)
)

In [35]:
import pandas as pd

# Turn the training split into a DataFrame and preview the raw descriptions.
df = pd.DataFrame(data=original_dataset["train"])
df.loc[:, "text"].head()

0    Sentret is a small mammalian pokemon covered i...
1    Mewtwo is a humanoid pokemon with a white body...
2    Sunflora is Sunkern evolution it resembles a s...
3    Tangela is a plant pokemon covered by blue sea...
4    Tornadus is a humanoid pokemon it resembles a ...
Name: text, dtype: object

In [36]:
def split_text_to_label(example):
    """
    Split a description into its leading Pokemon name and the remaining text.

    Each entry begins with the name of the Pokemon, so the first
    whitespace-separated word becomes the label and the remaining words
    (re-joined with single spaces) become the text.

    Args:
        example: Mapping with a 'text' entry holding the description string.

    Returns:
        dict with 'label' (the first word) and 'text' (the rest). Both are
        empty strings when the text is None, empty, or only whitespace.
    """
    # Treat a missing (None) description like an empty one instead of raising.
    words = (example['text'] or '').split()

    if not words:
        # Nothing to split; indexing words[0] here would raise IndexError.
        return {'label': '', 'text': ''}

    label, *rest = words
    # ' '.join([]) is '', so a one-word entry yields an empty text.
    return {'label': label, 'text': ' '.join(rest)}

# Apply the name/description split to every example in each split.
dataset = original_dataset.map(function=split_text_to_label)

# Preview the derived columns on the training split.
df = pd.DataFrame(data=dataset["train"])
df.loc[:, ["label", "text"]].head()

Unnamed: 0,label,text
0,Sentret,is a small mammalian pokemon covered in brown ...
1,Mewtwo,is a humanoid pokemon with a white body it has...
2,Sunflora,is Sunkern evolution it resembles a sunflower ...
3,Tangela,is a plant pokemon covered by blue seaweed-lik...
4,Tornadus,is a humanoid pokemon it resembles a genie it ...


In [37]:
# Show the dataset summary to check the columns and row counts after the map.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label'],
        num_rows: 634
    })
    test: Dataset({
        features: ['image', 'text', 'label'],
        num_rows: 159
    })
})

In [44]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Tokenizer and sequence-classification model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def tokenize_batch(batch):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    return tokenizer(batch["text"], truncation=True)


# Tokenize each split on its own and collect the results in a plain dict.
tokenized_dataset = {}
for split, split_data in dataset.items():
    tokenized_dataset[split] = split_data.map(tokenize_batch, batched=True)

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Map:   0%|          | 0/159 [00:00<?, ? examples/s]

Dataset({
    features: ['image', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 634
})