In [None]:
!pip install datasets



In [None]:
# fix dataset with datasets library

from sklearn.model_selection import train_test_split
from google.colab import drive
import pandas as pd

# Mount Google Drive and read the raw BBC-text CSV stored under `root`.
drive.mount('/content/gdrive')
root = "/content/gdrive/MyDrive/Colab Notebooks/torch/"
df = pd.read_csv(root+"data/BBC-text/bbc-text.csv")

# System prompt shared by every example, and the prefix of the expected answer.
instruction = "Classify the following article in one of the following categories: business, politics, tech, sport or entertainment"
answer = "I would classify the article as: "

# Vectorized string concatenation instead of a row-wise apply().
df['answers'] = answer + df['category']

# 80/20 train/test split, then 10% of the training part is held out for validation.
(x_train, x_test, y_train, y_test) = train_test_split(df['text'], df['answers'], test_size=0.2, random_state=17)
(x_train, x_val, y_train, y_val) = train_test_split(x_train, y_train, test_size=0.1, random_state=17)

In [None]:
from datasets import Dataset, DatasetDict

def create_dataset(x_split, y_split):
    """Build a `Dataset` with one instruction/input/label record per example.

    Args:
        x_split: Series (or other iterable) of article texts.
        y_split: Series (or other iterable) of textual answers, in the same
            order as `x_split`.

    Returns:
        A `Dataset` with the columns 'instruction', 'input' and 'textual_label'.
        Every row carries the module-level `instruction` string.
    """
    # Materialize both columns in positional order; this replaces the previous
    # reset_index() + .at[i] lookups inside a generator.
    inputs = list(x_split)
    labels = list(y_split)

    return Dataset.from_dict({
        'instruction': [instruction] * len(inputs),
        'input': inputs,
        'textual_label': labels,
    })

dataset_train = create_dataset(x_train, y_train)
dataset_val = create_dataset(x_val, y_val)
dataset_test = create_dataset(x_test, y_test)

In [None]:
# Group the three splits under their split names.
dataset = DatasetDict(train=dataset_train, val=dataset_val, test=dataset_test)

In [None]:
# Write each split to its own .jsonl file under `root`.
# The value of the last call is what the cell displays as its output.
dataset_train.to_json(root+"data/BBC-text/bbc-text-train.jsonl")
dataset_val.to_json(root+"data/BBC-text/bbc-text-val.jsonl")
dataset_test.to_json(root+"data/BBC-text/bbc-text-test.jsonl")

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1104814

In [None]:
from datasets import load_dataset

# Map each split name to its .jsonl file, then load them as one DatasetDict.
data_files = {
    'train': root+"data/BBC-text/bbc-text-train.jsonl",
    'val': root+"data/BBC-text/bbc-text-val.jsonl",
    'test': root+"data/BBC-text/bbc-text-test.jsonl",
}
d = load_dataset('json', data_files=data_files)
d

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'textual_label'],
        num_rows: 1602
    })
    val: Dataset({
        features: ['instruction', 'input', 'textual_label'],
        num_rows: 178
    })
    test: Dataset({
        features: ['instruction', 'input', 'textual_label'],
        num_rows: 445
    })
})

In [None]:
from google.colab import userdata

# Hugging Face access token stored in the Colab secrets panel.
hf_auth = userdata.get('HF_TOKEN')

lm_model_inst = 'meta-llama/Llama-3.2-1B-Instruct'

# Load model directly
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Pass the token explicitly: it was previously fetched but never used, so the
# download depended on ambient credentials being picked up implicitly.
tokenizer = AutoTokenizer.from_pretrained(lm_model_inst, token=hf_auth)
model = AutoModelForCausalLM.from_pretrained(lm_model_inst, token=hf_auth)

# Fall back to CPU instead of failing when no GPU runtime is attached.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [None]:
# load model (wrap it with LoRA adapters)
from peft import get_peft_model, LoraConfig

# LoRA adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    lora_dropout=0.1,
    # Declare the task so PEFT builds its causal-LM wrapper rather than the
    # generic one.
    task_type="CAUSAL_LM",
)

model_p = get_peft_model(model, lora_config)
# Trade compute for memory during backpropagation.
model_p.gradient_checkpointing_enable()

In [None]:
def fix_dataset(old_dataset):
    """Add a chat-formatted "text" column to every split of `old_dataset`.

    Each row is rendered with the module-level `tokenizer`'s chat template as a
    three-turn conversation: system = 'instruction', user = 'input',
    assistant = 'textual_label'. The template is applied with `tokenize=False`,
    so the new column holds plain strings.

    Args:
        old_dataset: Mapping of split name to a dataset whose rows have the
            keys 'instruction', 'input' and 'textual_label'.

    Returns:
        A `DatasetDict` with the same splits, each with the extra "text" column.
    """
    new_dataset = {}
    for split, rows in old_dataset.items():
        # Iterate the rows once instead of indexing the split three times per row.
        complete_input = [
            tokenizer.apply_chat_template(
                [{'role': 'system', 'content': row['instruction']},
                 {'role': 'user', 'content': row['input']},
                 {'role': 'assistant', 'content': row['textual_label']}],
                tokenize=False,
            )
            for row in rows
        ]
        new_dataset[split] = rows.add_column("text", complete_input)

    return DatasetDict(new_dataset)

fixed_dataset = fix_dataset(d)

In [None]:
# Show the chat-formatted text of the first training example.
fixed_dataset['train'][0]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 01 Dec 2024\n\nClassify the following article in one of the following categories: business, politics, tech, sport or entertainment<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhalo fans  hope for sequel xbox video game halo 2 has been released in the us on 9 november  with a uk release two days later. why is the game among the most anticipated of all time   halo is considered by many video game pundits to be one of the finest examples of interactive entertainment ever produced and more than 1.5 million people worldwide have pre-ordered the sequel. a science fiction epic  halo centred the action on a human cyborg  controlled by the player  who had to save his crew from an alien horde after a crash landing on a strange and exotic world contained on the interior surface of a giant ring in space. remembrance of things past it was not - but as a slice of schlock science

In [None]:
!pip install trl

Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.12.1


In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

import wandb
wandb.init(mode="disabled")

from trl import SFTConfig

# SFTConfig extends TrainingArguments with the SFT-specific options. Passing
# those options to SFTTrainer directly is deprecated.
training_args = SFTConfig(
    output_dir = root+'/models/llama32ft/',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-3,
    gradient_accumulation_steps=16,
    logging_steps=10,
    # The string "none" disables all reporting integrations; `None` does not.
    report_to="none",
    dataset_text_field="text",
    packing=False,
    max_seq_length=1024,
)

trainer = SFTTrainer(
        model=model,
        train_dataset=fixed_dataset['train'],
        peft_config=lora_config,
        args=training_args,
        tokenizer=tokenizer,
    )

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1602 [00:00<?, ? examples/s]

Step,Training Loss
10,3.4242
20,3.388
30,3.3669
40,3.373
50,3.2844
60,3.3166
70,3.3256
80,3.2367
90,3.1508
100,3.2055


TrainOutput(global_step=401, training_loss=3.044400676527523, metrics={'train_runtime': 2813.4971, 'train_samples_per_second': 0.569, 'train_steps_per_second': 0.143, 'total_flos': 7242074108559360.0, 'train_loss': 3.044400676527523, 'epoch': 1.0})

In [None]:
# how to save model + tokenizer (TODO: add a function for adding elements to the tokenizer)
# retrieve model and tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from google.colab import drive

# Drive must be mounted before any path under `root` can be read.
drive.mount('/content/gdrive')
root = "/content/gdrive/MyDrive/Colab Notebooks/torch/"

# Tokenizer and model are restored from the same checkpoint directory.
checkpoint_dir = root+'models/llama32ft/checkpoint-401'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install datasets
from datasets import load_dataset

d = load_dataset('json', data_files={'train': root+"data/BBC-text/bbc-text-train.jsonl",
                                    'val' : root+"data/BBC-text/bbc-text-val.jsonl",
                                    'test': root+"data/BBC-text/bbc-text-test.jsonl"})



In [None]:
# Display the summary (features and row count) of the test split.
d['test']

Dataset({
    features: ['instruction', 'input', 'textual_label'],
    num_rows: 445
})

In [None]:
def ask(question, model, device):
    """Generate the model's continuation of `question` and return it as text.

    Args:
        question: Prompt string; it is tokenized with the module-level `tokenizer`.
        model: Causal language model exposing `generate`.
        device: Device the input tensors are moved to. It must be the device
            the model lives on.

    Returns:
        A `(response, answer)` tuple. `response` is the raw tensor returned by
        `model.generate` (prompt tokens included). `answer` is the decoded text
        of the newly generated tokens only.
    """
    tk = tokenizer(question, return_tensors='pt').to(device)

    # A GenerationConfig built from scratch carries no eos_token_id, so
    # generation would never stop early and always emit max_new_tokens tokens.
    # Reuse the model's own stop tokens, falling back to the tokenizer's.
    model_gen_config = getattr(model, 'generation_config', None)
    eos_token_id = getattr(model_gen_config, 'eos_token_id', None)
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id

    # Set the padding id explicitly to avoid the per-call
    # "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = eos_token_id[0] if isinstance(eos_token_id, (list, tuple)) else eos_token_id

    # Greedy decoding: deterministic, and avoids dividing the logits by a
    # near-zero temperature.
    gen_config = GenerationConfig(
        do_sample=False,
        max_new_tokens=256,
        eos_token_id=eos_token_id,
        pad_token_id=pad_token_id)

    response = model.generate(
        input_ids=tk['input_ids'],
        attention_mask=tk['attention_mask'],
        generation_config=gen_config)

    # Drop the prompt tokens so only the generated continuation is decoded.
    prompt_length = tk['input_ids'].shape[1]
    answer = tokenizer.batch_decode(response[:, prompt_length:], skip_special_tokens=True)[0]

    return response, answer

In [None]:
from tqdm import tqdm

device = 'cpu'
# Keep the model on the same device the inputs are sent to.
model.to(device)

# Collected (generated answer, expected answer) pairs, one per test example.
results = []

# Iterating a Dataset yields one dict per row. Tuple-unpacking that dict would
# bind the column *names*, and `input` would be the Python builtin, so read
# the fields by key instead.
for example in tqdm(d['test'], desc='test set'):
    messages = [
        {'role': 'system', 'content': example['instruction']},
        {'role': 'user', 'content': example['input']}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    response, answer = ask(prompt, model, device)
    results.append((answer, example['textual_label']))

test set:   0%|          | 0/445 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   0%|          | 1/445 [02:17<17:00:38, 137.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   0%|          | 2/445 [04:41<17:24:46, 141.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   1%|          | 3/445 [06:55<16:56:27, 137.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   1%|          | 4/445 [09:05<16:31:01, 134.83s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   1%|          | 5/445 [11:19<16:26:03, 134.46s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   1%|▏         | 6/445 [13:29<16:12:41, 132.94s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
test set:   2%|▏         | 7/445 [15:38<16:01:40, 131.74s/it]Setting `pad_token_id` to `