In [1]:
# Example: fine-tuning an LLM for sentiment classification
#!pip install --upgrade transformers torch

In [None]:
!pip install --upgrade bitsandbytes accelerate peft
!pip install evaluate==0.4.3

In [3]:
import os
import timeit  # used to time training and prediction

import evaluate
import numpy as np
import pandas as pd
import wandb
from datasets import Dataset, DatasetDict
from peft import LoraConfig
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import OPTForCausalLM
from transformers import Trainer
from transformers import TrainingArguments

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the module-level `tokenizer`. Every text is truncated and padded to
    a fixed length of 128 tokens (`padding='max_length'`, `max_length=128`).

    Args:
        examples: mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(
        batch_texts,
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded


In [4]:
# Experiment selection: which dataset to use and which row of the folds table
# (train/val/test index lists) to run.
dataset = 'rotten_tomatoes_2024'
index_fold = 4

# Reviews with their labels, plus the precomputed fold indices for the same dataset.
df_dados = pd.read_parquet(f'/kaggle/input/datasets-sentiment/{dataset}.parquet')
ids = pd.read_parquet(f'/kaggle/input/datasets-sentiment/{dataset}_folds.parquet')

# Number of distinct labels; later passed to the model as `num_labels`.
numero_rotulos = len(pd.unique(df_dados['label']))

# Display the loaded frame as the cell output.
df_dados

Unnamed: 0,id_movie,text,label,date,idx
0,small_town_wisconsin,Accurate right down to the pickles on the coun...,1,"Nov 26, 2024",0
1,small_town_wisconsin,"It's not breaking any genra barriers, nor is i...",1,"Apr 1, 2024",1
2,dragon_fury,What was that?\nI think my shoe can act with m...,0,"Jan 3, 2024",2
3,the_advent_calendar,"Great horror movie, definitely must see!",1,"Feb 26, 2024",3
4,west_side_story_2021,I haven't seen the original classic but I'm pr...,0,"Dec 5, 2024",4
...,...,...,...,...,...
7943,catherine_called_birdy,I was nervous about this because I loved the b...,1,"Apr 25, 2024",7943
7944,catherine_called_birdy,"I'm getting exhausted by these smug ""history b...",0,"Feb 26, 2024",7944
7945,catherine_called_birdy,Brilliant. Excellent dialogue. Engaging. Perfo...,1,"Jan 17, 2024",7945
7946,out_of_death,I cannot believe Bruce Willis starred in a mov...,0,"Jun 17, 2024",7946


In [None]:

# Checkpoint to fine-tune; the same identifier is used for the model and its tokenizer.
name_model = "meta-llama/Meta-Llama-3.1-8B"

# Hugging Face Hub access token, read from the environment so it is never
# hardcoded in the notebook. When HF_TOKEN is unset this is None and
# `from_pretrained` falls back to any locally cached login.
your_token = os.environ.get("HF_TOKEN")

# Sequence-classification head with one output per label, loaded in 4-bit.
# The quantization request goes through `quantization_config`; the bare
# `load_in_4bit=True` keyword on `from_pretrained` is deprecated.
model = AutoModelForSequenceClassification.from_pretrained(
    name_model,
    token=your_token,
    num_labels=numero_rotulos,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Tokenizers are not quantized, so no `load_in_4bit` here.
tokenizer = AutoTokenizer.from_pretrained(name_model, token=your_token)


# Llama 3: reuse the end-of-turn token as both the padding and the unknown
# token, and tell the model which id is padding. `tokenize_function` pads to a
# fixed length, so the tokenizer needs a pad token.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
eot = "<|eot_id|>"
eot_id = tokenizer.convert_tokens_to_ids(eot)
tokenizer.pad_token = eot
tokenizer.pad_token_id = eot_id
tokenizer.unk_token = eot
tokenizer.unk_token_id = eot_id
model.config.pad_token_id = eot_id

In [None]:
# Hugging Face format: build one `Dataset` per split of the selected fold.


def _split_columns(split_col):
    """Return the text/label columns of `df_dados` for one split of the current fold.

    Args:
        split_col: column of `ids` holding the row positions of the split
            ("train_idxs", "val_idxs" or "test_idxs").

    Returns:
        dict with "text" and "label" lists, in the order given by the positions.
    """
    # Row `index_fold` of the folds table holds the positional indices of the split.
    positions = ids[split_col].iloc[index_fold]
    rows = df_dados.iloc[positions]
    return {"text": rows["text"].tolist(), "label": rows["label"].tolist()}


train_data = _split_columns("train_idxs")
val_data = _split_columns("val_idxs")
test_data = _split_columns("test_idxs")

dataset_dict = DatasetDict({
    "train": Dataset.from_dict(train_data),
    "val": Dataset.from_dict(val_data),
    "test": Dataset.from_dict(test_data),
})

# Print the DatasetDict
print(dataset_dict)


# Tokenize every split; with `batched=True` the function receives batches of examples.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
# For a quick smoke test, train on a subset instead:
#   tokenized_datasets["train"].shuffle(seed=42).select(range(100))

# F1 is the metric reported during evaluation (see `compute_metrics`).
metric = evaluate.load("f1")

# Single TrainingArguments definition (an earlier bare one was immediately
# overwritten and has been dropped).
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_dir='./logs',
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword.
    eval_strategy="epoch",
    logging_steps=100,
    run_name=f"llama31_{dataset}_{index_fold}",
)



# LoRA hyperparameters for the sequence-classification fine-tune.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Attach the LoRA adapter to the loaded model under the name "adapter_1".
# NOTE(review): confirm that the classification head is trainable (and saved)
# when the adapter is attached this way rather than through `get_peft_model`.
model.add_adapter(peft_config, adapter_name="adapter_1")

def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level `metric`.

    Args:
        eval_pred: pair `(logits, labels)`.

    Returns:
        The result of `metric.compute` on the arg-max class of each row of
        logits against the labels, with macro averaging.
    """
    scores, gold_labels = eval_pred
    # Predicted class = index of the highest logit along the last axis.
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
        average='macro',
    )

# Train on the "train" split; the "val" split is scored with `compute_metrics`
# at the cadence configured in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    compute_metrics=compute_metrics,
)


# Weights & Biases is switched off for this run.
wandb.init(mode="disabled")

# Wall-clock training time in seconds, reported later with the results.
ini = timeit.default_timer()
trainer.train()
tempo_treino = timeit.default_timer() - ini
print(tempo_treino)

In [None]:
# Run inference on the test split of the current fold and time it.
# Only the `trainer.predict` call sits between the two timer reads, so
# `tempo_pred` is the wall-clock prediction time in seconds.
ini = timeit.default_timer()
# `predictions` is consumed by the results cell via `.predictions` and `.label_ids`.
predictions = trainer.predict(tokenized_datasets["test"])
tempo_pred= timeit.default_timer() - ini
print(tempo_pred)

In [None]:
# Predicted class per test example = index of the highest logit.
preds = np.argmax(predictions.predictions, axis=-1)

accuracy = evaluate.load('accuracy').compute(
    predictions=preds, references=predictions.label_ids
)['accuracy']
f1_macro = evaluate.load('f1').compute(
    predictions=preds, references=predictions.label_ids, average='macro'
)['f1']

# One tab-separated result row: dataset, fold, accuracy, macro-F1, training
# time, prediction time, predicted labels.
# The fields are joined explicitly so that no source indentation leaks into
# the row, and `tolist()` yields plain Python ints so the labels do not print
# as `np.int64(...)` under NumPy 2.
fields = [
    dataset,
    index_fold,
    accuracy,
    f1_macro,
    tempo_treino,
    tempo_pred,
    preds.tolist(),
]
print("\t".join(str(field) for field in fields))





In [None]:
# Save the trained model into a directory named after the dataset and fold.
model_output_dir = f'model_{dataset}_{index_fold}'
trainer.save_model(model_output_dir)