In [1]:
# pip install llama-recipes ipywidgets
# pip install -U transformers trl accelerate

# import huggingface_hub
# huggingface_hub.login()

In [2]:
# SITES

# https://huggingface.co/blog/llama3#fine-tuning-with-%F0%9F%A4%97-trl
# https://huggingface.co/meta-llama/Llama-3.1-8B
# https://huggingface.co/meta-llama/Llama-3.2-1B
# https://huggingface.co/settings/gated-repos
# https://www.llama.com/docs/how-to-guides/fine-tuning
# https://www.llama.com/docs/overview
# https://github.com/meta-llama/llama-recipes/blob/main/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
# https://huggingface.co/blog/stackllama#stackllama-a-hands-on-guide-to-train-llama-with-rlhf
# https://huggingface.co/docs/transformers/peft

In [1]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:

import torch
import pandas as pd
import evaluate
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from transformers import LlamaForCausalLM, AutoTokenizer, LlamaTokenizer, Trainer, TrainingArguments
from llama_recipes.configs import train_config as TRAIN_CONFIG

train_config = TRAIN_CONFIG()
train_config.model_name = "meta-llama/Llama-3.2-1B"
train_config.num_epochs = 3
train_config.run_validation = False
train_config.gradient_accumulation_steps = 4
train_config.batch_size_training = 1
train_config.lr = 3e-4
train_config.use_fast_kernels = True
train_config.use_fp16 = True
train_config.context_length = 1024 if torch.cuda.get_device_properties(0).total_memory < 16e9 else 2048 # T4 16GB or A10 24GB
train_config.batching_strategy = "packing"
train_config.output_dir = "meta-llama-samsum"
train_config.use_peft = True
# token: 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

In [3]:
# metric = evaluate.load("accuracy")

In [4]:
from transformers import BitsAndBytesConfig
dataset = load_dataset('csv', data_files='../data/csv/processed_toxic_classification_dataset_full.csv')
config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Toxic'],
        num_rows: 31909
    })
})

In [6]:
peft_model_id = "ybelkada/opt-350m-lora"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = LlamaForCausalLM.from_pretrained(
            train_config.model_name,
            device_map="auto",
            quantization_config=config,
            use_cache=False,
            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
            torch_dtype=torch.float16,
    )

model.load_adapter(peft_model_id)

Loading adapter weights from ybelkada/opt-350m-lora led to unexpected keys not found in the model:  ['model.decoder.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.1.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.1.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.1.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.1.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.10.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.10.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.10.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.10.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.11.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.11.self_attn.q_proj.lora_B.

In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Tokenize the dataset
def preprocess_function(data):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    tokenizer.pad_token = tokenizer.eos_token
    result = tokenizer(str(data["Text"]),truncation=True,   
                       max_length=512, return_overflowing_tokens=True)

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in data.items():
        result[key] = [values[i] for i in sample_map]
    return result

def pad_attention_mask(dataset):
    for i in range(len(dataset)):
        attention_mask = dataset[i]['attention_mask']
        if len(attention_mask) < 512:
            # Pad with zeros to reach length 512
            padding_length = 512 - len(attention_mask)
            dataset[i]['attention_mask'] = attention_mask + [0] * padding_length
            # Also pad input_ids accordingly
            dataset[i]['input_ids'] = dataset[i]['input_ids'] + [0] * padding_length
    return dataset

def process_data(dataset):
    processed_data = []
    for item in dataset:
        # Assuming your dataset items are dictionaries like {'train': {'attention_mask': [...], ...}}
        flat_item = item['train']
        processed_data.append(flat_item)
    return processed_data

In [9]:
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.remove_columns(["Text"])
dataset = dataset.rename_column("Toxic", "labels")
dataset

Map:   0%|          | 0/31909 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2944
    })
})

In [10]:
pandas_data = pd.DataFrame(dataset)
pandas_data

Unnamed: 0,train
0,"{'labels': 0, 'input_ids': [128000, 681, 38, 2..."
1,"{'labels': 0, 'input_ids': [128000, 19288, 163..."
2,"{'labels': 0, 'input_ids': [128000, 49374, 261..."
3,"{'labels': 0, 'input_ids': [128000, 3549, 659,..."
4,"{'labels': 0, 'input_ids': [128000, 26291, 518..."
...,...
2939,"{'labels': 0, 'input_ids': [128000, 20142, 220..."
2940,"{'labels': 0, 'input_ids': [128000, 44840, 121..."
2941,"{'labels': 0, 'input_ids': [128000, 21051, 143..."
2942,"{'labels': 0, 'input_ids': [128000, 65302, 145..."


In [11]:
train_dataset, val_dataset = train_test_split(pandas_data, test_size=0.2, random_state=42)
train_dataset

Unnamed: 0,train
685,"{'labels': 0, 'input_ids': [128000, 3878, 1690..."
2664,"{'labels': 0, 'input_ids': [128000, 2262, 5321..."
2796,"{'labels': 1, 'input_ids': [128000, 13354, 111..."
456,"{'labels': 0, 'input_ids': [128000, 3621, 2800..."
462,"{'labels': 0, 'input_ids': [128000, 62584, 144..."
...,...
1638,"{'labels': 0, 'input_ids': [128000, 7489, 518,..."
1095,"{'labels': 0, 'input_ids': [128000, 36017, 358..."
1130,"{'labels': 0, 'input_ids': [128000, 3783, 2199..."
1294,"{'labels': 0, 'input_ids': [128000, 5663, 7652..."


In [12]:
train_dataset = Dataset.from_pandas(pd.DataFrame(train_dataset))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_dataset))

In [13]:
train_dataset

Dataset({
    features: ['train', '__index_level_0__'],
    num_rows: 2355
})

In [14]:
train_dataset = process_data(train_dataset.remove_columns(["__index_level_0__"]))
val_dataset = process_data(val_dataset.remove_columns(["__index_level_0__"]))

In [27]:
print(train_dataset[0]['input_ids'].shape)
print(train_dataset[0]['labels'].shape)

AttributeError: 'list' object has no attribute 'shape'

In [16]:
# for i in range(10):
#     print(str(len(train_dataset[i]['input_ids'])) + " and " + str(len(train_dataset[i]['labels'])))

In [17]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'labels': torch.stack([torch.tensor(item['labels']) for item in batch]) if 'labels' in batch[0] else None
    }

In [18]:
eval_prompt = """
Look at this statement - 'You are such an interesting person, but, your ideologies are a disgrace to the human race. Get out of my face scrub'
Is this toxic or non-toxic?
"""

In [19]:
# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# model.eval()
# with torch.inference_mode():
#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

In [24]:
training_args = TrainingArguments(
    output_dir='../data/models/Llama',
    evaluation_strategy='steps',
    logging_dir='./logs',
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn
)

In [26]:
trainer.evaluate()

ValueError: Expected input batch_size (4088) to match target batch_size (7).

In [None]:
# Save the model
# model.save_pretrained('./fine_tuned_llama')
# tokenizer.save_pretrained('./fine_tuned_llama')

In [None]:
# from transformers import pipeline

# model = LlamaForCausalLM.from_pretrained('./fine_tuned_llama')
# tokenizer = LlamaTokenizer.from_pretrained('./fine_tuned_llama')

# generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# output = generator("Your prompt here", max_length=100)
# print(output)

In [None]:
# train_dataloader = get_dataloader(tokenizer, '../data/csv/processed_toxic_classification_dataset_short.csv', train_config)
# eval_dataloader = get_dataloader(tokenizer, '../data/csv/processed_toxic_classification_dataset_short.csv', train_config, "val")