In [86]:
# pip install llama-recipes ipywidgets
# pip install -U transformers trl accelerate

# import huggingface_hub
# huggingface_hub.login()

In [87]:
# SITES

# https://huggingface.co/blog/llama3#fine-tuning-with-%F0%9F%A4%97-trl
# https://huggingface.co/meta-llama/Llama-3.1-8B
# https://huggingface.co/meta-llama/Llama-3.2-1B
# https://huggingface.co/settings/gated-repos
# https://www.llama.com/docs/how-to-guides/fine-tuning
# https://www.llama.com/docs/overview
# https://github.com/meta-llama/llama-recipes/blob/main/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
# https://huggingface.co/blog/stackllama#stackllama-a-hands-on-guide-to-train-llama-with-rlhf
# https://huggingface.co/docs/transformers/peft

In [88]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [89]:

import torch
from datasets import load_dataset
from transformers import LlamaForCausalLM, AutoTokenizer, LlamaTokenizer, Trainer, TrainingArguments
from llama_recipes.configs import train_config as TRAIN_CONFIG

train_config = TRAIN_CONFIG()
train_config.model_name = "meta-llama/Llama-3.2-1B"
train_config.num_epochs = 3
train_config.run_validation = False
train_config.gradient_accumulation_steps = 4
train_config.batch_size_training = 1
train_config.lr = 3e-4
train_config.use_fast_kernels = True
train_config.use_fp16 = True
train_config.context_length = 1024 if torch.cuda.get_device_properties(0).total_memory < 16e9 else 2048 # T4 16GB or A10 24GB
train_config.batching_strategy = "packing"
train_config.output_dir = "meta-llama-samsum"
train_config.use_peft = True
# token: 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

In [90]:
from transformers import BitsAndBytesConfig
dataset = load_dataset('csv', data_files='../data/csv/processed_toxic_classification_dataset_full.csv')
config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [91]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Toxic'],
        num_rows: 31909
    })
})

In [92]:
peft_model_id = "ybelkada/opt-350m-lora"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = LlamaForCausalLM.from_pretrained(
            train_config.model_name,
            device_map="auto",
            quantization_config=config,
            use_cache=False,
            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
            torch_dtype=torch.float16,
    )
model.load_adapter(peft_model_id)

Loading adapter weights from ybelkada/opt-350m-lora led to unexpected keys not found in the model:  ['model.decoder.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.1.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.1.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.1.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.1.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.10.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.10.self_attn.q_proj.lora_B.default.weight', 'model.decoder.layers.10.self_attn.v_proj.lora_A.default.weight', 'model.decoder.layers.10.self_attn.v_proj.lora_B.default.weight', 'model.decoder.layers.11.self_attn.q_proj.lora_A.default.weight', 'model.decoder.layers.11.self_attn.q_proj.lora_B.

In [93]:
tokenizer.pad_token = tokenizer.eos_token

In [94]:
# Tokenize the dataset
def preprocess_function(data):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    
    inputs = data['Text']
    targets = [str(x) if x is not None else '' for x in data['Toxic']]
    model_inputs = tokenizer(inputs, truncation=True, max_length=512)

    # Add labels for the target text
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, truncation=True, max_length=512)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [95]:
dataset = dataset.map(preprocess_function, batched=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Toxic', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 31909
    })
})

In [96]:
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [97]:
for i in range(10):
    print(len(train_dataset[i]['Text']))

716
124
355
156
18
161
255
81
161
2505


In [98]:
eval_prompt = """
Look at this statement - 'You are such an interesting person, but, your ideologies are a disgrace to the human race. Get out of my face scrub'
Is this toxic or non-toxic?
"""

In [99]:
# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# model.eval()
# with torch.inference_mode():
#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

In [100]:
training_args = TrainingArguments(
    output_dir='../data/models/Llama',
    evaluation_strategy='steps',
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    logging_dir='./logs',
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
)



In [101]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [102]:
trainer.evaluate()

ValueError: expected sequence of length 38 at dim 1 (got 42)

In [None]:
# Save the model
model.save_pretrained('./fine_tuned_llama')
tokenizer.save_pretrained('./fine_tuned_llama')

In [None]:
from transformers import pipeline

model = LlamaForCausalLM.from_pretrained('./fine_tuned_llama')
tokenizer = LlamaTokenizer.from_pretrained('./fine_tuned_llama')

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

output = generator("Your prompt here", max_length=100)
print(output)