<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/FTDEMO_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env -q
!pip install datasets -q
!pip install transformers -q
!pip install evaluate -q
!pip install bitsandbytes -q
!pip install accelerate -q

In [1]:
from datasets import load_from_disk
import colab_env

# Prepare the dataset: read the pre-built dataset stored on Google Drive.
# NOTE(review): the path is under /content/gdrive, so Drive has to be mounted
# first; presumably `import colab_env` above does that — confirm.
DATASET_PATH = "/content/gdrive/MyDrive/datasets/flight_dataset_tpu"
dataset = load_from_disk(DATASET_PATH)

# Display the dataset summary (features and row count) as the cell output.
dataset

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Dataset({
    features: ['input', 'label'],
    num_rows: 1127
})

In [None]:
import inspect

import bitsandbytes as bnb
import evaluate # Import the evaluate library
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training # Import PEFT modules
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)



# Label mapping (string to integer class id).
# Defined before the model is loaded so the classification head can be sized from it.
label_mapping = {
    "short": 0,
    "medium": 1,
    "long": 2
}

# Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use EOS as the pad token so the tokenizer can pad with padding="max_length".
tokenizer.pad_token = tokenizer.eos_token

# Quantization Config (4-bit NF4 with double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# The dataset carries ONE integer class label per example and the metric is
# argmax accuracy, so the model needs a sequence-classification head.
# A causal-LM head expects per-token labels and fails on scalar class ids.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label={class_id: name for name, class_id in label_mapping.items()},
    label2id=label_mapping,
    quantization_config=bnb_config, # Pass the quantization config here
    device_map="auto",
)

# The classification head locates the last non-padding token through
# config.pad_token_id; keep it in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare the quantized model for training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# PEFT Configuration (LoRA)
peft_config = LoraConfig(
    r=8,  # Rank of the LoRA update matrices
    lora_alpha=32,  # Scaling factor for the LoRA updates
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA to
    lora_dropout=0.05,  # Dropout rate for the LoRA layers
    bias="none",  # Bias type for the LoRA layers
    task_type="SEQ_CLS",  # Sequence classification, matching the model head above
)

# Apply PEFT (LoRA) to the model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Print trainable parameters after applying LoRA

# Tokenize and format the data
def tokenize_function(examples):
    """Tokenize a batch of inputs and attach integer class labels.

    Args:
        examples: Batch mapping with an "input" column (handed to the
            tokenizer) and a "label" column whose values are keys of
            ``label_mapping``.

    Returns:
        The tokenizer output with an extra "labels" entry holding the
        integer class ids.

    Raises:
        KeyError: If a label is not a key of ``label_mapping``.
    """
    # Translate each string label into its integer class id.
    class_ids = []
    for raw_label in examples["label"]:
        class_ids.append(label_mapping[raw_label])
    examples["labels"] = class_ids

    # Pad/truncate every sequence to exactly 128 tokens for uniform length.
    encoded = tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Carry the class ids alongside the token ids.
    encoded["labels"] = examples["labels"]
    return encoded


# Tokenize in batches, then drop the raw text/label columns that are no longer needed.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["input", "label"])
)

# Return the listed columns (labels included) as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format(type="torch", columns=tensor_columns)


# Split the dataset: 80% train, then the remaining 20% is halved into eval and test.
train_testvalid = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_testvalid["train"]
testvalid_dataset = train_testvalid["test"]

test_valid = testvalid_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = test_valid["test"]
test_dataset = test_valid["train"]

# Take shuffled subsets of at most 800 / 113 rows. The requested size is capped
# by the real split size so a smaller dataset cannot cause an out-of-range select.
n_train = min(800, len(train_dataset))
n_eval = min(113, len(eval_dataset))
small_train_dataset = train_dataset.shuffle(seed=42).select(range(n_train))
small_eval_dataset = eval_dataset.shuffle(seed=42).select(range(n_eval))

# Define the metric: accuracy, loaded through the `evaluate` library.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the argmax predictions
        measured against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted = scores.argmax(-1)
    return metric.compute(references=references, predictions=predicted)



# Training arguments
# The evaluation-schedule keyword is `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Package versions are not
# pinned, so use whichever name the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    load_best_model_at_end=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to='none',
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


# Train the model
trainer.train()

In [3]:
# Display the training subset's summary (features and row count) as the cell output.
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 800
})