In [None]:
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer
from huggingface_hub import login
from dotenv import load_dotenv
import os
import logging

In [None]:
pip install -U bitsandbytes

In [None]:
# Authenticate against the Hugging Face Hub with a token taken from the environment.
# load_dotenv() is called first so a token kept in a local .env file is visible to os.getenv.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# Truthiness check rejects both a missing variable (None) and an empty value ("")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN is not set")
login(token=HF_TOKEN)

In [None]:
# Quantization settings: weights are loaded in 4-bit NF4 with double quantization,
# and float16 is used as the compute dtype.
# (Loading in 8-bit instead would be a higher-precision alternative.)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA adapter settings for the Qwen model architecture
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                 # rank of the added low-rank matrices
    lora_alpha=32,        # scaling factor, generally 2 * r
    lora_dropout=0.05,
    bias="none",
    target_modules=[      # modules that receive LoRA adapters
        # self-attention: query, key, value and output projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # feed-forward network: gate, up and down projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
# Base checkpoint to fine-tune. It is loaded with the 4-bit quantization settings defined
# earlier, and device_map="auto" leaves weight placement to the library.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

In [None]:
# Single location for checkpoints and the training log
OUTPUT_DIR = "./francesco_lora"

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
# NOTE: both calls rebind `model`, so this cell is not idempotent — reload the base model
# before re-running it, otherwise an already-wrapped model gets wrapped again.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Logging during training: write to a file inside the output directory and to the notebook
LOG_FILE_PATH = os.path.join(OUTPUT_DIR, "training_log.txt")

# Ensure the directory exists before the FileHandler tries to open the log file
os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE_PATH),
        logging.StreamHandler()
    ],
    # basicConfig does nothing when the root logger already has handlers (some notebook
    # environments may pre-install them); force=True replaces them so this configuration applies.
    force=True,
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=12,
    gradient_accumulation_steps=4,      # effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps (48)
    warmup_ratio=0.03,
    learning_rate=2e-4,                 # slightly lower for distilled model
    optim="paged_adamw_8bit",           # paged 8-bit AdamW optimizer
    lr_scheduler_type="cosine",         # cosine learning rate schedule
    fp16=True,
    logging_steps=50,
    eval_steps=50,
    # load_best_model_at_end=True requires save_steps to be a round multiple of eval_steps;
    # TrainingArguments raises ValueError otherwise, so checkpoints are saved at every evaluation.
    save_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="loss",
    greater_is_better=False,            # lower loss is better
    # NOTE(review): prepare_model_for_kbit_training enables gradient checkpointing on the
    # model by default — confirm that combining it with this flag set to False is intended.
    gradient_checkpointing=False,
    disable_tqdm=False,
    report_to=["none"],
    label_names=["labels"]              # explicitly specify the label field
)

In [None]:
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling, DefaultDataCollator

# Collator used for training: DefaultDataCollator only batches the stored features and does
# no padding — presumably the tokenized datasets are already padded to a fixed length
# (TODO confirm).
data_collator = DefaultDataCollator()

# Tokenizer of the base checkpoint; it is only needed by the alternative collators below.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Alternative 1: dynamic padding for causal LM
# data_collator = DataCollatorForLanguageModeling(
#    tokenizer=tokenizer,
#    mlm=False
# )

# Alternative 2: more general-purpose dynamic padding.
# It doesn't handle shifting labels: that has to be implemented manually.
# data_collator = DataCollatorWithPadding(
#     tokenizer=tokenizer,
#     padding=True,            # pad to longest in batch
#     return_tensors="pt",
# )


In [None]:
# Load the three pre-tokenized splits from the local `datasets` folder (train, val, test order)
tokenized_train, tokenized_val, tokenized_test = (
    load_from_disk(split_dir)
    for split_dir in (
        'datasets/tokenized_train',
        'datasets/tokenized_val',
        'datasets/tokenized_test',
    )
)

# Report the size of each split
print(
    f"Training examples: {len(tokenized_train)}",
    f"Validation examples: {len(tokenized_val)}",
    f"Test examples: {len(tokenized_test)}",
    sep="\n",
)

# Show a single record as a sanity check (index 1000 must exist in the training split)
print("\nOne training example:")
print(tokenized_train[1000])

In [None]:
# Colab only: load the tokenized splits from Google Drive.
# google.colab can only be imported inside a Colab runtime, so the import is guarded:
# anywhere else this cell is skipped instead of aborting "Run All" with ModuleNotFoundError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is None:
    print("google.colab is not available - skipping the Google Drive dataset load.")
else:
    drive.mount('/content/drive')
    drive_base_path = '/content/drive/My Drive/datasets'

    # These assignments replace any splits loaded earlier under the same names
    tokenized_train = load_from_disk(os.path.join(drive_base_path, 'tokenized_train'))
    tokenized_val = load_from_disk(os.path.join(drive_base_path, 'tokenized_val'))
    tokenized_test = load_from_disk(os.path.join(drive_base_path, 'tokenized_test'))

    print("Datasets loaded successfully from Google Drive!")
    print(f"Training examples: {len(tokenized_train)}")
    print(f"Validation examples: {len(tokenized_val)}")
    print(f"Test examples: {len(tokenized_test)}")

    # Show a single record as a sanity check (index 8000 must exist in the training split)
    print("\nOne training example:")
    print(tokenized_train[8000])

In [None]:
# Report how many parameters remain trainable after the LoRA wrapping
model.print_trainable_parameters()

# Early stopping: patience of 3 evaluations with an improvement threshold of 0.01
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.01,
    early_stopping_patience=3,
)

# Assemble the trainer from the model, arguments, datasets and collator defined earlier,
# then register the early-stopping callback on it
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
)
trainer.add_callback(early_stopping)

In [None]:
# Start fine-tuning with the configured trainer
trainer.train()