In [None]:
import os

import torch
import wandb
from datasets import load_from_disk
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, DefaultDataCollator
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Optional environment setup: uncomment to upgrade the quantization / fine-tuning stack.
# NOTE(review): the versions are unpinned (-U), so results may differ between sessions;
# consider pinning them and using `%pip` so the install targets the kernel's environment.
#!pip install -U bitsandbytes accelerate transformers peft

In [None]:
from kaggle_secrets import UserSecretsClient

# Read both credentials from the Kaggle secret store.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
WB_KEY = user_secrets.get_secret("WB_KEY")

# Validate every credential BEFORE contacting any service, so a missing or empty
# secret fails fast instead of leaving an orphaned W&B run behind.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN is not set")
if not WB_KEY:
    raise ValueError("WB_KEY is not set")

# Authenticate against the Hugging Face Hub, then against Weights & Biases.
login(token=HF_TOKEN)
wandb.login(key=WB_KEY)

# Start the W&B run that the Trainer reports to (report_to=["wandb"]).
run = wandb.init(
    project="Digital Self-Replica",
    job_type="Training",
    name="Final train with rank=64 and alpha=32 pt2",
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfrancescobrigante[0m ([33mfrancescobrigante_s_projects[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Setting parameters

In [None]:
# Quantization settings: the base model weights are loaded in 4-bit precision.
# (8-bit loading is the higher-precision alternative, at the cost of more memory.)
quantization_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 data type for the stored weights
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.float16,   # dtype used for the actual matmuls
)
quantization_config = BitsAndBytesConfig(**quantization_settings)

# LoRA adapter configuration for the Qwen-style decoder architecture.

# Projection layers inside each self-attention block: query, key, value, output.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
# Projection layers of the feed-forward network in each block: gate, up, down.
feed_forward_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Rank of the added low-rank matrices.
    r=64,
    # Scaling numerator. NOTE: alpha is r/2 here (not the common 2*r heuristic);
    # with standard LoRA scaling (lora_alpha / r) the adapter update is scaled by 0.5.
    lora_alpha=32,
    # Modules where LoRA adapters are injected.
    target_modules=[*attention_projections, *feed_forward_projections],
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# Load the base model quantized (see quantization_config) and let the library
# decide the device placement of the layers.
model_load_options = dict(
    device_map="auto",
    quantization_config=quantization_config,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

In [None]:
# Ready the quantized base model for k-bit training, then wrap it with the LoRA adapters.
# NOTE: this rebinds `model` to the wrapped model, so the cell is not idempotent;
# re-run the model-loading cell before running this one a second time.
model = get_peft_model(
    prepare_model_for_kbit_training(model),
    lora_config,
)


# Training arguments, grouped by concern and merged into one TrainingArguments object.

# Epochs and batch sizes.
batching_args = dict(
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,      # effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    per_device_eval_batch_size=6,
    eval_accumulation_steps=4,
)

# Optimizer, learning-rate schedule and gradient clipping.
optimizer_args = dict(
    optim="paged_adamw_8bit",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1,
)

# Precision / memory savers.
memory_args = dict(
    fp16=True,
    gradient_checkpointing=True,
)

# Evaluation and checkpointing run on the same 100-step cadence so the best
# checkpoint (lowest eval loss) can be restored when training ends.
evaluation_args = dict(
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,            # lower loss is better
    label_names=["labels"],
)

# Progress bars and experiment tracking.
logging_args = dict(
    logging_steps=25,
    disable_tqdm=False,
    report_to=["wandb"],                # W&B logging
)

training_args = TrainingArguments(
    output_dir="./francesco_lora",
    **batching_args,
    **optimizer_args,
    **memory_args,
    **evaluation_args,
    **logging_args,
)

In [None]:
# Tokenizer that matches the base checkpoint.
# NOTE(review): it is not referenced by any other cell visible in this notebook.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pre-tokenized train / validation splits stored on disk (Kaggle input dataset).
tokenized_train, tokenized_val = (
    load_from_disk(split_path)
    for split_path in (
        '/kaggle/input/tok-datasets/datasets/tokenized_train',
        '/kaggle/input/tok-datasets/datasets/tokenized_val',
    )
)

# Plain batching collator.
# NOTE(review): presumably the stored examples are already padded to equal length - confirm.
data_collator = DefaultDataCollator()

### Actual training

In [None]:
# Show how many parameters are trainable after the LoRA wrapping.
model.print_trainable_parameters()

# Early stopping: patience of 3 evaluations with an improvement threshold of 0.01
# on the monitored metric.
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.01,
    early_stopping_patience=3,
)

# Assemble the trainer from the model, the hyperparameters and the tokenized splits.
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_components)

# Register early stopping after construction.
trainer.add_callback(early_stopping)


trainable params: 161,480,704 || all params: 7,777,097,216 || trainable%: 2.0764


In [None]:
# Single, re-runnable training cell.
#
# Previously the notebook had one cell that trained from scratch followed by a
# second cell that resumed from the hard-coded "./francesco_lora/checkpoint-600".
# On "Restart & Run All" that trained twice, and it failed outright whenever
# checkpoint-600 did not exist (e.g. early stopping fired before step 600).
#
# Now the newest checkpoint found in the output directory is used when one
# exists (continuing an interrupted session); otherwise training starts fresh.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

if last_checkpoint is None:
    print(f"No checkpoint found in {output_dir!r}; training from scratch.")
else:
    print(f"Resuming training from {last_checkpoint}.")

# resume_from_checkpoint=None is the default and means "start from the beginning".
trainer.train(resume_from_checkpoint=last_checkpoint)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
700,2.5455,3.456467
800,2.4091,3.417243
900,2.2382,3.496536


TrainOutput(global_step=900, training_loss=0.8073342344495985, metrics={'train_runtime': 19306.2088, 'train_samples_per_second': 1.997, 'train_steps_per_second': 0.062, 'total_flos': 3.199471712128205e+17, 'train_loss': 0.8073342344495985, 'epoch': 2.241443683883012})