In [1]:
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, DefaultDataCollator
from huggingface_hub import login
from dotenv import load_dotenv
import os
import wandb

In [2]:
WB_KEY = os.getenv("WB_KEY")
wandb.login(key=WB_KEY)
run = wandb.init(project="Digital Self-Replica", job_type="Training", name="Train with rank=16")

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Francesco\_netrc
wandb: Currently logged in as: francescobrigante (francescobrigante_s_projects) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [3]:
#pip install -U bitsandbytes

### Setting parameters

In [4]:
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if (HF_TOKEN == None):
    raise ValueError("HF_TOKEN is not set")
login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# 4 bit quantization
# could be further increased to 8b for more precision
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# LoRA configuration for Qwen model architecture
lora_config = LoraConfig(
    r=16,                       #rank of the added low-rank matrices
    lora_alpha=32,              #generally 2*r
    target_modules=[            #modules where LoRA is applied
        "q_proj",               # query, key, value, output projection layers in the self-attention
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",            # gate, up, down are part of the FFNN in the model
        "up_proj",
        "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [6]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# preparing model for LoRA
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)


# TRY  AND BATCH 8x8

# training arguments
training_args = TrainingArguments(
    output_dir="./francesco_lora",
    num_train_epochs=4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,      # effective batch size = per_device_train_batch_size * gradinet_accumulation_steps
    #per_device_eval_batch_size=4,
    #eval_accumulation_steps=6,
    #warmup_steps=5,
    warmup_ratio=0.1,
    learning_rate=2e-4,                # Slightly lower for distilled model
    optim="paged_adamw_8bit",         # 8bit optimizer <- ADDED
    lr_scheduler_type="cosine",       # cosine learning rate scheduler <- ADDED
    weight_decay=0.01,
    fp16=True,
    logging_steps=25,
    #eval_strategy="no",
    eval_strategy="steps",
    #per_device_eval_batch_size=16,
    #eval_accumulation_steps=2,
    eval_steps=100,
    save_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
    #save_total_limit=1,
    metric_for_best_model="loss",
    greater_is_better=False,            #lower loss is better
    gradient_checkpointing=False,
    max_grad_norm=0.5,
    disable_tqdm=False,
    report_to=["wandb"],                                # Enable W&B logging
    #report_to=["none"],                                # Enable W&B logging
    label_names=["labels"]
)

In [8]:
# training_args = TrainingArguments(
#     output_dir="./francesco_lora",
#     num_train_epochs=3,
#     per_device_train_batch_size=6,
#     gradient_accumulation_steps=4,      # effective batch size = per_device_train_batch_size * gradinet_accumulation_steps
#     #per_device_eval_batch_size=4,
#     #eval_accumulation_steps=6,
#     warmup_steps=5,
#     warmup_ratio=0.1,
#     learning_rate=2e-4,                # Slightly lower for distilled model
#     optim="paged_adamw_8bit",         # 8bit optimizer <- ADDED
#     lr_scheduler_type="cosine",       # cosine learning rate scheduler <- ADDED
#     weight_decay=0.01,
#     fp16=True,
#     logging_steps=5,
#     #eval_strategy="no",
#     eval_strategy="steps",
#     per_device_eval_batch_size=16,
#     eval_accumulation_steps=2,
#     eval_steps=5,
#     save_steps=50,
#     save_strategy="steps",
#     load_best_model_at_end=True,
#     #save_total_limit=1,
#     metric_for_best_model="loss",
#     greater_is_better=False,            #lower loss is better
#     gradient_checkpointing=False,
#     max_grad_norm=0.5,
#     disable_tqdm=False,
#     #report_to=["wandb"],                                # Enable W&B logging
#     report_to=["none"],                                # Enable W&B logging
#     label_names=["labels"]
# )

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

data_collator = DefaultDataCollator()

# loading datasets
tokenized_train = load_from_disk('datasets/tokenized_train')
tokenized_val = load_from_disk('datasets/tokenized_val')
#tokenized_test = load_from_disk('datasets/tokenized_test')

# print
print(f"Training examples: {len(tokenized_train)}")
print(f"Validation examples: {len(tokenized_val)}")
#print(f"Test examples: {len(tokenized_test)}")

print("\nOne training example:")
print(tokenized_train[1000])

Training examples: 8720
Validation examples: 1090

One training example:
{'input_ids': [151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 

In [10]:
#IF ON COLAB
# from google.colab import drive

# drive.mount('/content/drive')
# drive_base_path = '/content/drive/My Drive/datasets'

# tokenized_train = load_from_disk(os.path.join(drive_base_path, 'tokenized_train'))
# tokenized_val = load_from_disk(os.path.join(drive_base_path, 'tokenized_val'))
# tokenized_test = load_from_disk(os.path.join(drive_base_path, 'tokenized_test'))

# print("Datasets loaded successfully from Google Drive!")
# print(f"Training examples: {len(tokenized_train)}")
# print(f"Validation examples: {len(tokenized_val)}")
# print(f"Test examples: {len(tokenized_test)}")

# print("\nOne training example:")
# print(tokenized_train[8000])

### Actual training

In [11]:
# print trainable parameters
model.print_trainable_parameters()

# training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# add early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)
trainer.add_callback(early_stopping)

trainable params: 40,370,176 || all params: 7,655,986,688 || trainable%: 0.5273


In [12]:
trainer.train()



  0%|          | 0/2180 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 8.2295, 'grad_norm': 6.230851650238037, 'learning_rate': 2.2935779816513765e-05, 'epoch': 0.05}
{'loss': 5.0601, 'grad_norm': 2.8211240768432617, 'learning_rate': 4.587155963302753e-05, 'epoch': 0.09}
{'loss': 4.2819, 'grad_norm': 3.2490782737731934, 'learning_rate': 6.880733944954129e-05, 'epoch': 0.14}
{'loss': 4.0453, 'grad_norm': 4.860716819763184, 'learning_rate': 9.174311926605506e-05, 'epoch': 0.18}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.9521830081939697, 'eval_runtime': 576.1567, 'eval_samples_per_second': 1.892, 'eval_steps_per_second': 0.238, 'epoch': 0.18}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.9428, 'grad_norm': 5.354930400848389, 'learning_rate': 0.00011467889908256881, 'epoch': 0.23}
{'loss': 3.7331, 'grad_norm': 4.727965831756592, 'learning_rate': 0.00013761467889908258, 'epoch': 0.28}
{'loss': 3.7145, 'grad_norm': 5.566544532775879, 'learning_rate': 0.00016055045871559632, 'epoch': 0.32}
{'loss': 3.667, 'grad_norm': 5.684599876403809, 'learning_rate': 0.00018348623853211012, 'epoch': 0.37}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.629140615463257, 'eval_runtime': 569.0517, 'eval_samples_per_second': 1.915, 'eval_steps_per_second': 0.241, 'epoch': 0.37}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.7087, 'grad_norm': 6.002601146697998, 'learning_rate': 0.0001999937185012612, 'epoch': 0.41}
{'loss': 3.6008, 'grad_norm': 4.516737461090088, 'learning_rate': 0.00019986875683942535, 'epoch': 0.46}
{'loss': 3.6228, 'grad_norm': 5.109896183013916, 'learning_rate': 0.00019958378286369502, 'epoch': 0.5}
{'loss': 3.3953, 'grad_norm': 3.980117082595825, 'learning_rate': 0.00019913925316676945, 'epoch': 0.55}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.511225938796997, 'eval_runtime': 744.6918, 'eval_samples_per_second': 1.464, 'eval_steps_per_second': 0.184, 'epoch': 0.55}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.5368, 'grad_norm': 4.319626808166504, 'learning_rate': 0.0001985358799856651, 'epoch': 0.6}
{'loss': 3.4401, 'grad_norm': 3.9891531467437744, 'learning_rate': 0.0001977746300605507, 'epoch': 0.64}
{'loss': 3.5, 'grad_norm': 3.9637115001678467, 'learning_rate': 0.00019685672308581152, 'epoch': 0.69}
{'loss': 3.5759, 'grad_norm': 3.931408643722534, 'learning_rate': 0.00019578362975582292, 'epoch': 0.73}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.414339303970337, 'eval_runtime': 743.9944, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.184, 'epoch': 0.73}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.4612, 'grad_norm': 3.686387300491333, 'learning_rate': 0.000194557069408566, 'epoch': 0.78}
{'loss': 3.4959, 'grad_norm': 3.168027400970459, 'learning_rate': 0.0001931790072708596, 'epoch': 0.83}
{'loss': 3.3979, 'grad_norm': 4.088258743286133, 'learning_rate': 0.0001916516513096226, 'epoch': 0.87}
{'loss': 3.3202, 'grad_norm': 4.35308313369751, 'learning_rate': 0.00018997744869421246, 'epoch': 0.92}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.3550636768341064, 'eval_runtime': 744.3233, 'eval_samples_per_second': 1.464, 'eval_steps_per_second': 0.184, 'epoch': 0.92}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.4621, 'grad_norm': 3.5806515216827393, 'learning_rate': 0.00018815908187550667, 'epoch': 0.96}
{'loss': 3.1963, 'grad_norm': 3.858060121536255, 'learning_rate': 0.0001861994642880105, 'epoch': 1.01}
{'loss': 2.8754, 'grad_norm': 3.7331011295318604, 'learning_rate': 0.00018410173568187647, 'epoch': 1.06}
{'loss': 2.8282, 'grad_norm': 1.9308745861053467, 'learning_rate': 0.00018186925709231532, 'epoch': 1.1}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.3838977813720703, 'eval_runtime': 743.8008, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.184, 'epoch': 1.1}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.8131, 'grad_norm': 4.913252353668213, 'learning_rate': 0.00017950560545445813, 'epoch': 1.15}
{'loss': 3.0731, 'grad_norm': 4.465583801269531, 'learning_rate': 0.00017701456787229804, 'epoch': 1.19}
{'loss': 2.955, 'grad_norm': 4.453853607177734, 'learning_rate': 0.00017440013555089393, 'epoch': 1.24}
{'loss': 3.0374, 'grad_norm': 4.159090518951416, 'learning_rate': 0.000171666497401558, 'epoch': 1.28}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.3543453216552734, 'eval_runtime': 743.9958, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.184, 'epoch': 1.28}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.8948, 'grad_norm': 3.5331246852874756, 'learning_rate': 0.00016881803333027362, 'epoch': 1.33}
{'loss': 2.9539, 'grad_norm': 4.597719669342041, 'learning_rate': 0.00016585930722009601, 'epoch': 1.38}
{'loss': 2.8147, 'grad_norm': 3.805389165878296, 'learning_rate': 0.00016279505961878064, 'epoch': 1.42}
{'loss': 2.9632, 'grad_norm': 4.388492584228516, 'learning_rate': 0.00015963020014335438, 'epoch': 1.47}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.3114547729492188, 'eval_runtime': 743.3565, 'eval_samples_per_second': 1.466, 'eval_steps_per_second': 0.184, 'epoch': 1.47}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.8905, 'grad_norm': 4.304809093475342, 'learning_rate': 0.0001563697996137997, 'epoch': 1.51}
{'loss': 2.8842, 'grad_norm': 4.070367813110352, 'learning_rate': 0.0001530190819284555, 'epoch': 1.56}
{'loss': 2.7597, 'grad_norm': 3.9153177738189697, 'learning_rate': 0.00014958341569415147, 'epoch': 1.61}
{'loss': 2.8694, 'grad_norm': 4.133796691894531, 'learning_rate': 0.0001460683056244869, 'epoch': 1.65}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.318251371383667, 'eval_runtime': 744.1579, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.184, 'epoch': 1.65}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.9635, 'grad_norm': 4.726286888122559, 'learning_rate': 0.00014247938372003582, 'epoch': 1.7}
{'loss': 2.8298, 'grad_norm': 3.9863169193267822, 'learning_rate': 0.00013882240024460927, 'epoch': 1.74}
{'loss': 2.9149, 'grad_norm': 4.50140380859375, 'learning_rate': 0.0001351032145120337, 'epoch': 1.79}
{'loss': 2.8888, 'grad_norm': 3.859428882598877, 'learning_rate': 0.00013132778549820618, 'epoch': 1.83}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.263798713684082, 'eval_runtime': 743.5722, 'eval_samples_per_second': 1.466, 'eval_steps_per_second': 0.184, 'epoch': 1.83}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.8319, 'grad_norm': 3.9285507202148438, 'learning_rate': 0.0001275021622934685, 'epoch': 1.88}
{'loss': 2.839, 'grad_norm': 4.516336917877197, 'learning_rate': 0.00012363247441059776, 'epoch': 1.93}
{'loss': 3.003, 'grad_norm': 4.199852466583252, 'learning_rate': 0.00011972492196394187, 'epoch': 1.97}
{'loss': 2.5762, 'grad_norm': 5.436491966247559, 'learning_rate': 0.0001157857657354354, 'epoch': 2.02}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.325185537338257, 'eval_runtime': 787.78, 'eval_samples_per_second': 1.384, 'eval_steps_per_second': 0.174, 'epoch': 2.02}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.1256, 'grad_norm': 5.6115899085998535, 'learning_rate': 0.00011182131714341247, 'epoch': 2.06}
{'loss': 2.1021, 'grad_norm': 4.582574367523193, 'learning_rate': 0.00010783792813028827, 'epoch': 2.11}
{'loss': 2.0962, 'grad_norm': 5.027643203735352, 'learning_rate': 0.00010384198098531225, 'epoch': 2.16}
{'loss': 2.103, 'grad_norm': 4.662832260131836, 'learning_rate': 9.983987811869862e-05, 'epoch': 2.2}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.454127073287964, 'eval_runtime': 747.3107, 'eval_samples_per_second': 1.459, 'eval_steps_per_second': 0.183, 'epoch': 2.2}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.1274, 'grad_norm': 5.092673301696777, 'learning_rate': 9.583803180351852e-05, 'epoch': 2.25}
{'loss': 2.0352, 'grad_norm': 5.308938503265381, 'learning_rate': 9.184285390178978e-05, 'epoch': 2.29}
{'loss': 2.1175, 'grad_norm': 4.602804183959961, 'learning_rate': 8.7860745591225e-05, 'epoch': 2.34}
{'loss': 2.1461, 'grad_norm': 4.904566287994385, 'learning_rate': 8.389808710909881e-05, 'epoch': 2.39}


  0%|          | 0/137 [00:00<?, ?it/s]

{'eval_loss': 3.4414234161376953, 'eval_runtime': 750.5605, 'eval_samples_per_second': 1.452, 'eval_steps_per_second': 0.183, 'epoch': 2.39}




{'train_runtime': 47609.3077, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.046, 'train_loss': 3.186554688673753, 'epoch': 2.39}


TrainOutput(global_step=1300, training_loss=3.186554688673753, metrics={'train_runtime': 47609.3077, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.046, 'total_flos': 2.271875753312256e+17, 'train_loss': 3.186554688673753, 'epoch': 2.385321100917431})

In [13]:
#trainer.train(resume_from_checkpoint="./francesco_lora/checkpoint-50")