In [None]:
!pip install -q bitsandbytes accelerate
!pip install -q -U git+https://github.com/fabienfrfr/tptt@main

In [None]:
# Resolve the Hugging Face access token.
# On Kaggle it is read from the notebook secrets; elsewhere the `kaggle_secrets`
# module does not exist, so fall back to the HF_TOKEN environment variable
# (hf_token is None when that variable is unset).
import os

try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    hf_token = os.environ.get("HF_TOKEN")
else:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
# --- Run switches ---
QUANTIZATION = False  # True loads the backbone with a 4-bit bitsandbytes config

# --- Model / tokenizer identifiers ---
base_model_name = "meta-llama/Llama-3.2-1B"
tokenizer_name = base_model_name

# --- Training budget ---
N = 100  # number of dataset rows kept for training
EPOCH = 10  # number of training epochs

# --- Output locations ---
username = "ffurfaro"
model_name = f"/Titans-{base_model_name.split('/')[1]}"  # keeps its leading slash
dir_path = f".{model_name}"  # local folder, e.g. ./Titans-<model>
repo_id = f"{username}/{model_name.lstrip('/')}"  # Hub repo, e.g. <user>/Titans-<model>

In [None]:
import os
import shutil
import json

# When quantization is enabled, expose only GPU 0 to this process.
# NOTE(review): this is set before `import torch` below, presumably so the
# restriction is in place before CUDA is initialised — confirm before reordering.
if QUANTIZATION:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Reproducibility: fixed torch seed, deterministic cuDNN, cuDNN benchmark mode off.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, 
    DataCollatorForLanguageModeling, TrainerCallback
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

from transformers import BitsAndBytesConfig

# Quantization settings handed to `from_pretrained`:
# 4-bit NF4 with double quantization and bfloat16 compute when QUANTIZATION is
# on, otherwise None. BitsAndBytesConfig is already imported just above.
bnb_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    if QUANTIZATION
    else None
)

# Tools
import tptt

In [None]:
## TPTT configuration
# `config` is consumed by `tptt.TpttModel` below, so it has to exist before this
# cell runs on a fresh kernel.
# NOTE(review): assumes `tptt.TpttConfig` accepts `base_model_name` and that its
# other defaults suit this run — confirm against the installed tptt version.
config = tptt.TpttConfig(base_model_name=base_model_name)

## Load backbone
backbone = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    attn_implementation="eager",
    token=hf_token,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,  # None when QUANTIZATION is off
)

## LoRA adapter
# Attention projection names used by the supported architectures.
lora_candidates = [
    "q_proj",  ## minimal
    "k_proj",
    "v_proj",  ## minimal
    "o_proj",  # Llama, Mistral, OLMo
    "qkv_proj",
    "out_proj",  # OpenELM,
    "c_attn",
    "c_proj",  # GPT-2
]

# Keep every module of the backbone whose name ends with one of the candidates;
# sorting makes the resulting list deterministic across runs.
target_modules = sorted(
    {
        name
        for name, _ in backbone.named_modules()
        if name.endswith(tuple(lora_candidates))
    }
)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)
# Wrap the backbone with LoRA adapters (PEFT) and report the trainable share
backbone = get_peft_model(backbone, lora_config)
backbone.print_trainable_parameters()

## Transforming into Titans
model = tptt.TpttModel(config, backbone=backbone)

In [None]:
# Load the tokenizer named in the configuration cell (`tokenizer_name`)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_token)
# Ensure the tokenizer has a padding token for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or "[PAD]"

# Keep only the first N rows of the training split
raw_dataset = load_dataset("yahma/alpaca-cleaned")["train"].select(range(N))

def preprocess_fn(samples):
    """
    Build and tokenize training texts for causal language modeling.

    Each text is the instruction, then the input on its own line when the
    input is non-empty, then the output on a final line. The batch is
    tokenized with truncation at 512 tokens and padded to its longest member.

    Args:
        samples: batch mapping with "instruction", "input" and "output" lists.

    Returns:
        The tokenizer output with an extra "labels" entry holding a copy of
        "input_ids".
    """
    texts = []
    for instruction, context, answer in zip(
        samples["instruction"], samples["input"], samples["output"]
    ):
        # The optional input is only inserted when it is non-empty
        prompt = f"{instruction}\n{context}" if context else instruction
        texts.append(f"{prompt}\n{answer}")

    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="longest",
        return_attention_mask=True,
    )
    # Causal LM: the targets are the inputs themselves
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize the dataset in batches and remove original columns
tokenized_dataset = raw_dataset.map(
    preprocess_fn, batched=True, remove_columns=raw_dataset.column_names
)

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM mode)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hugging Face training configuration
training_args = TrainingArguments(
    # --- outputs / checkpoints ---
    output_dir="./tptt_output",
    save_strategy="epoch",
    save_total_limit=2,  # keep at most two checkpoints on disk
    # --- logging ---
    report_to="tensorboard",
    logging_steps=5,
    # --- optimisation ---
    num_train_epochs=EPOCH,
    per_device_train_batch_size=4,  # per GPU: total batch grows with GPU count, watch VRAM
    learning_rate=5e-4,
    max_grad_norm=1.0,  # gradient clipping
    fp16=True,  # fp16 mixed-precision training
    # --- distributed / reproducibility ---
    ddp_find_unused_parameters=False,
    seed=42,
)

# LiZA MaG callback
# Plain scalars: a trailing comma after each value would silently turn it
# into a 1-tuple (e.g. `(0.01,)`) before it reaches the callback.
initial_weight = 0.01
final_weight = 0.5
transition_step = 100
liza_callback = tptt.AdjustMaGWeightCallback(
    model,
    initial_weight=initial_weight,
    final_weight=final_weight,
    transition_step=transition_step,
)

# Assemble the Trainer from the objects built above
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    callbacks=[liza_callback],
    # what to train on
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Run the fine-tuning loop
trainer.train()

In [None]:
# Write the trained model and tokenizer to the local output folder
model.save_pretrained(dir_path)
tokenizer.save_pretrained(dir_path)

# Patch the saved config.json with the custom-code entries.
# The parsed JSON is kept under its own name so it does not overwrite the
# notebook-level `config` object that the model cell passes to tptt.
# NOTE(review): `auto_map` points at modeling_tptt / configuration_tptt, which
# this cell does not copy into `dir_path` — confirm they end up in the folder.
config_path = os.path.join(dir_path, "config.json")
with open(config_path, "r", encoding="utf-8") as f:
    saved_config = json.load(f)
saved_config["auto_map"] = {
    "AutoModelForCausalLM": "modeling_tptt.TpttModel",
    "AutoConfig": "configuration_tptt.TpttConfig",
}
saved_config["architectures"] = ["TpttModel"]
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(saved_config, f, indent=2)

# Arguments shared by both Hub calls
hub_target = {"repo_id": repo_id, "repo_type": "model", "token": hf_token}

api = HfApi()
# Create the public model repository; exist_ok avoids an error if it already exists
api.create_repo(private=False, exist_ok=True, **hub_target)
# Push the contents of the local output folder as one commit
api.upload_folder(
    folder_path=dir_path,
    commit_message="Upload init + code custom",
    **hub_target,
)