In [1]:
#!pip install -q bitsandbytes accelerate
#!pip install -q -U git+https://github.com/fabienfrfr/tptt@main
!pip install -q -U git+https://github.com/fabienfrfr/tptt@dev

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━

In [2]:
# Kaggle-only cell: authenticate against the Hugging Face Hub.
# The token is read from the Kaggle secret named "HF_TOKEN" rather than being
# hardcoded in the notebook.
from huggingface_hub import login, HfApi
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)
# Hub client reused later to create the model repo and upload the saved folder.
api = HfApi()

In [3]:
# Toggle 4-bit loading of the base model (drives the BitsAndBytesConfig setup).
QUANTIZATION = False

# Base checkpoint; the tokenizer is loaded from the same repository.
base_model_name = "meta-llama/Llama-3.2-1B"
base_tokenizer_name = base_model_name

# Basic training: number of dataset examples kept and number of epochs.
N = 100
EPOCH = 10

# Saving: local folder and Hub repository are both derived from the base name.
username = "ffurfaro"
# model_name carries a leading slash; it is stripped again when building repo_id.
model_name = f"/Titans-{base_model_name.split('/')[1]}"
dir_path = f".{model_name}"
repo_id = f"{username}/{model_name.lstrip('/')}"

In [4]:
import os
import shutil
import json

# Set before `import torch` below. Presumably restricts a quantized run to
# GPU 0 only — TODO confirm the intent.
if QUANTIZATION:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Reproducibility: seed PyTorch's RNG, request deterministic cuDNN algorithms
# and turn off the cuDNN benchmark mode.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, 
    DataCollatorForLanguageModeling, TrainerCallback
)
from datasets import load_dataset
from peft import LoraConfig, PeftConfig, get_peft_model

# Quantization settings handed to the model loader.
# Stays None (no quantization) unless QUANTIZATION is enabled, in which case
# the weights are loaded in 4-bit NF4 with double quantization and bfloat16
# as the compute dtype.
bnb_config = None
if QUANTIZATION:
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

# Tools
import tptt

2025-06-07 13:31:32.408754: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749303092.674963      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749303092.756628      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Attention projection layers that receive LoRA adapters.
# The names below match Llama, Mistral and OLMo (minimal set: q_proj, v_proj).
# Alternatives for other architectures:
#   OpenELM -> ["qkv_proj", "out_proj"]
#   GPT-2   -> ["c_attn", "c_proj"]
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Build the LoRA settings, then export them as a plain dict
# (the dict is what gets passed to the TPTT config).
lora_settings = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
lora_config = lora_settings.to_dict()

In [6]:
## Transforming into Titans
config = tptt.TpttConfig(
    base_model_name=base_model_name,
    max_self_attn_length = 2048,
    lora_config=lora_config,
)

model = tptt.TpttModel(config,
    trust_remote_code=True,
    attn_implementation="eager",
    token=hf_token,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,)
#display(model)
model.backbone.print_trainable_parameters()

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [7]:
# Tokenizer of the base checkpoint (the Hub token is passed for authentication).
tokenizer = AutoTokenizer.from_pretrained(base_tokenizer_name, token=hf_token)

# Batching needs a padding token: reuse EOS when the tokenizer defines none,
# and fall back to the literal "[PAD]" if there is no EOS either.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"

# Keep only the first N training examples for this small run.
raw_dataset = load_dataset("yahma/alpaca-cleaned")["train"].select(range(N))

def preprocess_fn(samples):
    """
    Turn a batch of instruction samples into tokenized causal-LM examples.

    Each text is "<instruction>[\\n<input>]\\n<output>"; the input line is
    skipped when the sample's input is empty.

    Args:
        samples: batch mapping with "instruction", "input" and "output"
            columns (parallel sequences of strings).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        is a copy of "input_ids".
    """
    texts = []
    for instruction, context, answer in zip(
        samples["instruction"], samples["input"], samples["output"]
    ):
        prompt = f"{instruction}\n{context}" if context else instruction
        # The expected answer is appended for supervised fine-tuning.
        texts.append(f"{prompt}\n{answer}")

    tokens = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="longest",  # pad to the longest text in this batch
        return_attention_mask=True,
    )
    # Causal LM: the targets are the inputs themselves.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize the dataset in batches and remove the original text columns.
# The map is run once only: the cell previously issued the identical call
# twice, so the second pass just recomputed and overwrote the same result.
tokenized_dataset = raw_dataset.map(
    preprocess_fn, batched=True, remove_columns=raw_dataset.column_names
)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM mode).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hugging Face TrainingArguments for a reproducible training run.
training_args = TrainingArguments(
    # Output and checkpoints
    output_dir="./tptt_output",
    save_strategy="epoch",
    save_total_limit=2,  # limit disk usage
    # Optimisation
    num_train_epochs=EPOCH,
    per_device_train_batch_size=3,  # per device; multiplied by the GPU count (VRAM limit risk)
    learning_rate=5e-4,
    max_grad_norm=1.0,  # gradient clipping
    fp16=True,  # use mixed precision if supported by hardware
    ddp_find_unused_parameters=False,
    # Reproducibility and logging
    seed=42,
    logging_steps=5,
    report_to="tensorboard",
)

# LiZA MaG callback
# Presumably moves the MaG weight from `initial_weight` to `final_weight`
# over `transition_step` training steps — confirm against the tptt docs.
# These must be plain numbers: a trailing comma after the value
# (e.g. `initial_weight=0.01,`) binds a one-element tuple instead of a float.
initial_weight = 0.01
final_weight = 0.5
transition_step = 100
liza_callback = tptt.AdjustMaGWeightCallback(
    model,
    initial_weight=initial_weight,
    final_weight=final_weight,
    transition_step=transition_step,
)

# Assemble the Trainer from the model, tokenized data, collator, arguments and
# the LiZA callback, then start fine-tuning.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
    callbacks=[liza_callback],
)

# Left as the last expression so the training summary is shown as cell output.
trainer.train()

In [8]:
# Save the tokenizer and the fine-tuned model into the same local folder
# (dir_path); that folder is what gets uploaded to the Hub afterwards.
tokenizer.save_pretrained(dir_path)
model.save_pretrained(dir_path)

In [9]:
# Ensure the public model repository exists (exist_ok=True tolerates an
# already-created repo).
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    private=False,
    exist_ok=True,
    token=hf_token,
)
# Upload the saved local folder to that repository with one commit message.
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path=dir_path,
    commit_message="Upload model + init tptt code",
    token=hf_token,
)

model.safetensors:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ffurfaro/Titans-Llama-3.2-1B/commit/eeb89fa3ed03f30589bc3aea5040189269d04633', commit_message='Upload model + init tptt code', commit_description='', oid='eeb89fa3ed03f30589bc3aea5040189269d04633', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ffurfaro/Titans-Llama-3.2-1B', endpoint='https://huggingface.co', repo_type='model', repo_id='ffurfaro/Titans-Llama-3.2-1B'), pr_revision=None, pr_num=None)

In [10]:
from transformers import AutoModelForCausalLM

# Round-trip check: load the uploaded model back from the Hub.
# trust_remote_code=True lets transformers run the custom code files shipped
# in the repo (configuration_tptt.py / modeling_tptt.py per the download log).
model_tptt = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    token=hf_token,
)
display(model_tptt)

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

configuration_tptt.py:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ffurfaro/Titans-Llama-3.2-1B:
- configuration_tptt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_tptt.py:   0%|          | 0.00/27.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ffurfaro/Titans-Llama-3.2-1B:
- modeling_tptt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.83M [00:00<?, ?B/s]

TpttModel(
  (backbone): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(128256, 2048)
          (layers): ModuleList(
            (0-15): 16 x LlamaDecoderLayer(
              (self_attn): LiZAttention(
                (base_attn): LlamaAttention(
                  (q_proj): lora.Linear(
                    (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_emb