Let's see how we can use an LLM to try and predict the edgelist of a graph just from the 7 features given to us.

!pip install transformers accelerate flash_attn peft datasets bitsandbytes trl pandas

In [1]:
import torch

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModelForCausalLM, AutoPeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline,
    DataCollatorForLanguageModeling
)
from trl import SFTTrainer
     

In [2]:
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

device_map = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(attn_implementation, ',', device_map)

flash_attention_2 , cuda


In [3]:
# Model configuration
model_id = "microsoft/phi-3-mini-4k-instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load model and tokenizer with explicit device map
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": torch.cuda.current_device()},  # This is the key change
    trust_remote_code=True,
    attn_implementation=attn_implementation
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model.config.use_cache = False  # Required for gradient checkpointing

messages = [{"role": "user", "content": "Do you know what a graph edgelist is ? Answer in a single short sentence."}]

inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to('cuda')

outputs = model.generate(inputs, max_new_tokens=32)

text = tokenizer.batch_decode(outputs)[0]

print(text)

## Onto preparting the way for training

We generated the data in `generate_LLM_sets.ipynb`.

In [5]:
from generate_LLM_sets import ConcatenatedFileDataset

In [6]:
train = torch.load('data/train.pt')
val = torch.load('data/valid.pt')

  train = torch.load('data/train.pt')
  val = torch.load('data/valid.pt')


In [7]:
print(train[0])

{'prompt': 'Give the graph edgelist associated to the following features.-Number of nodes: 15.0-Number of edges: 105.0-Average degree: 14.0-Number of triangles: 455.0-Clustering coefficient: 1.0-Max k cores: 14.0-Number of communities: 1.0', 'answer': '(0, 1), (0, 14), (0, 2), (0, 13), (0, 3), (0, 12), (0, 4), (0, 11), (0, 5), (0, 10), (0, 6), (0, 9), (0, 7), (0, 8), (1, 2), (1, 3), (1, 14), (1, 4), (1, 13), (1, 5), (1, 12), (1, 6), (1, 11), (1, 7), (1, 10), (1, 8), (1, 9), (14, 2), (14, 3), (14, 4), (14, 5), (14, 6), (14, 7), (14, 8), (14, 9), (14, 10), (14, 11), (14, 12), (14, 13), (2, 3), (2, 4), (2, 5), (2, 6), (2, 13), (2, 7), (2, 12), (2, 8), (2, 11), (2, 9), (2, 10), (13, 3), (13, 4), (13, 5), (13, 6), (13, 7), (13, 8), (13, 9), (13, 10), (13, 11), (13, 12), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 12), (3, 10), (3, 11), (12, 4), (12, 5), (12, 6), (12, 7), (12, 8), (12, 9), (12, 10), (12, 11), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (11, 5), (11, 6),

Adapt to template


In [7]:
def create_message_column(row):
    messages = [
        {"content": row['prompt'], "role": "user"},
        {"content": row['answer'], "role": "assistant"}
    ]
    return {"messages": messages}

def format_dataset_chatml(row):
    # Apply the chat template
    formatted_text = tokenizer.apply_chat_template(
        row["messages"], 
        add_generation_prompt=False, 
        tokenize=False
    )
    
    # Tokenize the text
    tokenized = tokenizer(
        formatted_text,
        truncation=True,
        max_length=4096,
        padding=False,
        return_tensors="pt",  # Return PyTorch tensors (or omit for lists)
    )
    
    return {
        "input_ids": tokenized["input_ids"][0],  # Assuming batch size of 1 for simplicity
        "attention_mask": tokenized["attention_mask"][0]  # Include this if needed
    }

In [8]:
# Prepare datasets with batched processing
train_dataset = train.map(create_message_column)
train_dataset = train_dataset.map(format_dataset_chatml)

val_dataset = val.map(create_message_column)
val_dataset = val_dataset.map(format_dataset_chatml)

del train
del val

torch.save(train_dataset, 'data/train_dataset_tokenized.pt')
torch.save(val_dataset, 'data/val_dataset_tokenized.pt')

In [9]:
train_dataset = torch.load('data/train_dataset_tokenized.pt')
val_dataset = torch.load('data/val_dataset_tokenized.pt')


  train_dataset = torch.load('data/train_dataset_tokenized.pt')
  val_dataset = torch.load('data/val_dataset_tokenized.pt')


In [10]:
print("Sample from training dataset:")
print(list(train_dataset[0].keys()))  # Should include 'input_ids'
print("Length of first sample:", len(train_dataset[0]['input_ids']))

Sample from training dataset:
['input_ids', 'attention_mask']
Length of first sample: 788


## Let's go


In [11]:
# LoRA Configuration
lora_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    inference_mode=False,
)

# Prepare model for training
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.use_cache = False
model.gradient_checkpointing_enable()

In [12]:
training_args = TrainingArguments(
    output_dir="./phi-3-mini-LoRA",
    evaluation_strategy="steps",
    do_eval=True,
    max_steps=-1,
    optim="adamw_torch",
    per_device_train_batch_size=1,  # Reduced from 8
    gradient_accumulation_steps=16,   # Increased from 4
    per_device_eval_batch_size=1,    # Reduced from 8
    log_level="info",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    eval_steps=125,
    num_train_epochs=1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    seed=0,
    gradient_checkpointing=True,
    report_to="tensorboard",
)



In [13]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False,
)

The next cell is the most cursed thing known to man but hey it fixes all my problems and I have been fighting this notebook for way too long now.

In [14]:
def patched_forward(self, *args, **kwargs):
    kwargs.pop("num_items_in_batch", None)  # Ignore the extra argument
    return PeftModelForCausalLM.forward(self, *args, **kwargs)  # Replace 'MyModelClass' with your model's actual class name

from types import MethodType

model.forward = MethodType(patched_forward, model)

In [15]:
# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,  # Added data collator
)

# Training
trainer.train()

# Save the final model
trainer.save_model()

# Optional: Save adapter only
model.save_pretrained("./phi-3-mini-LoRA/final_adapter")

merged_model = model.merge_and_unload()
save_path = "./merged_model"
merged_model.save_pretrained(save_path)

  trainer = SFTTrainer(
Using auto half precision backend
***** Running training *****
  Num examples = 8,000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 500
  Number of trainable parameters = 8,912,896


  0%|          | 0/500 [00:00<?, ?it/s]

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 9.2236, 'grad_norm': 3.1693050861358643, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 8.739, 'grad_norm': 2.2582690715789795, 'learning_rate': 4e-05, 'epoch': 0.04}
{'loss': 8.2193, 'grad_norm': 1.3784129619598389, 'learning_rate': 6e-05, 'epoch': 0.06}
{'loss': 7.0056, 'grad_norm': 1.6875430345535278, 'learning_rate': 8e-05, 'epoch': 0.08}
{'loss': 5.6848, 'grad_norm': 4.614770889282227, 'learning_rate': 0.0001, 'epoch': 0.1}
{'loss': 4.2678, 'grad_norm': 1.5896835327148438, 'learning_rate': 9.987820251299122e-05, 'epoch': 0.12}
{'loss': 3.9761, 'grad_norm': 0.992117702960968, 'learning_rate': 9.951340343707852e-05, 'epoch': 0.14}
{'loss': 3.5681, 'grad_norm': 1.0812567472457886, 'learning_rate': 9.890738003669029e-05, 'epoch': 0.16}
{'loss': 3.6988, 'grad_norm': 1.237465739250183, 'learning_rate': 9.806308479691595e-05, 'epoch': 0.18}
{'loss': 3.8249, 'grad_norm': 1.1787524223327637, 'learning_rate': 9.698463103929542e-05, 'epoch': 0.2}
{'loss': 3.7028, 'grad_norm': 1.521


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 0.22255034744739532, 'eval_runtime': 877.9555, 'eval_samples_per_second': 1.139, 'eval_steps_per_second': 1.139, 'epoch': 0.25}
{'loss': 3.5986, 'grad_norm': 1.624793529510498, 'learning_rate': 9.24024048078213e-05, 'epoch': 0.26}
{'loss': 3.6561, 'grad_norm': 1.751072883605957, 'learning_rate': 9.045084971874738e-05, 'epoch': 0.28}
{'loss': 3.6833, 'grad_norm': 1.7926079034805298, 'learning_rate': 8.83022221559489e-05, 'epoch': 0.3}
{'loss': 3.1952, 'grad_norm': 1.2431806325912476, 'learning_rate': 8.596699001693255e-05, 'epoch': 0.32}
{'loss': 3.2922, 'grad_norm': 1.6546307802200317, 'learning_rate': 8.345653031794292e-05, 'epoch': 0.34}
{'loss': 3.4471, 'grad_norm': 1.3735625743865967, 'learning_rate': 8.07830737662829e-05, 'epoch': 0.36}
{'loss': 3.5141, 'grad_norm': 1.6207785606384277, 'learning_rate': 7.795964517353735e-05, 'epoch': 0.38}
{'loss': 3.3031, 'grad_norm': 1.2811110019683838, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.4}
{'loss': 3.4332, 'grad_no


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1


{'loss': 3.4652, 'grad_norm': 1.6332688331604004, 'learning_rate': 5.868240888334653e-05, 'epoch': 0.5}


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 0.2084374725818634, 'eval_runtime': 877.7589, 'eval_samples_per_second': 1.139, 'eval_steps_per_second': 1.139, 'epoch': 0.5}
{'loss': 3.3379, 'grad_norm': 2.233769178390503, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.52}
{'loss': 3.6246, 'grad_norm': 1.8723210096359253, 'learning_rate': 5.174497483512506e-05, 'epoch': 0.54}
{'loss': 3.0862, 'grad_norm': 2.138923406600952, 'learning_rate': 4.825502516487497e-05, 'epoch': 0.56}
{'loss': 3.1581, 'grad_norm': 1.8332033157348633, 'learning_rate': 4.477357683661734e-05, 'epoch': 0.58}
{'loss': 3.3488, 'grad_norm': 1.2996113300323486, 'learning_rate': 4.131759111665349e-05, 'epoch': 0.6}
{'loss': 3.1478, 'grad_norm': 1.9925156831741333, 'learning_rate': 3.790390522001662e-05, 'epoch': 0.62}
{'loss': 3.3254, 'grad_norm': 2.1001808643341064, 'learning_rate': 3.4549150281252636e-05, 'epoch': 0.64}
{'loss': 2.9687, 'grad_norm': 1.7294458150863647, 'learning_rate': 3.12696703292044e-05, 'epoch': 0.66}
{'loss': 3.3551, 'grad_


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 0.20364397764205933, 'eval_runtime': 877.7728, 'eval_samples_per_second': 1.139, 'eval_steps_per_second': 1.139, 'epoch': 0.75}
{'loss': 3.132, 'grad_norm': 2.599910020828247, 'learning_rate': 1.6543469682057106e-05, 'epoch': 0.76}
{'loss': 2.9112, 'grad_norm': 1.8020118474960327, 'learning_rate': 1.4033009983067452e-05, 'epoch': 0.78}
{'loss': 3.1364, 'grad_norm': 2.038886785507202, 'learning_rate': 1.1697777844051105e-05, 'epoch': 0.8}
{'loss': 3.1151, 'grad_norm': 2.046433210372925, 'learning_rate': 9.549150281252633e-06, 'epoch': 0.82}
{'loss': 2.9804, 'grad_norm': 2.28875994682312, 'learning_rate': 7.597595192178702e-06, 'epoch': 0.84}
{'loss': 3.4798, 'grad_norm': 1.9011940956115723, 'learning_rate': 5.852620357053651e-06, 'epoch': 0.86}
{'loss': 3.478, 'grad_norm': 2.946134328842163, 'learning_rate': 4.322727117869951e-06, 'epoch': 0.88}
{'loss': 3.2395, 'grad_norm': 1.802404522895813, 'learning_rate': 3.0153689607045845e-06, 'epoch': 0.9}
{'loss': 3.2249, 'grad_no


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1


{'loss': 3.302, 'grad_norm': 1.5325467586517334, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to ./phi-3-mini-LoRA/checkpoint-500


{'eval_loss': 0.2023552656173706, 'eval_runtime': 877.7394, 'eval_samples_per_second': 1.139, 'eval_steps_per_second': 1.139, 'epoch': 1.0}


loading configuration file config.json from cache at /home/marceau/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,

{'train_runtime': 26707.2465, 'train_samples_per_second': 0.3, 'train_steps_per_second': 0.019, 'train_loss': 3.806950183868408, 'epoch': 1.0}


loading configuration file config.json from cache at /home/marceau/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,