In [None]:
import os
import torch
from transformers import AutoTokenizer, Llama4ForConditionalGeneration, BitsAndBytesConfig


# Hub checkpoint that is fine-tuned throughout this notebook.
model_id = "unsloth/Llama-4-Scout-17B-16E-Instruct"

# bitsandbytes settings: 4-bit NF4 weights, bfloat16 compute dtype, and
# double quantization switched off.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the quantized base model; device placement is delegated to
# device_map="auto".
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Config overrides applied right after loading.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
# Tokenizer belonging to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

In [None]:
# Shell escape: print the GPU status table reported by `nvidia-smi`.
!nvidia-smi

In [None]:
from datasets import load_dataset

# Prompt layout used for every training example. The three `{}` slots are
# filled positionally, in this order: instruction, input, response.
PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence string taken from the tokenizer; the formatting function
# below appends it to each rendered example.
EOS_TOKEN = tokenizer.eos_token # Ensure tokenizer is defined in a previous cell

# Define the formatting function using list comprehension for conciseness
def formatting_prompts_func(examples):
    """Render a column-oriented batch into one prompt string per row.

    `examples` must provide equally ordered sequences under the keys
    "instruction", "input" and "output". Each row is substituted into
    PROMPT_TEMPLATE and terminated with EOS_TOKEN.

    Returns a dict with a single key, "text", holding the rendered strings.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])

    rendered = []
    for instruction_text, input_text, output_text in rows:
        filled = PROMPT_TEMPLATE.format(instruction_text, input_text, output_text)
        rendered.append(filled + EOS_TOKEN)

    return {"text": rendered}

# Location of the training JSON. The original machine-specific path stays the
# default, but it can be overridden through the MEMORY_DATASET_PATH
# environment variable so the notebook also runs on other machines.
dataset_path = os.environ.get(
    "MEMORY_DATASET_PATH",
    "/home/ubuntu/mem0/fine-tuning/dataset/memory_dataset_ft.json",
)

# Load the JSON file as the "train" split.
dataset = load_dataset("json", data_files=dataset_path, split="train")

# Add a "text" column holding the fully formatted prompt. With batched=True
# the formatter receives lists of values per column rather than single rows.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Quick sanity check of the raw data. Rows are sliced first and the column is
# selected afterwards, so only ten records are read instead of the whole
# "instruction" column.
print("First 10 instructions from the original dataset:")
print(dataset[:10]["instruction"])

# Optionally, print the first formatted example to verify the template:
# print("\nFirst formatted example:")
# print(dataset[0]["text"])


In [5]:
from transformers import DataCollatorForLanguageModeling

# Batch collator built on the shared tokenizer; mlm=False disables
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [6]:
# Prompt layout used by the inference cell. The text is identical to the
# PROMPT_TEMPLATE assigned in the dataset-formatting cell; the three `{}`
# slots are filled positionally with instruction, input and response.
PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
# Fetch the first record. Indexing the row first avoids materialising whole
# columns just to read one value.
first_example = dataset[0]
instruction = first_example["instruction"]
input_text = first_example["input"]

# Format the prompt for inference: the response slot is left empty so the
# model writes the answer.
prompt = PROMPT_TEMPLATE.format(instruction, input_text, "")

# Tokenize the prompt exactly as written. EOS must not be appended here: it
# terminates the training examples, so placing it right after
# "### Response:" would tell the model the sequence has already ended.
# The tensors are moved to the model's own device instead of a hard-coded
# "cuda".
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Generate the continuation; generation ends at EOS or after 1200 new tokens.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode only the tokens produced after the prompt. Slicing by prompt length
# is more robust than splitting the decoded text on "### Response:", which
# breaks when the instruction or input contains that marker.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
print(response)

In [8]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank 64, alpha 16, dropout 0.05, no bias parameters, for the
# causal-LM task. Adapters are attached to the modules named in
# `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Wrap the already-loaded model according to `peft_config`; `model` is
# rebound to the wrapped instance.
model = get_peft_model(model, peft_config)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments


# Training hyper-parameters. `logging_steps` is a float below 1, which the
# Trainer interprets as a fraction of the total number of training steps.
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=0.2,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none"
)

# Initialize the trainer. `model` has already been wrapped with
# get_peft_model() in the LoRA cell, so `peft_config` is deliberately NOT
# passed again: handing SFTTrainer a PeftModel together with a peft_config is
# redundant at best, and some TRL releases appear to reject that combination
# (NOTE(review): confirm against the installed TRL version).
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop on the trainer configured in the previous cell.
trainer.train()

In [4]:
import os
from getpass import getpass

# Never store the Hugging Face token in the notebook: anyone with access to
# the file (or its git history) could write to the account. The token
# previously committed here should be revoked.
# Reuse HF_TOKEN when the environment already provides it; otherwise prompt
# for it without echoing and keep it only for this kernel session.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face write token: ")

In [None]:
# Write the model and then the tokenizer into the "lora_model" directory via
# their respective save_pretrained() methods.
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")

In [None]:
# Publish the model and tokenizer to the Hub. The access token is read from
# the HF_TOKEN environment variable instead of being embedded in the
# notebook; a missing variable fails fast with a KeyError.
hub_repo_id = "cosmos98a/mem0_llama_4_scout_fine_tuned_f16"
hf_token = os.environ["HF_TOKEN"]

model.push_to_hub(hub_repo_id, token=hf_token)
tokenizer.push_to_hub(hub_repo_id, token=hf_token)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Llama4ForConditionalGeneration
from peft import PeftModel
import torch
import os # Import os to check path existence

# --- Configuration ---
base_model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct"

# --- Find the correct adapter path ---
# Option 1: If you know the final checkpoint number (e.g., from training output)
lora_adapter_path = "output/checkpoint-2434" # <--- CHANGE THIS

# Option 2: Find the latest checkpoint directory automatically (more robust)
# output_dir = "output"
# checkpoints = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
# if not checkpoints:
#     raise ValueError(f"No checkpoint directories found in {output_dir}")
# checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
# lora_adapter_path = checkpoints[-1] # Get the latest checkpoint
# print(f"Found latest checkpoint: {lora_adapter_path}")


# Fail early when the directory does not contain a PEFT adapter.
adapter_config_path = os.path.join(lora_adapter_path, "adapter_config.json")
if not os.path.exists(adapter_config_path):
    raise FileNotFoundError(f"Cannot find adapter_config.json at {lora_adapter_path}. Please verify the checkpoint path.")


# --- Load Base Model ---
print(f"Loading base model: {base_model_name}")
# The adapter was trained on a Llama4ForConditionalGeneration instance, so the
# base model for the merge must be the same class. Loading it through
# AutoModelForCausalLM may resolve to a different (text-only) class whose
# module names do not line up with the adapter's weight keys, and whose saved
# config may not round-trip (NOTE(review): likely related to the
# "Unrecognized model ... model_type" error seen when reloading the merged
# directory - confirm).
# Loaded unquantized in float16 for the merge; this needs considerably more
# memory than the 4-bit training setup.
model = Llama4ForConditionalGeneration.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,  # Load in float16 for merge
    device_map="auto"
)


# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter from: {lora_adapter_path}")
model = PeftModel.from_pretrained(model, lora_adapter_path)
print("LoRA adapter loaded.")

# --- Merge LoRA and Base Model ---
print("Merging LoRA weights into the base model...")
model = model.merge_and_unload()
print("Merge complete.")

# --- Save Merged Model ---
merged_model_dir = "merged_model_fp16"
print(f"Saving merged model (float16) to: {merged_model_dir}")
model.save_pretrained(merged_model_dir)
print("Merged model saved.")

# --- Save Tokenizer ---
# Load the tokenizer explicitly so this cell does not depend on a `tokenizer`
# variable left over from earlier cells (it would be missing after a kernel
# restart).
print(f"Saving tokenizer to: {merged_model_dir}")
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.save_pretrained(merged_model_dir)
print("Tokenizer saved.")

print("\n--- Merging and Saving Complete ---")
print(f"Merged model (float16) is available in the directory: {merged_model_dir}")


In [None]:
from transformers import AutoTokenizer

# --- Configuration --- (repeated here so the cell can run on its own)
merged_model_dir = "merged_model_fp16"
base_model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct"

# --- Load and Save Tokenizer ---
# Fetch the base model's tokenizer, then store it in the merged-model directory.
print(f"Loading tokenizer for: {base_model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
print("Tokenizer loaded.")

print(f"Saving tokenizer to: {merged_model_dir}")
tokenizer.save_pretrained(merged_model_dir)
print("Tokenizer saved.")

In [1]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import os
# import torch # Add torch import if missing

# # --- Configuration ---
# merged_model_dir = "merged_model_fp16" # Local directory with the saved merged model
# repo_id = "cosmos98a/mem0_merged_llama4_scout_ft_f16" # Your target repo on the Hub

# # --- Ensure HF Token is Set ---
# hf_token = os.environ.get("HF_TOKEN")
# if not hf_token:
#     # Try getting the token from the specific value set earlier if env var is not set
#     try:
#         hf_token = "<REDACTED>" # Never commit a real token; set the HF_TOKEN environment variable instead
#         if not hf_token: raise ValueError("Token is empty")
#         print("Using hardcoded HF Token.")
#     except:
#         raise ValueError("Hugging Face token not found. Please set HF_TOKEN env var or hardcode it.")


# # --- Load the Merged Model and Tokenizer --- # <--- UNCOMMENT THIS BLOCK
# # This step is necessary after a kernel restart
# print(f"Loading merged model from: {merged_model_dir}")
# model = AutoModelForCausalLM.from_pretrained(merged_model_dir, torch_dtype=torch.float16, device_map="auto")
# print(f"Loading tokenizer from: {merged_model_dir}")
# tokenizer = AutoTokenizer.from_pretrained(merged_model_dir)
# print("Model and tokenizer loaded.")


# # --- Push to Hub ---
# print(f"\nPushing merged model and tokenizer to: https://huggingface.co/{repo_id}")
# try:
#     # Push the model weights and config
#     model.push_to_hub(repo_id, token=hf_token, commit_message="Upload merged float16 model")

#     # Push the tokenizer files
#     tokenizer.push_to_hub(repo_id, token=hf_token, commit_message="Upload tokenizer")

#     print(f"\nSuccessfully pushed merged model and tokenizer to: https://huggingface.co/{repo_id}")

# except Exception as e:
#     print(f"\nAn error occurred during push_to_hub: {e}")
#     print("Please ensure:")
#     print(f"- The repository '{repo_id}' exists on Hugging Face Hub.")
#     print("- Your Hugging Face token has write permissions for this repository.")
#     print("- You have 'huggingface_hub' installed and potentially 'git-lfs'.")


Using hardcoded HF Token.
Loading merged model from: merged_model_fp16


ValueError: Unrecognized model in merged_model_fp16. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mistral3, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, 
musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Llama4ForConditionalGeneration,
)
from peft import PeftModel
import os

# --- Configuration ---
base_model_id = "unsloth/Llama-4-Scout-17B-16E-Instruct"
# !!! IMPORTANT: Verify this is the correct ID for your saved LoRA adapter !!!
adapter_id = "cosmos98a/mem0_llama_4_scout_fine_tuned_f16"
# !!! IMPORTANT: Create a NEW repo on Hugging Face for the merged model !!!
# Example: new_repo_id_fp16 = "cosmos98a/my-scout-merged-fp16"
PLACEHOLDER_REPO_ID = "YOUR_NEW_HF_REPO_ID_FOR_MERGED_FP16"
new_repo_id_fp16 = PLACEHOLDER_REPO_ID # <--- CHANGE THIS
local_save_dir_fp16 = "./merged_model_fp16_from_hub" # Local directory to save temporarily

# Authentication: log in with `huggingface-cli login` or export HF_TOKEN in the
# environment before starting the kernel. Do not paste a token into this cell.

# --- 1. Load FP16, Apply Adapter, Merge, Save, and Push ---

print(f"--- Scenario 1: Loading {base_model_id} in FP16 ---")

# Load the base model in float16 using the same class the adapter was trained
# on (Llama4ForConditionalGeneration). AutoModelForCausalLM may resolve to a
# different class whose module names do not match the adapter's weight keys.
model_fp16 = Llama4ForConditionalGeneration.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16, # Use float16 for merging
    device_map="auto"          # Use "auto" or specific device like "cuda:0"
)

# Load the tokenizer (saved and pushed next to the merged weights)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

print(f"--- Applying adapter {adapter_id} to FP16 model ---")
# Load and apply the LoRA adapter. A failure is reported but does not abort the
# cell; the merge section below is skipped in that case and scenario 2 still runs.
try:
    model_fp16 = PeftModel.from_pretrained(model_fp16, adapter_id)
    print("Adapter applied successfully.")
except Exception as e:
    print(f"Error loading adapter: {e}")
    print("Please ensure the adapter_id is correct and the adapter files exist on the Hub.")

# Only a successfully wrapped PeftModel can be merged.
if isinstance(model_fp16, PeftModel):
    print("--- Merging adapter into FP16 model ---")
    # Fold the adapter weights into the base weights and drop the PEFT wrapper
    model_fp16 = model_fp16.merge_and_unload()
    print("Merge complete.")

    print(f"--- Saving merged FP16 model locally to {local_save_dir_fp16} ---")
    model_fp16.save_pretrained(local_save_dir_fp16)
    tokenizer.save_pretrained(local_save_dir_fp16)
    print("Merged model and tokenizer saved locally.")

    # --- Push merged FP16 model to Hub ---
    if new_repo_id_fp16 == PLACEHOLDER_REPO_ID:
        # Guard against uploading a very large model to a repo literally named
        # after the placeholder.
        print("new_repo_id_fp16 is still the placeholder value; skipping push to Hub.")
        print("Set new_repo_id_fp16 to your target repository and re-run this cell to upload.")
    else:
        print(f"--- Pushing merged FP16 model to {new_repo_id_fp16} ---")
        try:
            model_fp16.push_to_hub(new_repo_id_fp16, commit_message="Upload merged FP16 model")
            tokenizer.push_to_hub(new_repo_id_fp16, commit_message="Upload tokenizer")
            print(f"Successfully pushed merged FP16 model and tokenizer to https://huggingface.co/{new_repo_id_fp16}")
        except Exception as e:
            print(f"Error pushing merged FP16 model to Hub: {e}")
            print("Please ensure:")
            print(f"- The repository '{new_repo_id_fp16}' exists on Hugging Face Hub.")
            print("- Your Hugging Face token has write permissions.")
            print("- You have 'huggingface_hub' and 'git-lfs' installed.")
else:
    print("Model does not have merge_and_unload. Skipping merge, save, and push for FP16.")


# --- 2. Load 4-bit, Apply Adapter (for Inference) ---
# NOTE(review): `model_fp16` is still referenced at this point, so both copies
# of the model are held at once; delete it first if memory is tight.

print(f"\n--- Scenario 2: Loading {base_model_id} in 4-bit ---")

# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False, # Set as needed
    bnb_4bit_quant_type="nf4",      # Or "fp4"
    bnb_4bit_compute_dtype=torch.bfloat16 # Or float16
)

# Load the base model in 4-bit, again with the class used for training so the
# adapter's weight keys match.
model_4bit = Llama4ForConditionalGeneration.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True # If required by the base model
)

print(f"--- Applying adapter {adapter_id} to 4-bit model ---")
# Load and apply the LoRA adapter
# Note: The adapter weights themselves are NOT quantized; they are applied to the quantized base model
try:
    model_4bit_with_adapter = PeftModel.from_pretrained(model_4bit, adapter_id)
    print("Adapter applied successfully to 4-bit model.")
    print("This model is now ready for inference using the fine-tuned weights on the 4-bit base.")

    # --- Using the 4-bit model with adapter for inference (Example) ---
    # print("\n--- Example Inference with 4-bit + Adapter ---")
    # instruction = "Your instruction here"
    # input_text = "Your input here"
    # PROMPT_TEMPLATE = "..." # Define your prompt template
    # prompt = PROMPT_TEMPLATE.format(instruction, input_text, "")
    # inputs = tokenizer([prompt], return_tensors="pt").to(model_4bit_with_adapter.device)
    #
    # outputs = model_4bit_with_adapter.generate(
    #     **inputs,
    #     max_new_tokens=200,
    #     eos_token_id=tokenizer.eos_token_id,
    #     use_cache=True
    # )
    # response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # print(response)

except Exception as e:
    print(f"Error loading adapter for 4-bit model: {e}")
    print("Please ensure the adapter_id is correct and the adapter files exist on the Hub.")


# --- Note on Merging/Pushing 4-bit ---
# You generally DO NOT merge_and_unload() a 4-bit quantized model with adapters.
# The standard practice is to load the base in 4-bit and apply the adapter dynamically for inference,
# as shown above (model_4bit_with_adapter). There isn't a standard, widely supported format
# for saving a *pre-merged* 4-bit + adapter model for direct loading later.
# Therefore, there's no 'push' step included for the 4-bit scenario here.