In [2]:
# Importing unsloth prints a notice that it patches the environment for faster
# finetuning (see the cell output), so it is imported before anything else here.
from unsloth import FastLanguageModel
import torch
# Clear PyTorch's CUDA cache before the model is loaded.
torch.cuda.empty_cache()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Shared settings consumed by the model-loading and trainer cells.
max_seq_length = 2048  # passed to both FastLanguageModel.from_pretrained and SFTTrainer
dtype = None  # forwarded unchanged to from_pretrained; presumably lets Unsloth choose the dtype — TODO confirm
load_in_4bit = True  # forwarded to from_pretrained to request 4-bit loading

In [4]:
# Load the base Llama 3.1 8B checkpoint together with its tokenizer, using the
# sequence length, dtype and 4-bit settings from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B", 
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit)


==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA H100 NVL. Num GPUs = 1. Max memory: 93.111 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# Wrap the base model with LoRA adapters. The training banner further down
# reports that only ~0.52% of the parameters end up trainable.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # Adapters are attached to the attention projections (q/k/v/o) and the MLP
    # projections (gate/up/down); the cell output confirms QKV, O and MLP layers
    # were patched in all 32 layers.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, 
    bias = "none",    
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,  # same value as the trainer's seed
    use_rslora = False,  
    loftq_config = None, 
)

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
from datasets import load_dataset


# Prompt layout shared by training and inference. It has three positional `{}`
# slots, filled in order: instruction, input context, response. At inference
# time the response slot is filled with an empty string so the model completes it.
PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


# End-of-sequence token string; formatting_prompts_func appends it to every
# training example.
EOS_TOKEN = tokenizer.eos_token 

def formatting_prompts_func(examples):
    """Render a batch of examples into prompt strings for supervised fine-tuning.

    Args:
        examples: A batched mapping with parallel "instruction", "input" and
            "output" sequences (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single "text" key holding one string per example: the
        filled-in PROMPT_TEMPLATE followed by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])

    texts = []
    for instruction, context, response in rows:
        # Append EOS so every training example ends with the end-of-sequence token.
        texts.append(PROMPT_TEMPLATE.format(instruction, context, response) + EOS_TOKEN)

    return {"text": texts}


# Location of the JSON file holding the instruction/input/output records.
dataset_path = "/workspace/mem0/fine-tuning/dataset/memory_dataset_ft.json" 


# Load the JSON file as a single "train" split.
dataset = load_dataset("json", data_files=dataset_path, split="train")

# Add the "text" column (filled-in prompt + EOS) that the trainer reads.
dataset = dataset.map(formatting_prompts_func, batched=True)


print("First 10 instructions from the original dataset:")
# Slice the rows first so only ten examples are read, instead of pulling the
# whole "instruction" column into memory just to show its head.
print(dataset[:10]["instruction"])



Map:   0%|          | 0/4868 [00:00<?, ? examples/s]

First 10 instructions from the original dataset:
['My preferred airline for domestic travel is Delta.', 'Which airline do I usually prefer for flights within the country?', "Let's schedule the team sync for Wednesday at 2 PM PST in the main conference room.", 'Where is the team sync happening on Wednesday?', 'Actually, can we move the team sync to 3 PM PST instead? Same day, same place.', 'Does the updated team sync time conflict with my dentist appointment?', "For the 'Website Redesign' project, the key deliverables are: a new sitemap, wireframes for the homepage and product pages, and a revised branding style guide. The deadline for draft submission is March 15th.", 'What specific pages need wireframes for the Website Redesign project?', "What's my account number for the utility company?", 'Correction: my preferred domestic airline is actually United, not Delta.']


In [7]:
# Display the dataset summary to confirm the "text" column was added by the map step.
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 4868
})

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters. The effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 8 on one GPU.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",
)

# Supervised fine-tuning on the pre-rendered "text" column, without packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/4868 [00:00<?, ? examples/s]

In [9]:
# Record GPU capacity and the memory already reserved before training starts;
# the post-training cell subtracts this baseline to estimate training overhead.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes_before = torch.cuda.max_memory_reserved()

start_gpu_memory = round(reserved_bytes_before / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.111 GB.
7.623 GB of memory reserved.


In [10]:
# Run fine-tuning. The returned object's `metrics['train_runtime']` is read by
# the reporting cell that follows.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,868 | Num Epochs = 1 | Total steps = 608
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1104
2,2.6768
3,2.1871
4,1.7534
5,1.9834
6,1.9072
7,2.7966
8,1.8704
9,2.0092
10,1.5485


In [11]:
# Summarise training time and peak GPU memory. `start_gpu_memory` and
# `max_memory` come from the pre-training stats cell.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
train_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(peak_reserved_bytes / (1024 ** 3), 3)
# Peak minus the pre-training baseline approximates memory attributable to training.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

496.9078 seconds used for training.
8.28 minutes used for training.
Peak reserved memory = 7.623 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 8.187 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [12]:
# Sanity check on a generic task: switch the model to inference mode, fill the
# prompt with an empty response slot, and let the model complete it.
FastLanguageModel.for_inference(model)

prompt = PROMPT_TEMPLATE.format(
    "Continue the fibonnaci sequence.",
    "1, 1, 2, 3, 5, 8",
    "",
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Left as the last expression so the decoded text is shown as the cell output.
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nThe next number in the Fibonacci sequence is 13.<|end_of_text|>']

In [13]:

# Memory-recall check: the answer to both questions is present in the MEMORY
# lines supplied as input context.
FastLanguageModel.for_inference(model)

instruction = "Remind me, what method do I prefer for brewing coffee? and also remind me what my preferred coffee bean is."
input_context = "MEMORY• coffee_brewing_preference=French press\nMEMORY• preferred_coffee_bean=Arabica"
output_placeholder = ""  # response slot stays empty so the model generates it

prompt = PROMPT_TEMPLATE.format(instruction, input_context, output_placeholder)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)

print(tokenizer.batch_decode(outputs))

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nRemind me, what method do I prefer for brewing coffee? and also remind me what my preferred coffee bean is.\n\n### Input:\nMEMORY• coffee_brewing_preference=French press\nMEMORY• preferred_coffee_bean=Arabica\n\n### Response:\nYour preferred method for brewing coffee is French press, and your preferred coffee bean is Arabica.<|end_of_text|>']


In [14]:

# Same memory-recall prompt as the previous cell, but tokens are streamed to
# stdout as they are generated instead of being decoded afterwards.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

instruction = "Remind me, what method do I prefer for brewing coffee? and also remind me what my preferred coffee bean is."
input_context = "MEMORY• coffee_brewing_preference=French press\nMEMORY• preferred_coffee_bean=Arabica"
output_placeholder = ""  # response slot stays empty so the model generates it

prompt = PROMPT_TEMPLATE.format(instruction, input_context, output_placeholder)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

print("Streaming Model Response:")
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

Streaming Model Response:
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Remind me, what method do I prefer for brewing coffee? and also remind me what my preferred coffee bean is.

### Input:
MEMORY• coffee_brewing_preference=French press
MEMORY• preferred_coffee_bean=Arabica

### Response:
You prefer brewing coffee using a French press. And your preferred coffee bean is Arabica.<|end_of_text|>


In [15]:

# Negative case: no MEMORY lines are supplied, so there is nothing in the
# prompt for the model to recall.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

instruction = "What is my favorite color?"
input_context = ""
output_placeholder = ""  # response slot stays empty so the model generates it

# Build and tokenize the prompt.
prompt = PROMPT_TEMPLATE.format(instruction, input_context, output_placeholder)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

print("Streaming Model Response (No Input Context):")
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

Streaming Model Response (No Input Context):
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is my favorite color?

### Input:


### Response:
I don't have that information recorded yet. Can you let me know what your favorite color is?<|end_of_text|>


In [17]:

# Multi-fact case: six MEMORY lines describe one project and the model is asked
# to summarise them.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)

instruction = "Summarize the key details about the 'Marketing Campaign Q3' project."
# One MEMORY entry per line; adjacent literals are concatenated into a single string.
input_context = (
    "MEMORY• project_name=Marketing Campaign Q3\n"
    "MEMORY• project_goal=Increase brand awareness by 15%\n"
    "MEMORY• project_deadline=September 30th\n"
    "MEMORY• project_lead=Sarah Chen\n"
    "MEMORY• project_budget=$25,000\n"
    "MEMORY• project_key_channels=Social Media, Email Marketing, PPC Ads"
)
output_placeholder = ""  # response slot stays empty so the model generates it

prompt = PROMPT_TEMPLATE.format(instruction, input_context, output_placeholder)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

print("Streaming Model Response (Project Summary):")
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)


Streaming Model Response (Project Summary):
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the key details about the 'Marketing Campaign Q3' project.

### Input:
MEMORY• project_name=Marketing Campaign Q3
MEMORY• project_goal=Increase brand awareness by 15%
MEMORY• project_deadline=September 30th
MEMORY• project_lead=Sarah Chen
MEMORY• project_budget=$25,000
MEMORY• project_key_channels=Social Media, Email Marketing, PPC Ads

### Response:
The 'Marketing Campaign Q3' project aims to increase brand awareness by 15% before the September 30th deadline. The project is led by Sarah Chen and has a budget of $25,000. Key channels for the campaign include social media, email marketing, and pay-per-click advertising.<|end_of_text|>


In [None]:
import os

# Save the LoRA adapter and tokenizer files locally.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


# Never hard-code the token in the notebook: read it from the environment.
# When HF_TOKEN is unset, None is passed so the Hub client can fall back to
# the credentials stored by `huggingface-cli login`.
hf_token = os.environ.get("HF_TOKEN") or None
repo_id = "cosmos98a/mem0-finetuned-llama3.1-8b-4b"

print(f"Pushing model and tokenizer to: https://huggingface.co/{repo_id}")
try:
    model.push_to_hub(repo_id, token=hf_token)
    tokenizer.push_to_hub(repo_id, token=hf_token)
    print("Successfully pushed to Hugging Face Hub!")
except Exception as e:
    # Best-effort upload: report the failure and keep the notebook running,
    # since the adapter has already been saved locally above.
    print(f"An error occurred during push_to_hub: {e}")
    print("Please ensure the HF_TOKEN environment variable holds a valid token with write access.")
    print("You might also need to install 'huggingface_hub' and log in via the terminal ('huggingface-cli login').")


Pushing model and tokenizer to: https://huggingface.co/cosmos98a/mem0-finetuned-llama3.1-8b-4b


  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/cosmos98a/mem0-finetuned-llama3.1-8b-4b


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Successfully pushed to Hugging Face Hub!


In [20]:
# if False:
#     from unsloth import FastLanguageModel
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name = "lora_model",
#         dtype = dtype,
#         load_in_4bit = load_in_4bit,
#     )
#     FastLanguageModel.for_inference(model) 


# inputs = tokenizer(
# [
#     PROMPT_TEMPLATE.format(
#         "What is a famous tall tower in Paris?", # instruction
#         "", # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [21]:
# if False:
    
#     from peft import AutoPeftModelForCausalLM
#     from transformers import AutoTokenizer
#     model = AutoPeftModelForCausalLM.from_pretrained(
#         "lora_model",
#         load_in_4bit = load_in_4bit,
#     )
#     tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [25]:
# Requires `model` and `tokenizer` from the fine-tuning cells above (run them
# first on a fresh kernel).

# Target directory for the merged checkpoint, relative to the working directory.
output_merged_dir_16bit = "merged_model_16bit" 

print(f"Saving merged float16 model to directory: {output_merged_dir_16bit}")
# Merge the LoRA weights into the base model and write the result in 16-bit
# precision (the cell output reports "Merging 4bit and LoRA weights to 16bit").
model.save_pretrained_merged(output_merged_dir_16bit, tokenizer, save_method = "merged_16bit")
print(f"Successfully saved merged float16 model to {output_merged_dir_16bit}")

# Optional: Push the merged float16 model to the Hub
# Assumes hf_token and repo_id are defined from your previous push attempt
# print(f"\nOptional: Pushing merged float16 model to https://huggingface.co/{repo_id}")
# if 'hf_token' in locals() and 'repo_id' in locals():
#    try:
#        # Use push_to_hub_merged for merged models
#        model.push_to_hub_merged(repo_id, tokenizer, save_method="merged_16bit", token=hf_token)
#        print("Successfully pushed merged float16 model to Hugging Face Hub.")
#    except Exception as e:
#        print(f"Could not push merged float16 model: {e}")
# else:
#    print("Skipping merged float16 push to hub (hf_token or repo_id not defined).")


Saving merged float16 model to directory: merged_model_16bit
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 630.22 out of 1511.62 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:02<00:00, 12.61it/s]


Unsloth: Saving tokenizer... Done.
Done.
Successfully saved merged float16 model to merged_model_16bit


In [27]:
# Requires `model` and `tokenizer` from the fine-tuning cells above (run them
# first on a fresh kernel).

# Target directory for the merged 4-bit checkpoint, relative to the working directory.
output_merged_dir_4bit = "merged_model_4bit" 

# The "_forced" suffix of the save method acknowledges the accuracy trade-off of
# merging straight to 4-bit; it has nothing to do with overwriting files, so the
# log message says that instead of "forcing overwrite".
print(f"Saving merged 4-bit quantized model to directory: {output_merged_dir_4bit} (forced 4-bit merge)")
# This merges the LoRA weights into the base model and then quantizes the result to 4-bit.
model.save_pretrained_merged(output_merged_dir_4bit, tokenizer, save_method = "merged_4bit_forced")
print(f"Successfully saved merged 4-bit model to {output_merged_dir_4bit}")

# Optional: Push the merged 4-bit model to the Hub
# ... (push code as before, using save_method="merged_4bit_forced") ...

Saving merged 4-bit quantized model to directory: merged_model_4bit (forcing overwrite)
Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...




Done.
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 10 minutes for Llama-7b... Done.
Successfully saved merged 4-bit model to merged_model_4bit


In [28]:
import os

# Requires `model` and `tokenizer` from the fine-tuning cells above.

# Local directory where the 16-bit merged model is saved.
output_merged_dir_16bit = "merged_model_16bit" 

# --- Re-save locally first to ensure the directory is up-to-date ---
print(f"Re-saving merged float16 model to directory: {output_merged_dir_16bit}")
model.save_pretrained_merged(output_merged_dir_16bit, tokenizer, save_method = "merged_16bit")
print(f"Successfully re-saved merged float16 model to {output_merged_dir_16bit}")

# --- Token and target repository ---
# The token is read from the HF_TOKEN environment variable rather than being
# hard-coded: a literal "" here would make the guard below always skip the
# push, and a real token must never be committed with the notebook.
hf_token = os.environ.get("HF_TOKEN", "")
repo_id = "cosmos98a/mem0-merged-llama3.1-8b-16bit" 

# --- Push the merged 16-bit model to the Hub repo ---
if hf_token and repo_id:
    # Announce the push only when it is actually attempted.
    print(f"\nPushing merged float16 model to https://huggingface.co/{repo_id}")
    try:
        # push_to_hub_merged merges the adapter again and uploads the result.
        model.push_to_hub_merged(
            repo_id, 
            tokenizer, 
            save_method="merged_16bit", 
            token=hf_token,
            commit_message="Upload fine-tuned Llama-3.1-8B merged 16-bit model"
        )
        print(f"Successfully pushed merged float16 model to: https://huggingface.co/{repo_id}")
    except Exception as e:
        # Best-effort upload: the local copy saved above is still usable.
        print(f"An error occurred during push_to_hub_merged: {e}")
else:
    print("Skipping merged float16 push to hub (hf_token or repo_id not defined).")


Re-saving merged float16 model to directory: merged_model_16bit
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 628.74 out of 1511.62 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 201.68it/s]

Unsloth: Saving tokenizer...




 Done.
Done.
Successfully re-saved merged float16 model to merged_model_16bit

Pushing merged float16 model to https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-16bit


Unsloth: You are pushing to hub, but you passed your HF username = cosmos98a.
We shall truncate cosmos98a/mem0-merged-llama3.1-8b-16bit to mem0-merged-llama3.1-8b-16bit


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 630.47 out of 1511.62 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 200.85it/s]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.


  0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-16bit
Successfully pushed merged float16 model to: https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-16bit


In [29]:
import os

# Requires `model` and `tokenizer` from the fine-tuning cells above.

# Local directory where the 4-bit merged model is saved.
output_merged_dir_4bit = "merged_model_4bit" 

# --- Re-save locally first to ensure the directory is up-to-date ---
# This also recreates the 4-bit model on disk if the kernel was restarted.
# The "_forced" save method acknowledges the accuracy trade-off of a 4-bit
# merge; it is unrelated to overwriting files, so the message says so.
print(f"Re-saving merged 4-bit quantized model to directory: {output_merged_dir_4bit} (forced 4-bit merge)")
model.save_pretrained_merged(output_merged_dir_4bit, tokenizer, save_method = "merged_4bit_forced")
print(f"Successfully re-saved merged 4-bit model to {output_merged_dir_4bit}")

# --- Token and target repository ---
# The token is read from the HF_TOKEN environment variable rather than being
# hard-coded: a literal "" here would make the guard below always skip the
# push, and a real token must never be committed with the notebook.
hf_token = os.environ.get("HF_TOKEN", "")
repo_id_4bit = "cosmos98a/mem0-merged-llama3.1-8b-4bit" 

# --- Push the merged 4-bit model to the Hub repo ---
if hf_token and repo_id_4bit:
    # Announce the push only when it is actually attempted.
    print(f"\nPushing merged 4-bit model to https://huggingface.co/{repo_id_4bit}")
    try:
        model.push_to_hub_merged(
            repo_id_4bit, 
            tokenizer, 
            save_method="merged_4bit_forced",  # must match the method used for the local save
            token=hf_token,
            commit_message="Upload fine-tuned Llama-3.1-8B merged 4-bit model"
        )
        print(f"Successfully pushed merged 4-bit model to: https://huggingface.co/{repo_id_4bit}")
    except Exception as e:
        # Best-effort upload: the local copy saved above is still usable.
        print(f"An error occurred during push_to_hub_merged (4-bit): {e}")
else:
    print("Skipping merged 4-bit push to hub (hf_token or repo_id_4bit not defined).")


Re-saving merged 4-bit quantized model to directory: merged_model_4bit (forcing overwrite)
Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...
Done.
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 10 minutes for Llama-7b... Done.
Successfully re-saved merged 4-bit model to merged_model_4bit

Pushing merged 4-bit model to https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-4bit
Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...
Done.
Unsloth: Saving 4bit Bitsandbytes model. Please wait...


  0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Saved merged_4bit model to https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-4bit
Successfully pushed merged 4-bit model to: https://huggingface.co/cosmos98a/mem0-merged-llama3.1-8b-4bit
