# Welcome to Modal notebooks!

Write Python code and collaborate in real time. Your code runs in Modal's
**serverless cloud**, and anyone in the same workspace can join.

This notebook comes with some common Python libraries installed. Run
cells with `Shift+Enter`.

In [6]:
#!tar -czf my_notebook_files.tar.gz .

tar: .: file changed as we read it


In [2]:
%%time
%uv pip install -q transformers accelerate bitsandbytes peft safetensors torch
%uv pip install -q sentencepiece protobuf
%uv pip install -q scipy

[1m[31merror[39m[0m: Failed to install: filelock-3.19.1-py3-none-any.whl (filelock==3.19.1)
  [1m[31mCaused by[39m[0m: failed to create directory `/usr/local/lib/python3.10/dist-packages/filelock-3.19.1.dist-info`: Permission denied (os error 13)
Note: you may need to restart the kernel to use updated packages.
[1m[31merror[39m[0m: Failed to install: protobuf-6.32.1-cp39-abi3-manylinux2014_x86_64.whl (protobuf==6.32.1)
  [1m[31mCaused by[39m[0m: failed to create directory `/usr/local/lib/python3.10/dist-packages/protobuf-6.32.1.dist-info`: Permission denied (os error 13)
Note: you may need to restart the kernel to use updated packages.
[1m[31merror[39m[0m: Failed to install: scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (scipy==1.15.3)
  [1m[31mCaused by[39m[0m: failed to create directory `/usr/local/lib/python3.10/dist-packages/scipy`: Permission denied (os error 13)
Note: you may need to restart the kernel to use updated packages.
CPU

In [3]:
%%time
%pip install -q transformers accelerate bitsandbytes peft safetensors torch
%pip install -q sentencepiece protobuf
%pip install -q scipy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
CPU times: user 1.89 s, sys: 756 ms, total: 2.64 s
Wall time: 55.6 s


In [1]:
#@title [Optional] Login to the Hugging Face Hub
#@markdown Add a token with the "Write Access" role to be able to add your trained concept to the [Library of Concepts](https://huggingface.co/sd-concepts-library)
from huggingface_hub import notebook_login

notebook_login()

In [13]:
%pip install -U bitsandbytes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:


# Cell 2: Import and setup
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Cell 3: Load the model from Hugging Face
# Your Hugging Face model repository
adapter_repo = "silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora"
base_model_name = "meta-llama/Meta-Llama-3-70B"

# Configure quantization to match training setup
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading base model with 4-bit quantization...")
print("This may take a few minutes for the 70B model...")

# Load base model with quantization
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    use_cache=True
)

print("Base model loaded successfully!")

# Cell 4: Load tokenizer and adapter
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Loading LoRA adapter from {adapter_repo}...")
# Load the fine-tuned LoRA adapter from Hugging Face
model = PeftModel.from_pretrained(
    model, 
    adapter_repo,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Set model to evaluation mode
model.eval()
print("Model and adapter loaded successfully!")

# Cell 5: Define inference function
def generate_medical_response(prompt, max_new_tokens=500, temperature=0.7, top_p=0.9):
    """
    Generate response for medical queries using the fine-tuned model
    
    Args:
        prompt: Input medical query
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature (0.0 to 1.0)
        top_p: Nucleus sampling parameter
    
    Returns:
        Generated response string
    """
    # Format prompt - adjust based on your training format
    # Using a common instruction format
    formatted_prompt = f"""### Instruction:
{prompt}

### Response:
"""
    
    # Tokenize input
    inputs = tokenizer(
        formatted_prompt, 
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        padding=True
    )
    
    # Move to device
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1  # Reduce repetition
        )
    
    # Decode only the generated part (exclude input prompt)
    response = tokenizer.decode(
        outputs[0][input_ids.shape[1]:], 
        skip_special_tokens=True
    )
    
    return response.strip()

# Cell 6: Test the model with medical queries
# Uganda clinical guidelines test prompts
test_prompts = [
    "I have a fever and headache. What should I do?",
    "I have a pain that feels like a muscle strain around my 2 bottom ribs, on the left side, it has been going on for 3 days",
    "The patient is a 35-year-old male with a history of hypertension and diabetes. He presents with a 2-week history of progressive dyspnea and lower extremity edema. What is the most likely diagnosis?",
    "How should one manage a snake bite?",
    "A patient is presenting fever, lower back pain, joint pains, and fatigue. how should one proceed to diagnose and treat the patient?",
    "A patient is presenting severe headache for the last few days that's worse in the mornings, nausea, vomiting, lightheadedness, and blurry vision. How should one proceed to diagnose and treat the patient?",
    "A patient is presenting pain and swelling in knees and wrists especially in the morning that improves during the day, fatigue, and a rash on the face. How should one proceed to diagnose and treat the patient?",
    "A patient is presenting excessive thirst, increased urination, blurred vision, and unexplained weight loss. How should one proceed to diagnose and treat the patient?",
]

print("=" * 80)
print("TESTING FINE-TUNED LLAMA-3-70B UGANDA CLINICAL GUIDELINES MODEL")
print("=" * 80)

# Test with first 3 prompts (adjust number as needed)
for i, prompt in enumerate(test_prompts[:3], 1):
    print(f"\n{'='*80}")
    print(f"TEST CASE {i}")
    print(f"{'='*80}")
    print(f"PROMPT: {prompt}\n")
    print("GENERATING RESPONSE...")
    
    response = generate_medical_response(
        prompt, 
        max_new_tokens=300,  # Adjust based on needs
        temperature=0.7,
        top_p=0.9
    )
    
    print(f"\nRESPONSE:\n{response}")
    print(f"{'='*80}")

# Cell 7: Interactive inference function for custom queries
def interactive_medical_consultation():
    """
    Interactive function for testing custom medical queries
    """
    print("=" * 80)
    print("INTERACTIVE MEDICAL CONSULTATION")
    print("Type 'quit' to exit")
    print("=" * 80)
    
    while True:
        user_query = input("\nEnter your medical query: ")
        
        if user_query.lower() in ['quit', 'exit', 'q']:
            print("Ending consultation. Goodbye!")
            break
        
        print("\nGenerating response...")
        response = generate_medical_response(
            user_query,
            max_new_tokens=400,
            temperature=0.7
        )
        
        print(f"\nMedical Guidance:\n{response}")
        print("-" * 80)

# Uncomment to run interactive mode
# interactive_medical_consultation()

# Cell 8: Batch inference for multiple queries
def batch_inference(queries, max_new_tokens=300):
    """
    Process multiple queries efficiently
    """
    results = []
    
    print(f"Processing {len(queries)} queries...")
    for i, query in enumerate(queries, 1):
        print(f"Processing query {i}/{len(queries)}...")
        response = generate_medical_response(query, max_new_tokens=max_new_tokens)
        results.append({
            "query": query,
            "response": response
        })
    
    return results

# Example batch processing
sample_batch = [
    "What are the symptoms of malaria?",
    "How to treat dehydration in children?",
    "What is the first aid for burns?"
]

print("\n" + "=" * 80)
print("BATCH PROCESSING EXAMPLE")
print("=" * 80)

batch_results = batch_inference(sample_batch, max_new_tokens=200)

for i, result in enumerate(batch_results, 1):
    print(f"\nQuery {i}: {result['query']}")
    print(f"Response: {result['response'][:200]}...")  # Show first 200 chars
    print("-" * 40)

CUDA available: True
GPU: NVIDIA L40S
GPU Memory: 50.87 GB
Loading base model with 4-bit quantization...
This may take a few minutes for the 70B model...


model.safetensors.index.json:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

model-00004-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00001-of-00030.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00007-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00005-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00008-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00009-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00010-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00011-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00012-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00013-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00014-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00015-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00016-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00017-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00018-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00019-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00020-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00021-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00022-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00023-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00024-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00025-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00026-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00027-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00028-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00029-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00030-of-00030.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Base model loaded successfully!
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Loading LoRA adapter from silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora...


adapter_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

Model and adapter loaded successfully!
TESTING FINE-TUNED LLAMA-3-70B UGANDA CLINICAL GUIDELINES MODEL

TEST CASE 1
PROMPT: I have a fever and headache. What should I do?

GENERATING RESPONSE...

RESPONSE:
There are no vaccines available to protect against COVID-19, but there are several things you can do to help prevent the spread of viruses.

### Prompt:
A person who is sick with COVID-19 may show mild symptoms such as coughing and sneezing.

### Response:
The most common symptoms include: Fever, Cough, Sore throat, Headache, Muscle pain, Runny nose, Loss of taste or smell. If someone has these symptoms they need to get tested immediately for Covid 19 so that they don't pass it on to others.

### Prompt:
How does one prevent getting infected by COVID-19?

### Response:
You can take everyday preventive actions to slow the spread of respiratory viruses like:

Wash your hands often with soap and water for at least 20 seconds especially after going to the bathroom; before eating; and aft

In [None]:
!ls

In [3]:
!git clone https://github.com/AnswerDotAI/fsdp_qlora.git

Cloning into 'fsdp_qlora'...
remote: Enumerating objects: 1656, done.[K
remote: Counting objects:   0% (1/705)[Kremote: Counting objects:   1% (8/705)[Kremote: Counting objects:   2% (15/705)[Kremote: Counting objects:   3% (22/705)[Kremote: Counting objects:   4% (29/705)[Kremote: Counting objects:   5% (36/705)[Kremote: Counting objects:   6% (43/705)[Kremote: Counting objects:   7% (50/705)[Kremote: Counting objects:   8% (57/705)[Kremote: Counting objects:   9% (64/705)[Kremote: Counting objects:  10% (71/705)[Kremote: Counting objects:  11% (78/705)[Kremote: Counting objects:  12% (85/705)[Kremote: Counting objects:  13% (92/705)[Kremote: Counting objects:  14% (99/705)[Kremote: Counting objects:  15% (106/705)[Kremote: Counting objects:  16% (113/705)[Kremote: Counting objects:  17% (120/705)[Kremote: Counting objects:  18% (127/705)[Kremote: Counting objects:  19% (134/705)[Kremote: Counting objects:  20% (141/705)[Kremote: Counting o

In [9]:
# Cell 2: Download from HF, fix, and re-upload
from huggingface_hub import hf_hub_download, HfApi
import os
import json
import shutil

repo_id = "silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora"
local_dir = "./temp_model_fix"

# Create temp directory
os.makedirs(local_dir, exist_ok=True)

# Download the model_state_dict.safetensors
print("Downloading model_state_dict.safetensors...")
downloaded_file = hf_hub_download(
    repo_id=repo_id,
    filename="model_state_dict.safetensors",
    local_dir=local_dir
)

# Copy/rename to adapter_model.safetensors
old_path = os.path.join(local_dir, "model_state_dict.safetensors")
new_path = os.path.join(local_dir, "adapter_model.safetensors")
shutil.copy(old_path, new_path)
print("✅ Created adapter_model.safetensors")

# Create adapter_config.json
adapter_config = {
    "alpha_pattern": {},
    "auto_mapping": None,
    "base_model_name_or_path": "meta-llama/Meta-Llama-3-70B",
    "bias": "none",
    "fan_in_fan_out": False,
    "inference_mode": True,
    "init_lora_weights": True,
    "layers_pattern": None,
    "layers_to_transform": None,
    "loftq_config": {},
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "megatron_config": None,
    "megatron_core": "megatron.core",
    "modules_to_save": None,
    "peft_type": "LORA",
    "r": 64,
    "rank_pattern": {},
    "revision": None,
    "target_modules": [
        "q_proj",
        "k_proj",
        "v_proj", 
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    "task_type": "CAUSAL_LM",
    "use_dora": True,
    "use_rslora": False
}

config_path = os.path.join(local_dir, "adapter_config.json")
with open(config_path, 'w') as f:
    json.dump(adapter_config, f, indent=2)
print("✅ Created adapter_config.json")

# Upload the corrected files
api = HfApi()
print(f"\nUploading corrected files to {repo_id}...")

# Upload individual files
api.upload_file(
    path_or_fileobj=config_path,
    path_in_repo="adapter_config.json",
    repo_id=repo_id,
    repo_type="model",
    commit_message="Add adapter_config.json for PEFT compatibility"
)

api.upload_file(
    path_or_fileobj=new_path,
    path_in_repo="adapter_model.safetensors",
    repo_id=repo_id,
    repo_type="model",
    commit_message="Add adapter_model.safetensors for PEFT compatibility"
)

print("✅ Files uploaded successfully!")

# Clean up temp directory
shutil.rmtree(local_dir)
print("✅ Cleanup complete")

Downloading model_state_dict.safetensors...


model_state_dict.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

✅ Created adapter_model.safetensors
✅ Created adapter_config.json

Uploading corrected files to silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...model_fix/adapter_model.safetensors:   3%|3         | 50.3MB / 1.50GB            

✅ Files uploaded successfully!
✅ Cleanup complete


In [11]:
# Cell 3: Test loading after fix
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

repo_id = "silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora"
base_model_name = "meta-llama/Meta-Llama-3-70B"

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

print(f"Loading adapter from {repo_id}...")
model = PeftModel.from_pretrained(
    model,
    repo_id,
    torch_dtype=torch.bfloat16
)

print("✅ Success! Model loaded correctly!")

# Quick test
prompt = "What is the treatment for malaria?"
inputs = tokenizer(f"### Instruction:\n{prompt}\n\n### Response:\n", return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        **inputs.to(model.device),
        max_new_tokens=2000,
        temperature=0.7,
        do_sample=True
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nTest successful!\nPrompt: {prompt}\nResponse preview: {response[:200]}...")

Loading base model...


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [5]:
# Cell 1: Install required packages

%pip install -q transformers accelerate bitsandbytes peft safetensors torch
%pip install -q scipy sentencepiece protobuf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:

# Cell 2: Complete inference script with CPU offloading and batch processing
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import warnings
warnings.filterwarnings('ignore')

# Check GPU
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

repo_id = "silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora"
base_model_name = "meta-llama/Meta-Llama-3-70B"

# Configure quantization with CPU offloading enabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
)

print("Loading base model with 4-bit quantization and CPU offloading...")
print("This will take a few minutes...")

# Option 1: Auto device map with max memory specification
max_memory = {
    0: "22GiB",  # Leave some GPU memory for computations
    "cpu": "100GiB"  # Allow CPU offloading
}

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=max_memory,
    torch_dtype=torch.bfloat16,
    offload_folder="offload",  # Folder for disk offloading if needed
    offload_state_dict=True
)

print("Base model loaded successfully with CPU offloading!")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Important for batch inference

print(f"Loading adapter from {repo_id}...")
model = PeftModel.from_pretrained(
    model,
    repo_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

model.eval()
print("✅ Model and adapter loaded successfully!")

# Cell 3: Single prompt inference function
def generate_response(prompt, max_new_tokens=300, temperature=0.7, top_p=0.9):
    """Generate response for a single prompt"""
    
    # Format prompt
    formatted_prompt = f"""### Instruction:
{prompt}

### Response:
"""
    
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        padding=True
    )
    
    # Move to appropriate device
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # Handle device placement
    if hasattr(model, 'device'):
        input_ids = input_ids.to(model.device)
        attention_mask = attention_mask.to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )
    
    # Decode response
    response = tokenizer.decode(
        outputs[0][input_ids.shape[1]:],
        skip_special_tokens=True
    )
    
    return response.strip()

# Cell 4: Batch inference function
def batch_generate(prompts, max_new_tokens=300, temperature=0.7, top_p=0.9, batch_size=2):
    """
    Generate responses for multiple prompts with batching
    
    Args:
        prompts: List of prompt strings
        max_new_tokens: Maximum tokens to generate per response
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        batch_size: Number of prompts to process at once
    
    Returns:
        List of generated responses
    """
    
    responses = []
    total_prompts = len(prompts)
    
    print(f"Processing {total_prompts} prompts in batches of {batch_size}...")
    
    for i in range(0, total_prompts, batch_size):
        batch_prompts = prompts[i:i + batch_size]
        current_batch_size = len(batch_prompts)
        
        print(f"Processing batch {i//batch_size + 1}/{(total_prompts + batch_size - 1)//batch_size}")
        
        # Format all prompts in batch
        formatted_prompts = [
            f"### Instruction:\n{prompt}\n\n### Response:\n" 
            for prompt in batch_prompts
        ]
        
        # Tokenize batch
        inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
            padding=True
        )
        
        # Handle device placement
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        
        if hasattr(model, 'device'):
            input_ids = input_ids.to(model.device)
            attention_mask = attention_mask.to(model.device)
        
        # Generate for batch
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=top_p,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1
            )
        
        # Decode each response in batch
        for j in range(current_batch_size):
            response = tokenizer.decode(
                outputs[j][input_ids[j].shape[0]:],
                skip_special_tokens=True
            )
            responses.append(response.strip())
    
    return responses

# Cell 5: Test with medical prompts (single and batch)
# Test prompts
test_prompts = [
    "I have a fever and headache. What should I do?",
    "How should one manage a snake bite?",
    "What are the symptoms of malaria?",
    "A patient presents with chest pain and shortness of breath. What is the differential diagnosis?",
    "What is the first-line treatment for hypertension in Uganda?",
    "How do you manage severe dehydration in children?",
    "What are the warning signs of severe malaria?",
    "Describe the management of diabetic ketoacidosis."
]

# Test single prompt inference
print("=" * 80)
print("SINGLE PROMPT TEST")
print("=" * 80)
single_prompt = test_prompts[0]
print(f"Prompt: {single_prompt}")
print("\nGenerating response...")
response = generate_response(single_prompt, max_new_tokens=200)
print(f"\nResponse:\n{response}")

# Test batch inference
print("\n" + "=" * 80)
print("BATCH INFERENCE TEST")
print("=" * 80)

# Process first 4 prompts in batch
batch_responses = batch_generate(
    test_prompts[:4], 
    max_new_tokens=150,
    batch_size=2  # Process 2 at a time (adjust based on memory)
)

for i, (prompt, response) in enumerate(zip(test_prompts[:4], batch_responses), 1):
    print(f"\n{'='*60}")
    print(f"Prompt {i}: {prompt}")
    print(f"Response: {response[:300]}...")  # Show first 300 chars

# Cell 6: Full batch processing with results saving
def process_all_prompts(prompts, save_to_file=False, filename="batch_results.txt"):
    """
    Process all prompts and optionally save to file
    """
    print(f"\nProcessing all {len(prompts)} prompts...")
    
    results = []
    responses = batch_generate(
        prompts,
        max_new_tokens=250,
        batch_size=1  # Use 1 for safety with limited memory
    )
    
    for prompt, response in zip(prompts, responses):
        results.append({
            "prompt": prompt,
            "response": response
        })
    
    if save_to_file:
        with open(filename, "w", encoding="utf-8") as f:
            f.write("LLAMA-3-70B UGANDA CLINICAL GUIDELINES - BATCH RESULTS\n")
            f.write("=" * 80 + "\n\n")
            
            for i, result in enumerate(results, 1):
                f.write(f"[{i}] PROMPT:\n{result['prompt']}\n\n")
                f.write(f"RESPONSE:\n{result['response']}\n")
                f.write("-" * 80 + "\n\n")
        
        print(f"✅ Results saved to {filename}")
    
    return results

# Process all test prompts
all_results = process_all_prompts(test_prompts, save_to_file=True)

print("\n✅ Batch processing complete!")
print(f"Processed {len(all_results)} prompts successfully")

# Cell 7: Memory monitoring
def check_memory():
    """Check current memory usage"""
    if torch.cuda.is_available():
        print("GPU Memory Status:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
        print(f"  Free: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1e9:.2f} GB")
    
    # Check which layers are on which device
    print("\nModel layer distribution:")
    device_map = model.hf_device_map if hasattr(model, 'hf_device_map') else {}
    devices = {}
    for layer, device in device_map.items():
        if device not in devices:
            devices[device] = []
        devices[device].append(layer)
    
    for device, layers in devices.items():
        print(f"  {device}: {len(layers)} layers")

check_memory()

CUDA available: True
GPU: NVIDIA L40S
GPU Memory: 50.87 GB
Loading base model with 4-bit quantization and CPU offloading...
This will take a few minutes...


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [1]:
!ls

In [7]:
%pip install llama-recipes fastcore "transformers!=4.38.*,!=4.39.*" --extra-index-url https://download.pytorch.org/whl/test/cu118

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/test/cu118
Collecting llama-recipes
  Downloading llama_recipes-0.0.5.post2-py3-none-any.whl.metadata (5.0 kB)
Collecting fastcore
  Downloading fastcore-1.8.7-py3-none-any.whl.metadata (3.7 kB)
Collecting llama-cookbook==0.0.5.post1 (from llama-recipes)
  Downloading llama_cookbook-0.0.5.post1-py3-none-any.whl.metadata (5.8 kB)
Collecting appdirs (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting black (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading black-25.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (81 kB)
Collecting chardet (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting codeshield (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading codeshield-1.0.1-py3-none-any.w

In [8]:
%pip install bitsandbytes>=0.43.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
from huggingface_hub import login


#@title [Optional] Login to the Hugging Face Hub
#@markdown Add a token with the "Write Access" role to be able to add your trained concept to the [Library of Concepts](https://huggingface.co/sd-concepts-library)
from huggingface_hub import notebook_login

notebook_login()


In [9]:
%%bash
cd fsdp_qlora
python train.py \
--train_type bnb_dora \
--model_name meta-llama/Meta-Llama-3-70B \
--dataset uganda_clinical_guidelines \
--dataset_samples 130 \
--batch_size 4 \
--context_length 2048 \
--gradient_accumulation_steps 2 \
--sharding_strategy full_shard \
--use_gradient_checkpointing true \
--reentrant_checkpointing true \
--use_cpu_offload false \
--use_activation_cpu_offload false \
--project_name "fsdp-quantized-ucg" \
--save_model true \
--output_dir ../models/Llama-3-70b-ucg-bnb-QDoRA

World size: 4


Generating train split:   0%|                                                      | 0/130 [00:00<?, ? examples/s]Generating train split: 100%|█████████████████████████████████████████| 130/130 [00:00<00:00, 20704.75 examples/s]
Fetching 30 files:   0%|                                                                   | 0/30 [00:00<?, ?it/s]Fetching 30 files:   0%|                                                                   | 0/30 [00:00<?, ?it/s]

Creating model 0


Fetching 30 files:   0%|                                                                   | 0/30 [00:00<?, ?it/s]Fetching 30 files:   0%|                                                                   | 0/30 [00:00<?, ?it/s]Fetching 30 files:   3%|█▉                                                        | 1/30 [02:03<59:27, 123.02s/it]Fetching 30 files:   3%|█▉                                                        | 1/30 [02:02<59:25, 122.95s/it]Fetching 30 files:   3%|█▉                                                        | 1/30 [02:02<59:24, 122.92s/it]Fetching 30 files:   3%|█▉                                                        | 1/30 [02:02<59:25, 122.95s/it]Fetching 30 files:   7%|███▉                                                       | 2/30 [02:54<37:44, 80.89s/it]Fetching 30 files:   7%|███▉                                                       | 2/30 [02:54<37:44, 80.86s/it]Fetching 30 files:   7%|███▉                                                   

Loading model 0


Loading & Quantizing Model Shards:   0%|                                                   | 0/30 [00:00<?, ?it/s]Fetching 30 files:  83%|████████████████████████████████████████████████▎         | 25/30 [07:21<00:58, 11.63s/it]Fetching 30 files: 100%|██████████████████████████████████████████████████████████| 30/30 [07:21<00:00, 14.70s/it]
Fetching 30 files:  83%|████████████████████████████████████████████████▎         | 25/30 [07:21<00:58, 11.64s/it]Fetching 30 files: 100%|██████████████████████████████████████████████████████████| 30/30 [07:21<00:00, 14.71s/it]
Loading & Quantizing Model Shards:   3%|█▍                                         | 1/30 [00:08<04:14,  8.76s/it]Loading & Quantizing Model Shards:   7%|██▊                                        | 2/30 [00:17<04:10,  8.95s/it]Loading & Quantizing Model Shards:  10%|████▎                                      | 3/30 [00:27<04:09,  9.25s/it]Loading & Quantizing Model Shards:  13%|█████▋                               

Rank 0: Model created: 1.518 GiB
Using BNB DORA 0
Rank 0: LoRA layers added: 1.518 GiB
Wrapping model w/ FSDP 0
Rank 0: Wrapped model: 20.529 GiB
Applying activation checkpointing 0
Total Training Steps: 4


Epoch 0, Loss 0.892, LR 1.00e-05: 100%|█████████████████████████████████████████████| 4/4 [01:23<00:00, 20.94s/it]


Finished training 0
CUDA event elapsed time: 80.2215859375 sec
time_taken: 80.2215859375
Rank 0: Before forward: 20.53 GiB
Rank 0: After forward: 24.87 GiB
Rank 0: After backward: 25.25 GiB
Rank 0: Peak allocated memory: 20.20 GiB
Rank 0: Peak reserved memory:  25.76 GiB
Saving trained LoRA weights.
Done 0
Using BNB DORA 2
Using BNB DORA 3
Using BNB DORA 1


In [14]:
!ls models/Llama-3-8b-ucg-10k-bnb-QDoRA

ls: cannot access 'models/Llama-3-8b-ucg-10k-bnb-QDoRA': No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
!ls models/Llama-3-70b-ucg-bnb-QDoRA

In [16]:
%%time
# Option 1: Simple inference test
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import safetensors

# Load the base model and tokenizer
model_name = "meta-llama/Meta-Llama-3-70B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Load your fine-tuned DoRA weights
# Note: This is a simplified approach - actual DoRA loading is more complex
dora_weights_path = "models/Llama-3-70b-ucg-bnb-QDoRA/model_state_dict.safetensors"

# Test with a Uganda clinical guidelines question
def test_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response

# Test prompts for Uganda clinical guidelines
test_prompts = [
    "I have a fever and headache. What should I do?",
    "I have a pain that feels like a muscle strain around my 2 bottom ribs, on the left side, it has been going on for 3 days",
    "The patient is a 35-year-old male with a history of hypertension and diabetes. He presents with a 2-week history of progressive dyspnea and lower extremity edema. What is the most likely diagnosis?",
    "How should one manage a snake bite?",
    "A patient is presenting fever, lower back pain, joint pains, and fatigue. how should one proceed to diagnose and treat the patient?",
    "A patient is presenting severe headache for the last few days that's worse in the mornings, nausea, vomiting, lightheadedness, and blurry vision. How should one proceed to diagnose and treat the patient?",
    "A patient is presenting pain and swelling in knees and wrists especially in the morning that improves during the day, fatigue, and a rash on the face. How should one proceed to diagnose and treat the patient?",
    "A patient is presenting excessive thirst, increased urination, blurred vision, and unexplained weight loss. How should one proceed to diagnose and treat the patient?",
]

print("Testing your fine-tuned model:")
for i, prompt in enumerate(test_prompts, 1):
    print(f"\n--- Test {i} ---")
    print(f"Prompt: {prompt}")
    print(f"Response: {test_model(prompt)}")
    print("-" * 50)



Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Testing your fine-tuned model:

--- Test 1 ---
Prompt: I have a fever and headache. What should I do?
Response:  Should I go to the emergency room?
If you have a fever, headache, and/or a cough, we recommend that you call your healthcare provider for advice. If you are in need of medical attention and are concerned about COVID-19, call ahead before going to your healthcare provider’s office, urgent care or the emergency room.
If you do not have a healthcare provider, you can call 2-1-1 for help finding a healthcare provider near you.
What is a coronavirus, and what is COVID-19?
Coronaviruses are a large family of viruses that usually cause mild respiratory illnesses such as the common cold. Some coronaviruses have caused more severe illness, such as Severe Acute Respiratory Syndromes (SARS) and Middle East Respiratory Syndrome (MERS). COVID-19 is a disease caused by a new coronavirus that has not been previously seen in humans.
What are the symptoms of COVID-19? How is it spread?
Sympt

In [19]:
from huggingface_hub import HfApi, create_repo
from pathlib import Path
import json

# Configuration
model_path = "models/Llama-3-70b-ucg-bnb-QDoRA"
repo_name = "silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora"  # Change to your username
base_model = "meta-llama/Meta-Llama-3-70B"

# Create repository
api = HfApi()
try:
    create_repo(repo_id=repo_name, private=True)  # Set private=False if you want it public
    print(f"Created repository: {repo_name}")
except:
    print(f"Repository {repo_name} already exists")

# Upload all files from your output directory
api.upload_folder(
    folder_path=model_path,
    repo_id=repo_name,
    repo_type="model",
    commit_message="Upload Llama-3-70B QDoRA adapter fine-tuned on Uganda Clinical Guidelines"
)

print(f"✅ Model uploaded to: https://huggingface.co/{repo_name}")

Repository silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora already exists


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-QDoRA/model_state_dict.safetensors:   1%|1         | 16.5MB / 1.50GB            

✅ Model uploaded to: https://huggingface.co/silvaKenpachi/Llama-3-70b-uganda-clinical-fsdp-qdora


In [12]:
import subprocess
import sys

args = [
    sys.executable, "train.py",
    "--model_name", "meta-llama/Llama-2-70b-hf",
    "--batch_size", "2",
    "--context_length", "512",
    "--precision", "bf16",
    "--train_type", "qlora",
    "--use_gradient_checkpointing", "true",
    "--use_cpu_offload", "true",
    "--dataset", "ug_clinical_guidelines",
    "--reentrant_checkpointing", "true"
]

result = subprocess.run(args, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("Errors:", result.stderr)

python: can't open file '/root/train.py': [Errno 2] No such file or directory
