In [1]:
import subprocess
import sys
import os
import urllib.request
import zipfile
import ssl

# Create an SSL context that doesn't verify certificates
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# Install packages first
print("🔧 Installing Python packages...")
packages = [
    "torch", "torchvision", "torchaudio", 
    "transformers", "datasets", "accelerate", "peft",
    "bitsandbytes>=0.43.0", "safetensors", "fastcore", "requests"
]

for package in packages:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--user", package], 
                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(f"✅ Installed {package}")
    except:
        print(f"❌ Failed to install {package}")

# Download with SSL verification disabled
print("\n📥 Downloading repository...")
url = "https://github.com/AnswerDotAI/fsdp_qlora/archive/refs/heads/main.zip"

try:
    # Use urllib with disabled SSL verification
    urllib.request.urlretrieve(url, "fsdp_qlora.zip", context=ssl_context)
    print("✅ Downloaded using urllib")
except Exception as e:
    print(f"❌ urllib failed: {e}")
    
    # Fallback to requests
    try:
        import requests
        print("🔄 Trying with requests...")
        response = requests.get(url, verify=False)
        with open("fsdp_qlora.zip", "wb") as f:
            f.write(response.content)
        print("✅ Downloaded using requests")
    except Exception as e2:
        print(f"❌ requests also failed: {e2}")

# Extract if download succeeded
if os.path.exists("fsdp_qlora.zip"):
    print("📂 Extracting repository...")
    
    # Clean up existing directory
    if os.path.exists("fsdp_qlora"):
        import shutil
        shutil.rmtree("fsdp_qlora")
    
    with zipfile.ZipFile("fsdp_qlora.zip", 'r') as zip_ref:
        zip_ref.extractall(".")
    
    os.rename("fsdp_qlora-main", "fsdp_qlora")
    os.remove("fsdp_qlora.zip")
    print("✅ Setup complete! Repository is in ./fsdp_qlora")
    
    # Verify
    if os.path.exists("fsdp_qlora/train.py"):
        print("🚀 train.py found - ready to train!")
    else:
        print("❌ train.py not found")
else:
    print("❌ Download failed completely")

🔧 Installing Python packages...
✅ Installed torch
✅ Installed torchvision
✅ Installed torchaudio
✅ Installed transformers
✅ Installed datasets
✅ Installed accelerate
✅ Installed peft
✅ Installed bitsandbytes>=0.43.0
✅ Installed safetensors
✅ Installed fastcore
✅ Installed requests

📥 Downloading repository...
❌ urllib failed: urlretrieve() got an unexpected keyword argument 'context'
🔄 Trying with requests...




✅ Downloaded using requests
📂 Extracting repository...
✅ Setup complete! Repository is in ./fsdp_qlora
🚀 train.py found - ready to train!


In [2]:
%pip install huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install llama-recipes fastcore "transformers!=4.38.*,!=4.39.*" --extra-index-url https://download.pytorch.org/whl/test/cu118

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/test/cu118
Collecting llama-recipes
  Downloading llama_recipes-0.0.5.post2-py3-none-any.whl.metadata (5.0 kB)
Collecting llama-cookbook==0.0.5.post1 (from llama-recipes)
  Downloading llama_cookbook-0.0.5.post1-py3-none-any.whl.metadata (5.8 kB)
Collecting appdirs (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting black (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading black-25.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (81 kB)
Collecting chardet (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting codeshield (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downloading codeshield-1.0.1-py3-none-any.whl.metadata (5.2 kB)
Collecting evaluate (from llama-cookbook==0.0.5.post1->llama-recipes)
  Downl

In [4]:
%pip install bitsandbytes>=0.43.0

Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import login
import getpass

# Get your token securely
hf_token = getpass.getpass("Enter your Hugging Face token: ")

# Login programmatically
login(token=hf_token)

print("✅ Successfully logged in to Hugging Face!")

  from .autonotebook import tqdm as notebook_tqdm


Enter your Hugging Face token:  ········


✅ Successfully logged in to Hugging Face!


In [2]:
import os
import subprocess

# Set environment variable
os.environ['BNB_CUDA_VERSION'] = '125'

# Install ONLY the essential fixes
commands = [
    ["pip", "install", "transformers==4.47.1", "--upgrade"],
    ["pip", "install", "bitsandbytes>=0.43.0", "--upgrade", "--force-reinstall"]
]

for cmd in commands:
    print(f"Running: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        if result.returncode != 0:
            print(f"Error: {result.stderr}")
        else:
            print("✅ Success")
    except subprocess.TimeoutExpired:
        print("⚠️ Command timed out")

print("✅ Essential dependencies updated")

Running: pip install transformers==4.47.1 --upgrade
✅ Success
Running: pip install bitsandbytes>=0.43.0 --upgrade --force-reinstall
✅ Success
✅ Essential dependencies updated


In [3]:
import os
os.chdir("fsdp_qlora")

# Apply fixes to train.py
with open("train.py", "r") as f:
    content = f.read()

# Apply transformers fix
if "LLAMA_ATTENTION_CLASSES" in content:
    print("🔧 Applying transformers fix...")
    
    # Simple replacement approach
    content = content.replace(
        "LLAMA_ATTENTION_CLASSES,", 
        "LlamaAttention,"
    )
    content = content.replace(
        "MISTRAL_ATTENTION_CLASSES,", 
        "MistralAttention,"
    )
    content = content.replace(
        "(*LLAMA_ATTENTION_CLASSES.values(), *MISTRAL_ATTENTION_CLASSES.values())",
        "(LlamaAttention, MistralAttention)"
    )
    
    # Add dataset choice
    if "uganda_clinical_guidelines" not in content:
        content = content.replace(
            '"orca_math"]) = "alpaca_sample",',
            '"orca_math", "uganda_clinical_guidelines"]) = "alpaca_sample",'
        )
    
    with open("train.py", "w") as f:
        f.write(content)
    
    print("✅ train.py fixed")
    

🔧 Applying transformers fix...
✅ train.py fixed


In [4]:
# Test if fixes work
try:
    import bitsandbytes
    print("✅ Bitsandbytes works")
except Exception as e:
    print(f"❌ Bitsandbytes issue: {e}")

print("Ready for training!")

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=



❌ Bitsandbytes issue: Failed to find C compiler. Please specify via CC environment variable.
Ready for training!


In [6]:
import subprocess
import os

def setup_environment():
    """Setup the environment to avoid compiler issues"""
    
    print("🔧 Setting up environment for training...")
    
    # Step 1: Set environment variables
    os.environ['BNB_CUDA_VERSION'] = '125'
    os.environ['CC'] = '/usr/bin/gcc'
    os.environ['CXX'] = '/usr/bin/g++'
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    
    # Step 2: Install build tools if possible
    try:
        print("📦 Installing build tools...")
        subprocess.run(["apt", "update"], capture_output=True, timeout=60)
        result = subprocess.run(["apt", "install", "-y", "build-essential", "gcc", "g++"], 
                              capture_output=True, timeout=120)
        if result.returncode == 0:
            print("✅ Build tools installed")
        else:
            print("⚠️ Build tools installation failed, proceeding anyway...")
    except Exception as e:
        print(f"⚠️ Could not install build tools: {e}")
    
    # Step 3: Test if bitsandbytes works now
    try:
        import bitsandbytes
        print("✅ Bitsandbytes imports successfully")
        return True
    except Exception as e:
        print(f"❌ Bitsandbytes still has issues: {e}")
        
        # Step 4: Try installing older version
        print("🔄 Trying older bitsandbytes version...")
        try:
            subprocess.run(["pip", "uninstall", "bitsandbytes", "-y"], capture_output=True)
            subprocess.run(["pip", "install", "bitsandbytes==0.41.3"], capture_output=True)
            
            import bitsandbytes
            print("✅ Older bitsandbytes version works")
            return True
        except Exception as e2:
            print(f"❌ Even older version failed: {e2}")
            return False

# Run the setup
if setup_environment():
    print("🚀 Environment ready! Running training...")
    
    # Your training command
    cmd = [
        "python", "train.py",
        "--train_type", "bnb_dora",
        "--model_name", "meta-llama/Llama-2-7b-hf", 
        "--dataset", "ug_clinical_guidelines",  # Fixed dataset name
        "--dataset_samples", "10",
        "--batch_size", "1",
        "--context_length", "256",
        "--num_epochs", "1",
        "--save_model", "false",
        "--log_to", "stdout"
    ]
    
    print("🧪 Running test training...")
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    
    try:
        for line in iter(process.stdout.readline, ''):
            if line:
                print(line.rstrip())
        process.wait()
        print(f"Test completed: {process.returncode}")
    except KeyboardInterrupt:
        print("Interrupted")
        process.terminate()
        
else:
    print("❌ Could not setup environment properly")

🔧 Setting up environment for training...
📦 Installing build tools...
✅ Build tools installed
✅ Bitsandbytes imports successfully
🚀 Environment ready! Running training...
🧪 Running test training...
This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

World size: 2
This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=


Generating train split:   0%|          | 0/130 [00:00<?, ? examples/s]
Generating train split: 100%|██████████

In [7]:
!ls

'Converting the State Dict.ipynb'   fsdp_multi_node.sh	 tests
 LICENSE			    hf_train.py		 train.py
 PROFILING.md			    nbs			 train.sh
 README.md			    profile.sh		 train_hqq_bench.sh
 __pycache__			    profiling_utils.py	 train_sql.sh
 benchmarking			    scripts
 benchmarks_03_2024.md		    table1.sh


In [9]:
import subprocess
import os

# Set environment
os.environ['BNB_CUDA_VERSION'] = '125'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# FULL TRAINING with model saving
cmd = [
    "python", "train.py",
    "--train_type", "bnb_dora",
    "--model_name", "meta-llama/Llama-2-7b-hf", 
    "--dataset", "ug_clinical_guidelines",
    "--dataset_samples", "130",  # Use all your data
    "--batch_size", "2",
    "--context_length", "512",   # Longer context for medical text
    "--precision", "bf16",
    "--num_epochs", "3",         # More epochs for better training
    "--save_model", "true",      # 🔥 SAVE THE MODEL
    "--output_dir", "./uganda_clinical_model",  # Where to save
    "--log_to", "stdout"
]

print("🏥 Training Uganda Clinical Model (FULL)...")
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

try:
    for line in iter(process.stdout.readline, ''):
        if line:
            print(line.rstrip())
    process.wait()
    print(f"Training completed: {process.returncode}")
    
    # Check if model was saved
    if os.path.exists("uganda_clinical_model"):
        print("🎉 Model saved successfully!")
        print("📁 Saved files:")
        for f in os.listdir("uganda_clinical_model"):
            print(f"  📄 {f}")
    
except KeyboardInterrupt:
    print("Interrupted")
    process.terminate()

🏥 Training Uganda Clinical Model (FULL)...
This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

World size: 2
This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

Creating model 0
Loading model 0

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=

This 

In [10]:
import os
import zipfile

def download_model():
    """Package the model for download"""
    if os.path.exists("uganda_clinical_model"):
        print("📦 Packaging model for download...")
        
        # Create a zip file
        with zipfile.ZipFile("uganda_clinical_qdora_model.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk("uganda_clinical_model"):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, file_path)
        
        print("✅ Model packaged as uganda_clinical_qdora_model.zip")
        print(f"📊 File size: {os.path.getsize('uganda_clinical_qdora_model.zip') / 1024 / 1024:.1f} MB")
        
        # In Jupyter, this will be available for download
        print("💾 You can download this file from the Jupyter file browser")
        
    else:
        print("❌ No model directory found. Run training with --save_model true first")

download_model()

📦 Packaging model for download...
✅ Model packaged as uganda_clinical_qdora_model.zip
📊 File size: 217.9 MB
💾 You can download this file from the Jupyter file browser


In [12]:
def test_model():
    """Test the trained model"""
    
    # Load the model for inference
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch
    
    model_path = "./uganda_clinical_model"
    
    if not os.path.exists(model_path):
        print("❌ Model not found. Train with --save_model true first")
        return
    
    print("🔄 Loading trained model...")
    
    try:
        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        print("✅ Model loaded successfully!")
        
        # Test with a medical question
        test_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
I have fever, general body weakness, joint paints and have been getting by mosquitoes often. what could be the cause ?

### Response:"""
        
        print("\n🧪 Testing model...")
        print("Question: What are the symptoms of malaria?")
        print("\nModel Response:")
        
        # Generate response
        inputs = tokenizer(test_prompt, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                max_length=inputs.input_ids.shape[1] + 150,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract just the response part
        response_only = response.split("### Response:")[-1].strip()
        print(response_only)
        
    except Exception as e:
        print(f"❌ Error loading model: {e}")

# Run after you've saved the model
test_model()

🔄 Loading trained model...
❌ Error loading model: Unrecognized model in ./uganda_clinical_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava

In [13]:
import os

def inspect_saved_model():
    """Check what files were actually saved"""
    model_dir = "./uganda_clinical_model"
    
    if os.path.exists(model_dir):
        print("📁 Files in uganda_clinical_model:")
        for file in os.listdir(model_dir):
            file_path = os.path.join(model_dir, file)
            size = os.path.getsize(file_path) / 1024 / 1024  # MB
            print(f"  📄 {file} ({size:.1f} MB)")
        
        # Check for specific files
        expected_files = ["adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"]
        for expected in expected_files:
            if os.path.exists(os.path.join(model_dir, expected)):
                print(f"✅ Found: {expected}")
            else:
                print(f"❌ Missing: {expected}")
    else:
        print("❌ Model directory not found")

inspect_saved_model()

📁 Files in uganda_clinical_model:
  📄 model_state_dict.safetensors (275.4 MB)
❌ Missing: adapter_config.json
❌ Missing: adapter_model.bin
❌ Missing: adapter_model.safetensors
