In [None]:
# Check environment and dependencies
import sys
import torch
import time
from importlib.metadata import version

# Print the interpreter and PyTorch versions, report which of the packages
# this notebook relies on are installed, and pick a human-readable label for
# the device that the availability checks below select.
print("🔍 Environment Check")
print("=" * 50)
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

# Check for required packages
required_packages = ["torch", "safetensors", "tiktoken", "huggingface_hub", "blobfile"]
missing_packages = []

for package in required_packages:
    try:
        # Only the metadata lookup is guarded; printing happens outside the try.
        ver = version(package)
    except Exception:
        # Broad on purpose: any failure to read the package metadata is
        # reported as "not installed" instead of aborting the whole check.
        missing_packages.append(package)
        print(f"{package}: Not installed ❌")
    else:
        print(f"{package}: {ver} ✅")

if missing_packages:
    print(f"\n⚠️  Missing packages: {', '.join(missing_packages)}")
    print("Install with: pip install " + " ".join(missing_packages))
else:
    print("\n✅ All dependencies are installed!")

# Check device availability.
# Look the MPS backend up defensively: a PyTorch build that does not expose
# `torch.backends.mps` should fall through to CPU rather than raise
# AttributeError.
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = "CUDA"
    print(f"🚀 CUDA available: {torch.cuda.get_device_name()}")
elif mps_backend is not None and mps_backend.is_available():
    device = "MPS (Apple Silicon)"
    print("🍎 MPS (Apple Silicon) available")
else:
    device = "CPU"
    print("💻 Using CPU")

print(f"Selected device: {device}")
print("=" * 50)


In [None]:
from fast_llm import FastLLM

# Construct the same FastLLM twice and compare how long each construction
# takes. Durations use time.perf_counter(), a monotonic high-resolution clock,
# so a system clock adjustment or a coarse wall-clock tick cannot distort the
# (possibly very short) second measurement.
print("🔥 FastLLM Instant Startup Demo")
print("=" * 50)

# First load - this will take ~20-25 seconds to download and cache the model
print("⏱️  First load (will be slow, but only once!)...")
start_time = time.perf_counter()

model1 = FastLLM("meta-llama/Llama-3.2-1B-Instruct", context_length=4096, use_float16=True)

first_load_time = time.perf_counter() - start_time
print(f"First load completed in: {first_load_time:.2f}s")

# Now let's create a second instance - this should be instant!
print("\n⚡ Second load (should be instant!)...")
start_time = time.perf_counter()

model2 = FastLLM("meta-llama/Llama-3.2-1B-Instruct", context_length=4096, use_float16=True)

second_load_time = time.perf_counter() - start_time
# Guard the ratio: a zero-length second measurement is reported as infinite speedup.
speedup = first_load_time / second_load_time if second_load_time > 0 else float('inf')

print(f"Second load completed in: {second_load_time:.6f}s")
print(f"🚀 Speedup: {speedup:.0f}x faster!")

print("\n✨ This is the magic of our caching system - similar to Ollama!")


In [None]:
# Stream a short completion from model1 for each of a few sample prompts.
print("📝 Basic Text Generation Demo")
print("=" * 50)

# Test different types of prompts
test_prompts = [
    "What is 53 + 27?",
    "Once upon a time, there was a curious cat who",
    "The future of artificial intelligence is",
    "Explain quantum computing in simple terms:",
]

# Sampling options applied unchanged to every prompt
sampling_options = dict(
    max_new_tokens=50,
    temperature=0.7,
    top_k=40,
    repetition_penalty=1.1,
    use_kv_cache=True,
)

for i, prompt in enumerate(test_prompts, start=1):
    print(f"\n{i}. Prompt: {prompt}")
    print("Response: ", end="")

    # Print each piece as soon as the generator yields it
    for token in model1.generate(prompt, **sampling_options):
        print(token, end="", flush=True)

    print("\n" + "-" * 30)


In [None]:
def benchmark_generation(model, prompt, max_tokens=100, description=""):
    """Stream a completion from ``model`` and report its throughput.

    The generated pieces are printed as they arrive, followed by a small
    metrics summary.

    Args:
        model: Object whose ``generate(prompt, max_new_tokens=..., temperature=...,
            top_k=..., repetition_penalty=..., use_kv_cache=...)`` returns an
            iterable of printable pieces.
        prompt: Text passed to ``model.generate``.
        max_tokens: Forwarded as ``max_new_tokens``.
        description: Label printed in the header line.

    Returns:
        Number of yielded pieces divided by the elapsed seconds, or ``0`` when
        no time elapsed. The measurement spans the whole loop, so it includes
        the time until the first piece and the time spent printing.
    """
    print(f"🔬 {description}")
    print(f"Prompt: {prompt}")
    print("Response: ", end="")

    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), so short runs are measured reliably.
    start_time = time.perf_counter()
    token_count = 0

    for token in model.generate(
        prompt,
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_k=40,
        repetition_penalty=1.1,
        use_kv_cache=True,
    ):
        print(token, end="", flush=True)
        token_count += 1

    generation_time = time.perf_counter() - start_time
    tokens_per_second = token_count / generation_time if generation_time > 0 else 0

    print("\n\n📊 Performance Metrics:")
    print(f"   Tokens generated: {token_count}")
    print(f"   Time taken: {generation_time:.2f}s")
    print(f"   Speed: {tokens_per_second:.2f} tokens/second")
    print("=" * 50)

    return tokens_per_second

# Run the generation benchmark at three output lengths and average the speeds.
print("🚀 Performance Benchmarking")
print("=" * 50)

# (prompt, token budget, header label) for the short, medium and long runs
benchmark_cases = (
    ("The capital of France is", 30, "Short Generation Test (30 tokens)"),
    ("Write a short story about a robot:", 100, "Medium Generation Test (100 tokens)"),
    ("Explain the history of artificial intelligence:", 200, "Long Generation Test (200 tokens)"),
)

# The cases run in the order listed; each result is tokens/second
speed1, speed2, speed3 = [
    benchmark_generation(model1, case_prompt, max_tokens=budget, description=label)
    for case_prompt, budget, label in benchmark_cases
]

average_speed = (speed1 + speed2 + speed3) / 3
print(f"📈 Average Performance: {average_speed:.2f} tokens/second")


In [None]:
def benchmark_kv_cache(model, prompt, use_cache=True, max_tokens=50):
    """Stream a completion with or without the KV cache and report throughput.

    Args:
        model: Object whose ``generate(prompt, max_new_tokens=..., temperature=...,
            top_k=..., repetition_penalty=..., use_kv_cache=...)`` returns an
            iterable of printable pieces.
        prompt: Text passed to ``model.generate``.
        use_cache: Forwarded as ``use_kv_cache``.
        max_tokens: Forwarded as ``max_new_tokens``; defaults to the 50 that
            was previously hard-coded.

    Returns:
        Number of yielded pieces divided by the elapsed seconds, or ``0`` when
        no time elapsed.
    """
    cache_status = "WITH" if use_cache else "WITHOUT"
    print(f"🧪 Testing {cache_status} KV Cache")
    print(f"Prompt: {prompt}")
    print("Response: ", end="")

    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), so the two runs are compared on a stable clock.
    start_time = time.perf_counter()
    token_count = 0

    for token in model.generate(
        prompt,
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_k=40,
        repetition_penalty=1.1,
        use_kv_cache=use_cache,
    ):
        print(token, end="", flush=True)
        token_count += 1

    generation_time = time.perf_counter() - start_time
    tokens_per_second = token_count / generation_time if generation_time > 0 else 0

    print(f"\n📊 {cache_status} KV Cache: {tokens_per_second:.2f} tokens/second")
    print("-" * 30)

    return tokens_per_second

# Benchmark the same prompt with and without the KV cache and report the gain.
print("⚡ KV Cache Performance Comparison")
print("=" * 50)

test_prompt = "Explain machine learning in simple terms:"

# Cached run first, then the uncached baseline
speed_with_cache = benchmark_kv_cache(model1, test_prompt, use_cache=True)
speed_without_cache = benchmark_kv_cache(model1, test_prompt, use_cache=False)

# Relative gain in percent; reported as 0 when the baseline speed is not positive
if speed_without_cache > 0:
    improvement = (speed_with_cache / speed_without_cache - 1) * 100
else:
    improvement = 0

print(f"🚀 KV Cache provides {improvement:.1f}% performance improvement!")


In [None]:
# Print the configuration, parameter statistics and layer structure of the
# model wrapped by model1.
print("🏗️ Architecture Exploration")
print("=" * 50)

# Access the model from FastLLM
llama_model = model1.model
config = model1.config

print("📋 Model Configuration:")
print(f"   Model size: {config['emb_dim']} dimensions ({'1B' if config['emb_dim'] == 2048 else '3B'} parameters)")
print(f"   Vocabulary size: {config['vocab_size']:,}")
print(f"   Context length: {config['context_length']:,}")
print(f"   Number of layers: {config['n_layers']}")
print(f"   Attention heads: {config['n_heads']}")
print(f"   KV groups: {config['n_kv_groups']}")
print(f"   Hidden dimension: {config['hidden_dim']:,}")
print(f"   RoPE base: {config['rope_base']:,}")

print("\n🧮 Model Statistics:")
total_params = sum(p.numel() for p in llama_model.parameters())
print(f"   Total parameters: {total_params:,}")

# Account for weight tying.
# Module.parameters() yields a shared Parameter object only once, so when the
# output head reuses the very same Parameter as the token embedding the total
# above already counts it a single time and nothing must be subtracted.
# Otherwise the head is treated as a separate copy of the embedding, as the
# original calculation assumed, and that copy is removed from the count.
embedding_weight = llama_model.tok_emb.weight
if llama_model.out_head.weight is embedding_weight:
    total_params_unique = total_params
else:
    total_params_unique = total_params - embedding_weight.numel()
print(f"   Unique parameters: {total_params_unique:,}")

# Memory usage estimation
memory_gb = total_params * 2 / (1024**3)  # 2 bytes per parameter for float16
print(f"   Memory usage (float16): {memory_gb:.2f} GB")

print("\n🔍 Architecture Components:")
print(f"   Token Embedding: {llama_model.tok_emb}")
print(f"   Transformer Blocks: {len(llama_model.trf_blocks)} layers")
print(f"   Final Norm: {llama_model.final_norm}")
print(f"   Output Head: {llama_model.out_head}")

# Examine a single transformer block
block = llama_model.trf_blocks[0]
print("\n🔬 First Transformer Block Structure:")
print(f"   Attention: {block.att}")
print(f"   Feed Forward: {block.ff}")
print(f"   Layer Norm 1: {block.norm1}")
print(f"   Layer Norm 2: {block.norm2}")

print("\n🎯 Attention Details:")
att = block.att
print(f"   Query projection: {att.W_query.weight.shape}")
print(f"   Key projection: {att.W_key.weight.shape}")
print(f"   Value projection: {att.W_value.weight.shape}")
print(f"   Output projection: {att.out_proj.weight.shape}")
print(f"   Head dimension: {att.head_dim}")
print(f"   Group size: {att.group_size}")


In [None]:
# Load the 3B model, then compare its configuration, parameter count and
# sample output with the already-loaded 1B model (model1).
print("⚖️ Model Comparison: 1B vs 3B")
print("=" * 50)

# Load 3B model (this will demonstrate our fix for multi-file loading).
# Only the load itself sits in this try, so the loading hints below are
# printed solely for failures that really happened while loading.
try:
    print("🔄 Loading 3B model...")
    start_time = time.perf_counter()  # monotonic clock for duration measurement
    model_3b = FastLLM("meta-llama/Llama-3.2-3B-Instruct", context_length=4096, use_float16=True)
    load_time_3b = time.perf_counter() - start_time
    print(f"3B model loaded in: {load_time_3b:.2f}s")
except Exception as e:
    print(f"❌ Error loading 3B model: {e}")
    print("This might be due to:")
    print("- Insufficient memory")
    print("- Network issues downloading the model")
    print("- Missing Hugging Face authentication for gated models")
else:
    # The comparison keeps its own guard so the cell still never raises, but a
    # failure here is no longer reported as a loading problem.
    try:
        # Compare configurations
        config_1b = model1.config
        config_3b = model_3b.config

        print("\n📊 Configuration Comparison:")
        print(f"{'Metric':<20} {'1B Model':<15} {'3B Model':<15}")
        print("-" * 50)
        print(f"{'Embedding Dim':<20} {config_1b['emb_dim']:<15} {config_3b['emb_dim']:<15}")
        print(f"{'Layers':<20} {config_1b['n_layers']:<15} {config_3b['n_layers']:<15}")
        print(f"{'Attention Heads':<20} {config_1b['n_heads']:<15} {config_3b['n_heads']:<15}")
        print(f"{'Hidden Dim':<20} {config_1b['hidden_dim']:<15} {config_3b['hidden_dim']:<15}")

        # Compare parameter counts
        params_1b = sum(p.numel() for p in model1.model.parameters())
        params_3b = sum(p.numel() for p in model_3b.model.parameters())

        print("\n🧮 Parameter Comparison:")
        print(f"1B Model: {params_1b:,} parameters")
        print(f"3B Model: {params_3b:,} parameters")
        print(f"Ratio: {params_3b/params_1b:.2f}x larger")

        # Compare generation quality on the same prompt
        test_prompt = "Write a creative short story about time travel:"

        print("\n📝 Generation Quality Comparison")
        print(f"Prompt: {test_prompt}")

        print("\n🤖 1B Model Response:")
        for token in model1.generate(test_prompt, max_new_tokens=80, temperature=0.8, top_k=40):
            print(token, end="", flush=True)

        print("\n\n🧠 3B Model Response:")
        for token in model_3b.generate(test_prompt, max_new_tokens=80, temperature=0.8, top_k=40):
            print(token, end="", flush=True)

        print("\n\n✅ Both models loaded successfully!")
    except Exception as e:
        print(f"❌ Error comparing 1B and 3B models: {e}")


In [None]:
def compare_generation_settings(model, prompt, settings_list):
    """Stream one completion of ``prompt`` per sampling preset.

    Each entry of ``settings_list`` is a mapping with the keys ``'name'``,
    ``'temperature'``, ``'top_k'`` and ``'repetition_penalty'``. For every
    preset the values are printed, then the pieces yielded by
    ``model.generate`` (60 new tokens requested, KV cache enabled) are printed
    as they arrive. Nothing is returned.
    """
    print("🎛️ Comparing Generation Settings")
    print(f"Prompt: {prompt}")
    print("=" * 60)

    for position, preset in enumerate(settings_list, start=1):
        # Header for this preset; each value is read right before it is shown
        print(f"\n{position}. {preset['name']}:")
        temperature = preset['temperature']
        print(f"   Temperature: {temperature}")
        top_k = preset['top_k']
        print(f"   Top-k: {top_k}")
        penalty = preset['repetition_penalty']
        print(f"   Repetition penalty: {penalty}")
        print("   Response: ", end="")

        stream = model.generate(
            prompt,
            max_new_tokens=60,
            temperature=temperature,
            top_k=top_k,
            repetition_penalty=penalty,
            use_kv_cache=True,
        )
        for piece in stream:
            print(piece, end="", flush=True)

        print("\n" + "-" * 40)

# Sampling presets to compare, listed as rows of
# (name, temperature, top_k, repetition_penalty)
preset_fields = ("name", "temperature", "top_k", "repetition_penalty")
preset_rows = (
    ("Conservative (Low Temperature)", 0.3, 20, 1.05),
    ("Balanced (Medium Temperature)", 0.7, 40, 1.1),
    ("Creative (High Temperature)", 1.0, 80, 1.15),
    ("Deterministic (Temperature = 0)", 0.0, 1, 1.0),
)

# One dict per preset, keyed the way compare_generation_settings reads them
generation_settings = [dict(zip(preset_fields, row)) for row in preset_rows]

# Open-ended prompt used for every preset
creative_prompt = "In a world where colors have sounds"

compare_generation_settings(model1, creative_prompt, generation_settings)


In [None]:
# Print a summary of the techniques this notebook demonstrates, followed by
# the keys currently held in FastLLM's model cache. Each section is emitted as
# one newline-joined print; a trailing "" yields the blank separator line.
print("🎓 Educational Deep Dive")
print("=" * 50)

print("🔍 Key Innovations in Our Implementation:")
print()

print("\n".join([
    "1. 🚀 FastLLM Class-Level Caching:",
    "   - Similar to Ollama's approach",
    "   - Models cached in memory after first load",
    "   - Subsequent loads are instant (0.000s)",
    "   - 17,000x+ speedup for repeated usage",
    "",
]))

print("\n".join([
    "2. ⚡ KV Cache Optimization:",
    "   - Caches key/value tensors during generation",
    "   - Avoids recomputing attention for previous tokens",
    "   - Significantly improves generation speed",
    "   - Memory-efficient implementation",
    "",
]))

# Head counts come from the `config` mapping set up earlier in the notebook
print("\n".join([
    "3. 🎯 Grouped Query Attention (GQA):",
    "   - Reduces memory usage compared to Multi-Head Attention",
    "   - Groups multiple query heads with fewer key/value heads",
    f"   - Our 1B model: {config['n_heads']} query heads, {config['n_kv_groups']} KV groups",
    f"   - Group size: {config['n_heads'] // config['n_kv_groups']} queries per KV group",
    "",
]))

print("\n".join([
    "4. 🌀 RoPE (Rotary Position Embedding):",
    "   - Encodes positional information through rotation",
    "   - Better extrapolation to longer sequences",
    "   - No learned positional embeddings needed",
    f"   - RoPE base frequency: {config['rope_base']:,}",
    "",
]))

print("\n".join([
    "5. 🔧 Technical Optimizations:",
    "   - Mixed precision (float16) for memory efficiency",
    "   - Apple MPS acceleration on Mac",
    "   - Repetition penalty with sliding window",
    "   - Proper tokenizer handling (no chat mode artifacts)",
    "   - Multi-file safetensors support for 3B model",
    "",
]))

# Last section: no blank line after it
print("\n".join([
    "6. 📚 Educational Design:",
    "   - Based on 'Build a Large Language Model From Scratch'",
    "   - Clear, readable code structure",
    "   - Comprehensive documentation",
    "   - Step-by-step explanations",
]))

# Show the caching mechanism
print("\n🗃️ Current Model Cache Status:")
# NOTE(review): the return value is never used below; the call is kept because
# list_cached_models() may have side effects not visible here - confirm.
cache_info = FastLLM.list_cached_models()
cached_models = getattr(FastLLM, '_cached_models', None)
if cached_models:
    print(f"   Cached models: {len(cached_models)}")
    for key in cached_models.keys():
        print(f"   - {key}")
else:
    print("   No models currently cached")


In [None]:
def test_italian_text_fix():
    """Check a model1 completion for the Italian-text artifact markers.

    Streams up to 30 new tokens for a fixed arithmetic prompt from the
    module-level ``model1``, echoing them as they arrive, then scans the joined
    response for a fixed list of marker substrings.

    Returns:
        ``True`` when none of the markers occur in the response, else ``False``.
    """
    print("🔧 Testing Italian Text Generation Fix")
    print("-" * 40)

    # Per the original note, this prompt used to trigger Italian text with << >> formatting
    problematic_prompt = "What is 53 + 27?"
    print(f"Prompt: {problematic_prompt}")
    print("Response: ", end="")

    pieces = []
    stream = model1.generate(
        problematic_prompt,
        max_new_tokens=30,
        temperature=0.7,
        top_k=40,
        repetition_penalty=1.1,
        use_kv_cache=True,
    )
    for piece in stream:
        print(piece, end="", flush=True)
        pieces.append(piece)
    response = "".join(pieces)

    # Substrings treated as evidence of the artifact
    artifact_markers = ("<<", ">>", "Ecco", "risultati", "operazione", "matematica")
    is_clean = not any(marker in response for marker in artifact_markers)

    if is_clean:
        print("\n✅ FIXED: No Italian text artifacts detected")
    else:
        print("\n❌ ISSUE: Response contains Italian text artifacts")
    return is_clean

def test_3b_model_loading():
    """Try to construct the 3B FastLLM and report whether that succeeded.

    Returns:
        ``True`` if construction and the success message completed without an
        exception, ``False`` if an ``Exception`` was caught (it is printed).
    """
    print("\n🔧 Testing 3B Model Multi-File Loading Fix")
    print("-" * 40)

    loaded_ok = False
    try:
        # The instance itself is not used; only whether construction raises matters
        _loaded_model = FastLLM("meta-llama/Llama-3.2-3B-Instruct", context_length=1024, use_float16=True)
        print("✅ FIXED: 3B model loads successfully with multi-file safetensors")
        loaded_ok = True
    except Exception as err:
        print(f"❌ ISSUE: 3B model loading failed: {err}")
    return loaded_ok

def test_performance_targets(target_speed=15.0):
    """Check that model1 generates at or above a target throughput.

    Runs one 50-token generation on the module-level ``model1`` with the KV
    cache enabled, counts the yielded pieces without printing them, and
    compares the pieces-per-second figure with ``target_speed``.

    Args:
        target_speed: Minimum acceptable tokens per second. Defaults to the
            15.0 that was previously hard-coded.

    Returns:
        ``True`` when the measured speed is at least ``target_speed``,
        otherwise ``False``.
    """
    print("\n🔧 Testing Performance Targets")
    print("-" * 40)

    # Test generation speed.
    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), so the pass/fail decision is not skewed by
    # clock adjustments or coarse ticks.
    start_time = time.perf_counter()
    token_count = 0

    for token in model1.generate(
        "The future of technology is",
        max_new_tokens=50,
        temperature=0.7,
        use_kv_cache=True,
    ):
        token_count += 1

    generation_time = time.perf_counter() - start_time
    tokens_per_second = token_count / generation_time if generation_time > 0 else 0

    if tokens_per_second >= target_speed:
        print(f"✅ PERFORMANCE: {tokens_per_second:.2f} tokens/s (target: {target_speed})")
        return True
    else:
        print(f"❌ PERFORMANCE: {tokens_per_second:.2f} tokens/s (below target: {target_speed})")
        return False

# Run the three verification checks in order and print a pass/fail summary.
print("🛠️ Troubleshooting & Fixes Verification")
print("=" * 50)

# Run all tests
test1_passed = test_italian_text_fix()
test2_passed = test_3b_model_loading()
test3_passed = test_performance_targets()

print("\n📊 Fix Verification Summary:")
for check_label, check_passed in (
    ("Italian Text Fix", test1_passed),
    ("3B Model Loading", test2_passed),
    ("Performance Target", test3_passed),
):
    verdict = '✅ PASSED' if check_passed else '❌ FAILED'
    print(f"   {check_label}: {verdict}")

# Overall status is green only when every individual check passed
all_passed = test1_passed and test2_passed and test3_passed
overall_status = '✅ ALL FIXES WORKING' if all_passed else '❌ SOME ISSUES REMAIN'
print(f"\n🎯 Overall Status: {overall_status}")
