# 🔍 Environment Check

**Run the cell below to verify you're using the venv (not global Python)**

In [None]:
import sys
import os

print("=" * 70)
print("🐍 PYTHON ENVIRONMENT CHECK")
print("=" * 70)

# Python version
print(f"Python Version: {sys.version}")
print()

# Python executable location
executable = sys.executable
print(f"Python Executable: {executable}")

# Detect a virtual environment from the interpreter state instead of the path
# text: inside a venv sys.prefix differs from sys.base_prefix, and legacy
# virtualenv releases set sys.real_prefix. A substring test on the path misses
# environments that are not literally named "venv" and can false-match a
# global interpreter whose path happens to contain that word.
in_venv = (
    sys.prefix != getattr(sys, "base_prefix", sys.prefix)
    or hasattr(sys, "real_prefix")
)

# Two levels above the interpreter binary (e.g. <root>/bin/python -> <root>)
env_root = os.path.dirname(os.path.dirname(executable))

if in_venv:
    print("✅ Status: USING VENV (Local environment)")
    print(f"   Located in: {sys.prefix}")
elif any(hint in executable for hint in ('homebrew', '/usr/bin', '/usr/local')):
    print("❌ Status: USING GLOBAL PYTHON (System-wide)")
    print("   ⚠️  You may want to use a venv for isolation")
else:
    # Neither a venv nor a recognised system location
    print("✅ Status: Custom environment")
    print(f"   Located in: {env_root}")

print()

# Report the first directory returned by site.getsitepackages()
import site

site_packages = site.getsitepackages()[0]
print(f"Packages Location: {site_packages}")

# Ordered (substring, message) pairs: the first substring found in the path
# decides the message; the for/else fallback fires when none of them match.
package_verdicts = (
    ('venv', "✅ Packages: Installed in venv (isolated)"),
    ('site-packages', "✅ Packages: Standard Python environment"),
)
for path_hint, verdict in package_verdicts:
    if path_hint in site_packages:
        print(verdict)
        break
else:
    print("⚠️ Packages: Unknown location")

print()

# Check key packages
import importlib


def package_status(name, missing_line=None):
    """Import *name* and describe the outcome as a single report line.

    Args:
        name: Importable module name, e.g. ``"numpy"``.
        missing_line: Line to report when the import fails. Defaults to the
            standard "NOT FOUND" line for *name*.

    Returns:
        A ``(module, line)`` tuple. ``module`` is the imported module, or
        ``None`` when importing raised ``ImportError``.
    """
    # Keep the try body to the import alone so that an error raised while
    # formatting the report is never misreported as a missing package.
    try:
        module = importlib.import_module(name)
    except ImportError:
        return None, missing_line or f"   ❌ {name} NOT FOUND"
    # Fall back to a placeholder when a module does not define __version__.
    version = getattr(module, "__version__", "unknown")
    return module, f"   ✅ {name} {version}"


print("📦 Key Packages:")
for package_name in ("matplotlib", "pandas", "numpy"):
    print(package_status(package_name)[1])

# torch gets a softer message when missing, plus a GPU line when CUDA is usable
torch_module, torch_line = package_status(
    "torch", "   ⚠️  torch NOT FOUND (OK for local, needed for Vast.ai)"
)
print(torch_line)
if torch_module is not None and torch_module.cuda.is_available():
    print(f"      GPU: {torch_module.cuda.get_device_name(0)}")

print("=" * 70)

# Phase 1B Benchmark Diagnostic Analysis

**Purpose:** Diagnose why MATH has 70% ties and CODE is at 58%

**Sections:**
1. Benchmark results visualization
2. Model loading and testing
3. Automated consistency tests
4. JSON analysis with error handling

In [None]:
# Setup paths and imports
from pathlib import Path
import json
import matplotlib.pyplot as plt
import numpy as np

# Choose the project root: the fixed /workspace checkout when that directory
# exists, otherwise the current working directory.
on_vast = Path('/workspace').exists()
base_path = Path('/workspace/data/Cogumi-LLM') if on_vast else Path.cwd()
print('Running on Vast.ai' if on_vast else 'Running locally')

In [None]:
# Derive the working locations from base_path and report whether each exists
checkpoint_dir = base_path.joinpath('checkpoints', 'final')
dataset_file = base_path.joinpath('phase1', 'public_500k_filtered.jsonl')
benchmark_dir = base_path.joinpath('benchmark_results')

for path_label, location in (
    ("Checkpoint", checkpoint_dir),
    ("Dataset", dataset_file),
    ("Benchmark", benchmark_dir),
):
    print(f"{path_label}: {location} (exists: {location.exists()})")

## Section 1: Benchmark Results

Manual benchmark scores:
- MATH: 41% (70% ties - HIGH!)
- CODE: 58% (28% ties)
- REASONING: 86%
- KNOWLEDGE: 90%
- INSTRUCTION: 76%
- CREATIVITY: 60%

In [None]:
# Plot score and tie rate per category as two horizontal bar charts
# Each entry maps category -> (score %, tie rate %)
results = {
    'MATH': (41, 70),
    'CODE': (58, 28),
    'REASONING': (86, 10),
    'KNOWLEDGE': (90, 8),
    'INSTRUCTION': (76, 15),
    'CREATIVITY': (60, 25),
}

categories = list(results.keys())
scores = [pair[0] for pair in results.values()]
ties = [pair[1] for pair in results.values()]


def score_color(score):
    """Return the bar colour for a score: below 65 red, below 80 orange, else green."""
    if score < 65:
        return 'red'
    if score < 80:
        return 'orange'
    return 'green'


def tie_color(tie_rate):
    """Return the bar colour for a tie rate: above 50 red, above 25 orange, else green."""
    if tie_rate > 50:
        return 'red'
    if tie_rate > 25:
        return 'orange'
    return 'green'


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, values, colour rule, x label, title, reference x, reference label)
panels = (
    (ax1, scores, score_color, 'Score (%)', 'Benchmark Scores', 70, 'Target: 70%'),
    (ax2, ties, tie_color, 'Tie Rate (%)', 'Tie Rates', 20, 'Expected: <20%'),
)
for axis, values, pick_color, x_label, title, ref_x, ref_label in panels:
    axis.barh(categories, values, color=[pick_color(v) for v in values])
    axis.set_xlabel(x_label)
    axis.set_title(title)
    # Dashed vertical line marking the reference value for this panel
    axis.axvline(ref_x, color='gray', linestyle='--', label=ref_label)
    axis.legend()

plt.tight_layout()
plt.show()

## Section 2: Load Model (GPU Required)

⚠️ Requires 5–6 GB of VRAM. Skip this section on a local Mac; run it on Vast.ai instead.

In [None]:
# Import model libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory
print(f"PyTorch: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is divided by 1024**3 to express it in GB
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_memory_gb:.1f} GB")

In [None]:
# Name the base checkpoint and locate the adapter directory
base_model_name = "unsloth/meta-llama-3.1-8b-instruct-bnb-4bit"

# Prefer the fixed /workspace location when it exists; otherwise fall back to
# the checkpoint directory configured earlier in the notebook.
workspace_adapter = Path('/workspace/data/Cogumi-LLM/checkpoints/final')
adapter_path = (
    "/workspace/data/Cogumi-LLM/checkpoints/final"
    if workspace_adapter.exists()
    else str(checkpoint_dir)
)

for summary_line in (
    f"Base model: {base_model_name}",
    f"Adapter: {adapter_path}",
    f"Adapter exists: {Path(adapter_path).exists()}",
):
    print(summary_line)

In [None]:
# Fetch the tokenizer published with the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    # No padding token configured: reuse the end-of-sequence token for it
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Tokenizer loaded")

In [None]:
# Load the base weights
# `device` is the label used in the message below and by later cells.
# NOTE(review): placement is delegated to device_map="auto", so the printed
# device is presumably where the model ends up — confirm.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint
# repository run during loading — confirm this checkpoint actually needs it.
load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "trust_remote_code": True,
}
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **load_options)

print(f"✅ Base model loaded on {device}")

In [None]:
# Wrap the base model with the adapter found at adapter_path, then call
# merge_and_unload() so `model` is what that call returns
# (presumably a plain model with the adapter folded in — see PEFT docs).
model = PeftModel.from_pretrained(base_model, adapter_path).merge_and_unload()

# Switch to evaluation mode before generating
model.eval()
print("✅ Model ready")

## Section 3: Consistency Tests

Runs each of the 7 test prompts 10 times (70 generations in total, roughly 5–10 minutes).

In [None]:
# Run automated consistency tests
from collections import Counter
import time

# Prompts used by the consistency check, grouped by benchmark category
test_problems = {
    'MATH': [
        "What is 15% of 80?",
        "Calculate: (25 + 35) × 2",
        "Average of 10, 20, 30?",
    ],
    'CODE': [
        "Python function to sum two numbers",
        "Check if string is palindrome",
    ],
    'CREATIVITY': [
        "Write a haiku about coding",
        "Describe a futuristic city in one sentence",
    ],
}

def generate(prompt, temp=0.7):
    """Generate one completion for *prompt* with the notebook's model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text handed to the tokenizer unchanged.
        temp: Sampling temperature. A value of 0 or below selects greedy
            decoding instead of sampling.

    Returns:
        The first returned sequence decoded to text with special tokens
        removed.
        NOTE(review): for decoder-only models ``generate`` normally returns
        the prompt tokens followed by the new ones, so the text presumably
        starts with the prompt — confirm before comparing answers only.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Sampling with a non-positive temperature is rejected by transformers,
    # so treat it as a request for deterministic (greedy) output.
    if temp is not None and temp <= 0:
        decoding = {"do_sample": False}
    else:
        decoding = {"do_sample": True, "temperature": temp}

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=150, **decoding)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Generate each prompt n_runs times and measure how often the most common
# output repeats. Outputs are compared as exact strings.
n_runs = 10  # generations per prompt; every count and percentage below derives from it
total_problems = sum(len(problems) for problems in test_problems.values())

print("🔄 Running consistency tests...")
print(f"Total: {total_problems} problems × {n_runs} runs = {total_problems * n_runs} generations")
print()

# NOTE: this rebinds `results`, the same name the benchmark plot cell assigns.
results = {}
problem_count = 0

for category, problems in test_problems.items():
    results[category] = []
    for prompt in problems:
        problem_count += 1
        print(f"[{problem_count}/{total_problems}] {category} - {prompt[:50]}...")

        # Generate the responses, overwriting a single progress line
        responses = []
        start_time = time.time()
        for i in range(n_runs):
            # flush so the carriage-return progress line shows up immediately
            print(f"  Run {i+1}/{n_runs}...", end='\r', flush=True)
            responses.append(generate(prompt))

        # Share of runs that produced the most frequent response, as a whole percent
        top_count = Counter(responses).most_common(1)[0][1]
        consistency = 100 * top_count // n_runs
        elapsed = time.time() - start_time

        results[category].append({'prompt': prompt, 'consistency': consistency})
        print(f"  ✅ {consistency}% consistent ({elapsed:.1f}s for {n_runs} runs)    ")

print()
print("=" * 70)
print("📊 AVERAGE CONSISTENCY BY CATEGORY:")
print("=" * 70)
for cat, res in results.items():
    if not res:
        # A category with no prompts has no average to report
        continue
    avg = sum(r['consistency'] for r in res) / len(res)
    print(f"  {cat:12} : {avg:.1f}%")
print("=" * 70)

## Section 4: Analyze Benchmark JSON

Load benchmark files with comprehensive error handling.

In [None]:
# Analyze benchmark JSON with error handling
def analyze_json(path):
    """Load a benchmark report and print win/loss/tie rates per category.

    Two layouts are recognised:

    * ``{"by_category": {name: {"test_results": [...]}, ...}}``
    * ``{"category": name, "results": [...]}``

    Each test entry is expected to be a dict whose ``"outcome"`` is
    ``"win"``, ``"loss"`` or ``"tie"`` (case-insensitive). Entries of any
    other shape, and outcomes with any other value, are ignored rather than
    raising. Categories with no countable entries are skipped.

    Args:
        path: Location of the JSON report file.

    Returns:
        The parsed JSON document, or ``None`` when the file cannot be read,
        is not valid JSON, or does not match a recognised layout.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ JSON error: Line {e.lineno}, Col {e.colno}")
        print("💡 Check for unescaped backslashes or quotes")
        return None
    except (OSError, ValueError) as e:
        # Missing/unreadable file, or bytes that are not valid UTF-8
        print(f"❌ Error: {type(e).__name__}: {e}")
        return None

    # Detect format; a top-level value that is not an object cannot match
    by_cat = None
    if isinstance(data, dict):
        if 'by_category' in data:
            by_cat = data['by_category']
        elif 'category' in data and 'results' in data:
            # str() keeps an unexpected (non-string) category usable as a key
            by_cat = {str(data['category']): {'test_results': data['results']}}
    if not isinstance(by_cat, dict):
        print("⚠️ Unknown format")
        return None

    # Analyze each category
    for cat, res in by_cat.items():
        tests = res.get('test_results') if isinstance(res, dict) else None
        if not isinstance(tests, list) or not tests:
            continue

        tally = {'win': 0, 'loss': 0, 'tie': 0}
        for t in tests:
            outcome = t.get('outcome') if isinstance(t, dict) else None
            if not isinstance(outcome, str):
                continue  # null or non-text outcome: nothing to count
            outcome = outcome.lower()
            if outcome in tally:
                tally[outcome] += 1

        total = sum(tally.values())
        if total == 0:
            continue

        wins, losses, ties = tally['win'], tally['loss'], tally['tie']
        print(f"\n{str(cat).upper()}:")
        print(f"  Wins: {wins} ({wins/total*100:.1f}%)")
        print(f"  Losses: {losses} ({losses/total*100:.1f}%)")
        print(f"  Ties: {ties} ({ties/total*100:.1f}%)")

    return data

# Analyze the report whose file name sorts last, if the directory has any
if not benchmark_dir.exists():
    print("Benchmark directory not found")
else:
    reports = sorted(benchmark_dir.glob("benchmark_report_*.json"))
    if not reports:
        print("No benchmark reports found")
    else:
        latest_report = reports[-1]
        print(f"Analyzing: {latest_report.name}")
        analyze_json(latest_report)