# Test All HuggingFace App Models

This notebook tests the unified text classification function with every model available in the CatLLM HuggingFace Space.

Tests both:
- **Free models** (Space pays for API costs)
- **Paid models** (user provides own API key)

In [None]:
import sys
import os

# Put the local checkout's src directory at the FRONT of sys.path so that
# `import catllm` resolves to the working copy rather than an installed package.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
src_path = '/Users/chrissoria/Documents/Research/cat-llm/src'
sys.path.insert(0, src_path)

# Drop every already-imported module whose name starts with 'catllm' so the
# imports below are re-resolved against the updated sys.path. The names are
# collected into a list first so sys.modules is not mutated while iterating it.
modules_to_remove = [key for key in sys.modules.keys() if key.startswith('catllm')]
for mod in modules_to_remove:
    del sys.modules[mod]

# These imports must stay below the sys.path / sys.modules changes above.
import pandas as pd
import time
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
from catllm.text_functions import multi_class, detect_provider
import catllm

# Print the version and file location so it is visible which copy was loaded.
print(f"Testing local catllm version: {catllm.__version__}")
print(f"Loaded from: {catllm.__file__}")

In [None]:
# Read credentials from a .env file. The working directory is switched to the
# experiments project before find_dotenv() runs (presumably so the .env there
# is the one discovered -- confirm), then switched to the cat-llm checkout.
os.chdir('/Users/chrissoria/Documents/Research/Categorization_AI_experiments')
_ = load_dotenv(find_dotenv())
os.chdir('/Users/chrissoria/Documents/Research/cat-llm')

# One module-level variable per provider; a missing variable yields None
openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")
mistral_api_key = os.environ.get("MISTRAL_API_KEY")
xai_api_key = os.environ.get("XAI_API_KEY")
huggingface_api_key = os.environ.get("HUGGINGFACE_API_KEY")
perplexity_api_key = os.environ.get("PERPLEXITY_API_KEY")

# Report presence/absence only -- the key values themselves are never printed
print("API keys loaded:")
for _label, _key in (
    ("OpenAI", openai_api_key),
    ("Anthropic", anthropic_api_key),
    ("Google", google_api_key),
    ("Mistral", mistral_api_key),
    ("xAI", xai_api_key),
    ("HuggingFace", huggingface_api_key),
    ("Perplexity", perplexity_api_key),
):
    print(f"  {_label}: {'✅' if _key else '❌'}")

In [None]:
# Test artifacts go to <current working directory>/examples/test_output
_output_parts = ('examples', 'test_output')
output_dir = os.path.join(os.getcwd(), *_output_parts)

# Create the folder (and any missing parents); an existing folder is fine
os.makedirs(output_dir, exist_ok=True)
print("Output directory: " + output_dir)

## Model Configuration

The models available in the CatLLM HuggingFace Space, grouped into free models (the Space pays) and paid models (the user provides a key).

In [None]:
# Models routed through HuggingFace
HF_ROUTED_MODELS = [
    "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
    "deepseek-ai/DeepSeek-V3.1:novita",
    "meta-llama/Llama-3.3-70B-Instruct:groq",
]

# Free models (Space pays): the HuggingFace-routed models come first,
# followed by the models addressed by their plain provider names
FREE_MODEL_CHOICES = [
    *HF_ROUTED_MODELS,
    "gemini-2.5-flash",
    "gpt-4o",
    "mistral-medium-2505",
    "claude-3-haiku-20240307",
    "grok-4-fast-non-reasoning",
]

# Paid models (user provides key)
PAID_MODEL_CHOICES = [
    "gpt-4.1",
    "gpt-4o",
    "gpt-4o-mini",
    "claude-sonnet-4-5-20250929",
    "claude-opus-4-20250514",
    "claude-3-5-haiku-20241022",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "mistral-large-latest",
]

# Some models appear in both lists, so the total counts distinct names only
_unique_total = len(set(FREE_MODEL_CHOICES) | set(PAID_MODEL_CHOICES))

print(f"Free models: {len(FREE_MODEL_CHOICES)}")
print(f"Paid models: {len(PAID_MODEL_CHOICES)}")
print(f"Total: {_unique_total} unique models")

## Test Data

In [None]:
# Sample free-text survey answers used as classification input
TEST_RESPONSES = [
    "I moved because I got a new job in another city",
    "My family needed to be closer to my elderly parents",
    "The rent was too expensive so I had to find somewhere cheaper",
]

# Candidate categories offered to the model for each answer
TEST_CATEGORIES = [
    "Employment/Job-related",
    "Family reasons",
    "Financial/Cost of living",
    "Education",
    "Health reasons",
]

# Quick sanity print of the fixture sizes
for _label, _items in (("responses", TEST_RESPONSES), ("categories", TEST_CATEGORIES)):
    print(f"Test {_label}: {len(_items)}")

## Helper Functions

In [None]:
def get_provider_for_model(model: str) -> str:
    """Determine the provider for a given model.

    Resolution order:

    1. Exact membership in ``HF_ROUTED_MODELS`` -> ``"huggingface"``.
    2. An explicit HuggingFace routing suffix (``:novita`` / ``:groq``)
       anywhere in the lower-cased name -> ``"huggingface"``.
    3. Vendor keywords (``gpt``/``o1``, ``claude``, ``gemini``, ``mistral``,
       ``grok``), tested in that order.
    4. Open-model family keywords (``qwen``, ``llama``, ``deepseek``)
       -> ``"huggingface"``.

    Args:
        model: Model identifier as listed in the model-choice lists.

    Returns:
        One of ``"huggingface"``, ``"openai"``, ``"anthropic"``, ``"google"``,
        ``"mistral"``, ``"xai"``, or ``"unknown"`` when nothing matches.
    """
    if model in HF_ROUTED_MODELS:
        return "huggingface"

    model_lower = model.lower()

    # The routing suffix must win over vendor keywords: an ID such as
    # "mistralai/...:novita" or "openai/gpt-...:groq" contains a vendor
    # keyword but is served through HuggingFace, so it needs that provider's
    # key rather than the vendor's.
    if any(suffix in model_lower for suffix in (":novita", ":groq")):
        return "huggingface"

    # (keyword, provider) pairs, checked in order; the first hit wins.
    vendor_keywords = (
        ("gpt", "openai"),
        ("o1", "openai"),
        ("claude", "anthropic"),
        ("gemini", "google"),
        ("mistral", "mistral"),
        ("grok", "xai"),
    )
    for keyword, provider in vendor_keywords:
        if keyword in model_lower:
            return provider

    # Remaining open-model families fall back to HuggingFace.
    if any(family in model_lower for family in ("qwen", "llama", "deepseek")):
        return "huggingface"

    return "unknown"


def get_api_key_for_provider(provider: str) -> str:
    """Look up the API key that belongs to ``provider``.

    Args:
        provider: Provider name such as ``"openai"`` or ``"huggingface"``.

    Returns:
        The value of the matching module-level ``*_api_key`` variable
        (whatever it currently holds), or ``""`` when the provider name is
        not one of the known entries.
    """
    # Built on every call so the current values of the module-level
    # key variables are used.
    keys_by_provider = dict(
        openai=openai_api_key,
        anthropic=anthropic_api_key,
        google=google_api_key,
        mistral=mistral_api_key,
        xai=xai_api_key,
        huggingface=huggingface_api_key,
        perplexity=perplexity_api_key,
    )
    if provider in keys_by_provider:
        return keys_by_provider[provider]
    return ""


def test_single_model(model: str, provider: str, api_key: str) -> dict:
    """Run one classification call against ``model`` and summarise the outcome.

    Args:
        model: Model identifier passed through to ``multi_class``.
        provider: Provider name passed through to ``multi_class``.
        api_key: Credential for the provider; a falsy value skips the call.

    Returns:
        A dict with the keys ``model``, ``provider``, ``success``, ``time``
        (seconds spent in the call, ``0`` if it never completed), ``error``
        (message or ``None``) and ``results`` (first output row as a dict, or
        ``None``). Exceptions raised by the call are caught and reported
        through ``error`` rather than propagated.
    """
    result = dict(
        model=model,
        provider=provider,
        success=False,
        time=0,
        error=None,
        results=None,
    )

    # Nothing can be called without credentials
    if not api_key:
        result["error"] = f"No API key for '{provider}'"
        return result

    try:
        started = time.time()

        df = multi_class(
            survey_input=TEST_RESPONSES[:1],  # Just 1 response for speed
            categories=TEST_CATEGORIES,
            api_key=api_key,
            model=model,
            provider=provider,
            survey_question="Why did you move to your current residence?",
            creativity=0.1,
            chain_of_thought=True,
            use_json_schema=True,
        )

        result["time"] = time.time() - started

        if df is None or len(df) == 0:
            result["error"] = "Empty result DataFrame"
        else:
            # Only the first row matters: a single response was submitted
            status = df['processing_status'].iloc[0]
            if status == 'success':
                result["success"] = True
            else:
                result["error"] = f"Status: {status}"
            # The first row is kept for both successful and failed statuses
            result["results"] = df.to_dict('records')[0]

    except Exception as e:
        result["error"] = str(e)

    return result

# Visible confirmation that this cell (the helper definitions) has been executed
print("Helper functions loaded ✓")

## Test Free Models

Testing all free models (Space pays for API costs)

In [None]:
# De-duplicate the model list while keeping first-seen order
free_models_unique = list(dict.fromkeys(FREE_MODEL_CHOICES))
_free_total = len(free_models_unique)

print(f"Testing {_free_total} free models...\n")

free_results = []

for position, model in enumerate(free_models_unique, start=1):
    provider = get_provider_for_model(model)
    api_key = get_api_key_for_provider(provider)

    print(f"[{position}/{_free_total}] Testing {model} ({provider})...")

    result = test_single_model(model, provider, api_key)
    free_results.append(result)

    if result["success"]:
        print(f"  ✅ PASS ({result['time']:.2f}s)")
    else:
        print(f"  ❌ FAIL: {result['error']}")

    # Pause between calls (rate limiting); no pause after the last model
    if position < _free_total:
        time.sleep(1)

# Summary
passed = len([r for r in free_results if r["success"]])
_banner = '=' * 70
print(f"\n{_banner}")
print(f"FREE MODELS SUMMARY: {passed}/{len(free_results)} passed")
print(_banner)
for r in free_results:
    # Passing models show their elapsed time, failing ones their error text
    if r["success"]:
        print(f"  ✅ {r['model']}: ({r['time']:.2f}s)")
    else:
        print(f"  ❌ {r['model']}: {r['error']}")

## Test Paid Models

Testing all paid models (user provides API key)

In [None]:
# De-duplicate the model list while keeping first-seen order
paid_models_unique = list(dict.fromkeys(PAID_MODEL_CHOICES))
_paid_total = len(paid_models_unique)

print(f"Testing {_paid_total} paid models...\n")

paid_results = []

for position, model in enumerate(paid_models_unique, start=1):
    provider = get_provider_for_model(model)
    api_key = get_api_key_for_provider(provider)

    print(f"[{position}/{_paid_total}] Testing {model} ({provider})...")

    result = test_single_model(model, provider, api_key)
    paid_results.append(result)

    if result["success"]:
        print(f"  ✅ PASS ({result['time']:.2f}s)")
    else:
        print(f"  ❌ FAIL: {result['error']}")

    # Pause between calls (rate limiting); no pause after the last model
    if position < _paid_total:
        time.sleep(1)

# Summary
passed = len([r for r in paid_results if r["success"]])
_banner = '=' * 70
print(f"\n{_banner}")
print(f"PAID MODELS SUMMARY: {passed}/{len(paid_results)} passed")
print(_banner)
for r in paid_results:
    # Passing models show their elapsed time, failing ones their error text
    if r["success"]:
        print(f"  ✅ {r['model']}: ({r['time']:.2f}s)")
    else:
        print(f"  ❌ {r['model']}: {r['error']}")

## Save Results

In [None]:
import json

# Free-model results first, then paid-model results
all_results = free_results + paid_results

# Flatten each result into one row for the tabular summary
rows = []
for r in all_results:
    row = dict(
        model=r["model"],
        provider=r["provider"],
        success=r["success"],
        time_seconds=r["time"],
        error=r["error"],
    )
    if r["results"]:
        # One category_<n> column per test category; None when the key is absent
        row.update(
            (f"category_{n}", r["results"].get(f"category_{n}", None))
            for n in range(1, len(TEST_CATEGORIES) + 1)
        )
    rows.append(row)

results_df = pd.DataFrame(rows)

# Both output files share the same timestamped base name
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
_base_name = f"model_test_results_{timestamp}"

# Save CSV
csv_path = os.path.join(output_dir, _base_name + ".csv")
results_df.to_csv(csv_path, index=False)
print(f"Results saved to: {csv_path}")

# Save detailed JSON (default=str stringifies anything json cannot encode natively)
json_path = os.path.join(output_dir, _base_name + ".json")
with open(json_path, 'w') as f:
    payload = {
        "timestamp": datetime.now().isoformat(),
        "catllm_version": catllm.__version__,
        "test_responses": TEST_RESPONSES,
        "test_categories": TEST_CATEGORIES,
        "results": all_results,
    }
    json.dump(payload, f, indent=2, default=str)
print(f"Detailed results: {json_path}")

# Display summary
_banner = '=' * 70
total_passed = len([r for r in all_results if r["success"]])
total_failed = len(all_results) - total_passed
print(f"\n{_banner}")
print("FINAL SUMMARY")
print(_banner)
print(f"Total: {len(all_results)}")
print(f"Passed: {total_passed} ✅")
print(f"Failed: {total_failed} ❌")

# Last expression of the cell: the notebook renders the first rows of the table
results_df.head()