In [None]:
# IMPORTANT: Install required dependencies first
# Run this cell first when huggingface_hub has not been installed yet.
# To install from inside the notebook, uncomment the next line:
# %pip install huggingface_hub python-dotenv requests

# Reminder banner: required packages plus the migration notice.
print(
    "🚨 IMPORTANT: Make sure you have installed the required packages:",
    "pip install huggingface_hub python-dotenv requests",
    "\n📝 Note: The old Hugging Face Inference API has been replaced!",
    "This notebook uses the NEW Inference Providers system.",
    sep="\n",
)

In [8]:
# Hugging Face API Debug Notebook
# Walks through debugging the connection to Hugging Face models one step at a time.

# Session banner: title line followed by a 50-character rule.
print("🚀 Starting Hugging Face API Debug Session", "=" * 50, sep="\n")

🚀 Starting Hugging Face API Debug Session


# Hugging Face INFERENCE PROVIDERS Debug Notebook
# ⚠️ IMPORTANT: This uses the NEW system that replaced the old api-inference.huggingface.co

```python
print("🚀 Starting Hugging Face INFERENCE PROVIDERS Debug Session")
print("="*60)
print("⚠️  MIGRATION NOTICE: Old serverless inference API is deprecated!")
print("🎆 Now using NEW Inference Providers system with better reliability")
print("="*60)
```

## New System Overview:
- Uses **Inference Providers** (Together AI, Sambanova, fal.ai, etc.)
- OpenAI-compatible chat completion API
- URL format: `https://router.huggingface.co/{provider}/v1/chat/completions`
- Better reliability and more models available

## Steps:
1. Install huggingface_hub library
2. Load environment variables
3. Test with InferenceClient (recommended)
4. Test direct HTTP calls
5. Find working models
6. Test chat functionality

In [9]:
# Step 1: Import required libraries
import requests
import os
import json
from dotenv import load_dotenv
from pprint import pprint

# huggingface_hub is optional. HF_HUB_AVAILABLE records whether the import
# worked so that later cells can check it before using InferenceClient.
try:
    from huggingface_hub import InferenceClient
except ImportError:
    print("⚠️  huggingface_hub not available - will use direct HTTP calls")
    print("💡 Install with: pip install huggingface_hub")
    HF_HUB_AVAILABLE = False
else:
    print("✅ huggingface_hub available - using modern Inference Providers")
    HF_HUB_AVAILABLE = True

print("✅ Libraries imported successfully")

  from .autonotebook import tqdm as notebook_tqdm


✅ huggingface_hub available - using modern Inference Providers
✅ Libraries imported successfully


In [16]:
# Step 2: Load environment variables and confirm the API token is present
load_dotenv()

# Read the token from the process environment. Only its presence and length
# are reported below, never the value itself.
API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")

print("Environment check:")
print("- API Token found:", "✅ Yes" if API_TOKEN else "❌ No")
print(
    f"- Token length: {len(API_TOKEN)} characters"
    if API_TOKEN
    else "⚠️  No API token found in environment variables"
)

Environment check:
- API Token found: ✅ Yes
- Token length: 37 characters


In [None]:
# Step 3: Test NEW Inference Providers system
# The old api-inference.huggingface.co is deprecated
# New system uses router.huggingface.co with different providers

print("🔄 Testing NEW Hugging Face Inference Providers system...")
print("📝 Note: This replaces the old api-inference.huggingface.co")
print("\n💡 Available providers: Together AI, Sambanova, fal.ai, Replicate, etc.")
print("🌐 Uses OpenAI-compatible chat completion format")

if API_TOKEN:
    # Report only the token's length: cell output is stored inside the
    # notebook file, so echoing any characters of the secret would leak
    # them to everyone the notebook is shared with.
    print(f"\n✅ API Token found ({len(API_TOKEN)} characters, value hidden)")
    print("🔗 Will test with multiple inference providers")
else:
    print("❌ No API token found - cannot proceed")

Testing API connection:
- Model: gpt2
- API URL: https://api-inference.huggingface.co/models/gpt2
- Headers prepared: ✅

💡 Note: Not all HF models are available through Inference API
   We're testing with GPT-2 which should be available


In [11]:
# Step 4: Create test functions for the NEW system

def test_inference_client(model_id, prompt="Hello, how are you?", verbose=True):
    """
    Test using the modern huggingface_hub InferenceClient (RECOMMENDED)
    """
    if not HF_HUB_AVAILABLE:
        return {"success": False, "error": "huggingface_hub not available"}
    
    try:
        client = InferenceClient(api_key=API_TOKEN)
        
        if verbose:
            print(f"\n🔄 Testing with InferenceClient: {model_id}")
            print(f"📝 Prompt: '{prompt}'")
        
        # Use chat completion (OpenAI-compatible)
        completion = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=100
        )
        
        response_text = completion.choices[0].message.content
        
        if verbose:
            print(f"✅ Success! Response: {response_text[:200]}")
        
        return {"success": True, "data": response_text}
        
    except Exception as e:
        error_msg = f"InferenceClient error: {e}"
        if verbose:
            print(f"❌ {error_msg}")
        return {"success": False, "error": error_msg}


def test_direct_http(provider, model_id, prompt="Hello, how are you?", verbose=True):
    """
    Test using direct HTTP calls to the new router system
    """
    api_url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": model_id,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 100,
        "stream": False
    }
    
    if verbose:
        print(f"\n🔄 Testing HTTP: {provider}/{model_id}")
        print(f"📝 Prompt: '{prompt}'")
        print(f"🌐 URL: {api_url}")
    
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
        
        if verbose:
            print(f"📊 Status Code: {response.status_code}")
        
        if response.status_code == 200:
            try:
                result = response.json()
                response_text = result["choices"][0]["message"]["content"]
                if verbose:
                    print(f"✅ Success! Response: {response_text[:200]}")
                return {"success": True, "data": response_text}
            except (json.JSONDecodeError, KeyError) as e:
                error_msg = f"Response parsing error: {e}"
                if verbose:
                    print(f"❌ {error_msg}")
                    print(f"Raw response: {response.text[:300]}")
                return {"success": False, "error": error_msg}
        else:
            error_msg = f"HTTP {response.status_code}: {response.text}"
            if verbose:
                print(f"❌ {error_msg}")
            return {"success": False, "error": error_msg}
            
    except requests.exceptions.RequestException as e:
        error_msg = f"Request exception: {e}"
        if verbose:
            print(f"❌ {error_msg}")
        return {"success": False, "error": error_msg}

# Confirmation that this cell ran to completion.
print("✅ New test functions created")

✅ New test functions created


In [12]:
# Step 5: Test the NEW InferenceClient (recommended approach)
# Tries each candidate model in order and stops at the first one that answers.
if API_TOKEN and HF_HUB_AVAILABLE:
    print("🧪 Testing with InferenceClient (recommended)...")

    # Try some popular models that should be available
    test_models = [
        "deepseek-ai/DeepSeek-V3-0324",  # Popular recent model
        "Qwen/Qwen2.5-72B-Instruct",     # Qwen model (the previous "QwenQQ-72B-Instruct" id looked like a typo)
        "meta-llama/Llama-3.3-70B-Instruct",  # Llama model
    ]

    for model in test_models:
        result = test_inference_client(model, "Hello! Can you help me?", verbose=True)

        if result["success"]:
            print(f"🎉 SUCCESS with {model}!")
            break  # one working model is enough for this check
        else:
            print(f"❌ Failed: {model}")
            print(f"   Error: {result['error'][:100]}")
        print("-" * 60)

else:
    # Explain every missing prerequisite, not just the first one
    if not API_TOKEN:
        print("❌ Cannot test without API token")
    if not HF_HUB_AVAILABLE:
        print("❌ Cannot test without huggingface_hub library")
        print("💡 Install with: pip install huggingface_hub")

🧪 Testing with InferenceClient (recommended)...

🔄 Testing with InferenceClient: deepseek-ai/DeepSeek-V3-0324
📝 Prompt: 'Hello! Can you help me?'
✅ Success! Response: Of course! I'd be happy to help. What do you need assistance with? 😊
🎉 SUCCESS with deepseek-ai/DeepSeek-V3-0324!
✅ Success! Response: Of course! I'd be happy to help. What do you need assistance with? 😊
🎉 SUCCESS with deepseek-ai/DeepSeek-V3-0324!


In [13]:
# Step 6: Test different providers with direct HTTP calls
# Populates `results` and `working_configs`, which later cells look up.
if API_TOKEN:
    print("🔍 Testing multiple providers with HTTP calls...\n")

    # (provider, model id) combinations to probe through the router
    test_configs = [
        ("sambanova", "Llama-3.3-70B-Instruct"),
        ("together", "meta-llama/Llama-3.3-70B-Instruct"),
        ("together", "deepseek-ai/DeepSeek-V3-0324"),
        ("fireworks", "deepseek-ai/DeepSeek-V3-0324"),
    ]

    results = {}          # "provider/model" -> result dict from test_direct_http
    working_configs = []  # (provider, model) pairs whose request succeeded

    for provider, model in test_configs:
        print(f"Testing {provider}/{model}...")
        result = test_direct_http(provider, model, "Hello!", verbose=False)
        config_key = f"{provider}/{model}"
        results[config_key] = result

        if result["success"]:
            print(f"✅ {config_key}: SUCCESS")
            working_configs.append((provider, model))
        else:
            error_text = str(result['error'])
            # Slicing already leaves short messages untouched
            print(f"❌ {config_key}: {error_text[:100]}")
            # test_direct_http reports HTTP failures as "HTTP <status>: <body>".
            # Match that prefix instead of searching the whole message, so
            # digits inside the response body (request ids, etc.) cannot
            # trigger the wrong hint.
            if error_text.startswith("HTTP 403"):
                print("   └─ 💡 May need specific provider API key or PRO subscription")
            elif error_text.startswith("HTTP 404"):
                print("   └─ 💡 Model not available through this provider")
        print("-" * 60)

    print("\n📊 Summary:")
    print(f"✅ Working configurations ({len(working_configs)}): {working_configs}")

    if not working_configs:
        print("\n🚨 No configurations are working! This could indicate:")
        print("   1. Need Hugging Face PRO subscription ($9/month with $2 credits)")
        print("   2. Need specific provider API keys")
        print("   3. Token permissions insufficient")
        print("   4. Models temporarily unavailable")
        print("\n💡 Try the InferenceClient approach instead (Step 5)")
else:
    print("❌ Cannot test without API token")

🔍 Testing multiple providers with HTTP calls...

Testing sambanova/Llama-3.3-70B-Instruct...
❌ sambanova/Llama-3.3-70B-Instruct: HTTP 404: {"error":"Model not found"}

   └─ 💡 Model not available through this provider
------------------------------------------------------------
Testing together/meta-llama/Llama-3.3-70B-Instruct...
❌ sambanova/Llama-3.3-70B-Instruct: HTTP 404: {"error":"Model not found"}

   └─ 💡 Model not available through this provider
------------------------------------------------------------
Testing together/meta-llama/Llama-3.3-70B-Instruct...
❌ together/meta-llama/Llama-3.3-70B-Instruct: HTTP 400: {
  "id": "nwYsDtt-4yUbBN-9499987118fdbacc",
  "error": {
    "message": "Unable to access
------------------------------------------------------------
Testing together/deepseek-ai/DeepSeek-V3-0324...
❌ together/meta-llama/Llama-3.3-70B-Instruct: HTTP 400: {
  "id": "nwYsDtt-4yUbBN-9499987118fdbacc",
  "error": {
    "message": "Unable to access
-----------------------

In [None]:
# Step 7: Analyze errors and understand the new billing system
# Prints static reference notes, then any failures recorded in `results`.
print("🔍 Detailed Analysis of New Inference Providers System:\n")

# Billing overview
print(
    "💰 Billing Information:",
    "- Hugging Face PRO ($9/month): Includes $2 worth of inference credits",
    "- Free tier: Very limited quota for signed-in users",
    "- Direct provider keys: Billed directly by provider (Together AI, etc.)",
    "- No markup on provider rates when using HF routing",
    sep="\n",
)

# Ways to authenticate
print(
    "\n🔑 Authentication Options:",
    "1. HF Token + PRO subscription: Routed through HF with credits",
    "2. HF Token + Provider API key: Direct billing to provider",
    "3. Free tier: Very limited usage",
    sep="\n",
)

# Typical failure modes
print(
    "\n🚨 Common Issues:",
    "- 403 Forbidden: Insufficient permissions or quota exceeded",
    "- 404 Not Found: Model not available through that provider",
    "- Rate limiting: Hit usage limits",
    sep="\n",
)

# `results` only exists when the provider test cell has run in this session
if 'results' not in locals():
    print("\n💡 Run the previous cells to see specific error details")
else:
    print("\n📄 Error Details from Tests:")
    for config, result in results.items():
        if result["success"]:
            continue
        print(f"\n{config}:", f"  Error: {result['error'][:200]}", sep="\n")

In [14]:
# Step 8: Test conversation with working configuration
# Every prompt is sent as its own single-message request.
if 'working_configs' in locals() and working_configs:
    # Preferred path: reuse the first provider/model pair recorded as working
    provider, model = working_configs[0]
    print(f"🗣️ Testing conversation with: {provider}/{model}\n")

    conversation_prompts = [
        "Hello, how are you?",
        "What's the capital of France?",
        "Can you help me write a simple Python function?",
        "What's the weather like?",
    ]

    for prompt in conversation_prompts:
        print(f"👤 Human: {prompt}")
        result = test_direct_http(provider, model, prompt, verbose=False)
        bot_reply = (
            result["data"]
            if result["success"]
            else f"Error - {result['error'][:100]}"
        )
        print(f"🤖 Bot: {bot_reply}")
        print("-" * 50)

elif HF_HUB_AVAILABLE and API_TOKEN:
    # Fallback path: go through InferenceClient instead of raw HTTP
    print("🗣️ Testing conversation with InferenceClient...\n")

    # Probe the candidates in order and keep the first one that answers
    test_models = ["deepseek-ai/DeepSeek-V3-0324", "meta-llama/Llama-3.3-70B-Instruct"]
    working_model = next(
        (
            candidate
            for candidate in test_models
            if test_inference_client(candidate, "Hello!", verbose=False)["success"]
        ),
        None,
    )

    if not working_model:
        print("❌ No working models found for conversation test")
    else:
        conversation_prompts = [
            "Hello, how are you?",
            "What's the capital of France?",
            "Can you help me write a simple Python function?",
        ]

        for prompt in conversation_prompts:
            print(f"👤 Human: {prompt}")
            result = test_inference_client(working_model, prompt, verbose=False)
            bot_reply = (
                result["data"]
                if result["success"]
                else f"Error - {result['error'][:100]}"
            )
            print(f"🤖 Bot: {bot_reply}")
            print("-" * 50)
else:
    # Neither path is usable; point at whatever is missing
    print("❌ Cannot test conversation - no working configurations or missing requirements")
    if not HF_HUB_AVAILABLE:
        print("💡 Install huggingface_hub: pip install huggingface_hub")
    if not API_TOKEN:
        print("💡 Set HUGGINGFACE_API_TOKEN in .env file")

🗣️ Testing conversation with InferenceClient...

👤 Human: Hello, how are you?
👤 Human: Hello, how are you?
🤖 Bot: Hello! I'm just a virtual assistant, so I don't have feelings, but I'm here and ready to help you with anything you need. 😊 How about you? How are you doing today?
--------------------------------------------------
👤 Human: What's the capital of France?
🤖 Bot: Hello! I'm just a virtual assistant, so I don't have feelings, but I'm here and ready to help you with anything you need. 😊 How about you? How are you doing today?
--------------------------------------------------
👤 Human: What's the capital of France?
🤖 Bot: The capital of France is **Paris**. It is known for its iconic landmarks such as the Eiffel Tower, the Louvre Museum, and the Arc de Triomphe.  

Would you like recommendations for things to do in Paris? 😊
--------------------------------------------------
👤 Human: Can you help me write a simple Python function?
🤖 Bot: The capital of France is **Paris**. It is k

In [None]:
# Step 9: Test different payload formats with OpenAI-compatible API
# Sends three request shapes to the first working provider/model pair.
if not ('working_configs' in locals() and working_configs):
    print("❌ No working configurations to test payload formats")
    print("💡 First get a working provider/model combination from Step 6")
else:
    provider, model = working_configs[0]
    print(f"🧪 Testing different payload formats with: {provider}/{model}\n")

    api_url = f"https://router.huggingface.co/{provider}/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
    }

    payloads = [
        # Format 1: Basic chat completion
        dict(
            model=model,
            messages=[{"role": "user", "content": "Hello!"}],
            max_tokens=50,
        ),
        # Format 2: With temperature and other parameters
        dict(
            model=model,
            messages=[{"role": "user", "content": "Tell me a joke"}],
            max_tokens=100,
            temperature=0.7,
            stream=False,
        ),
        # Format 3: Multi-turn conversation
        dict(
            model=model,
            messages=[
                {"role": "user", "content": "Hello!"},
                {"role": "assistant", "content": "Hi there! How can I help you?"},
                {"role": "user", "content": "What's 2+2?"},
            ],
            max_tokens=50,
        ),
    ]

    for format_no, payload in enumerate(payloads, start=1):
        print(f"Format {format_no}: {json.dumps(payload, indent=2)}")

        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=30)
            print(f"Status: {response.status_code}")

            if response.status_code != 200:
                print(f"❌ HTTP Error: {response.text[:200]}")
            else:
                try:
                    content = response.json()["choices"][0]["message"]["content"]
                    print(f"✅ Success: {content}")
                except (json.JSONDecodeError, KeyError) as parse_err:
                    print(f"❌ JSON/Key Error: {parse_err}")
                    print(f"Raw response: {response.text[:200]}")

        except Exception as exc:
            # Anything else (transport errors, unexpected response shapes)
            print(f"❌ Exception: {exc}")

        print("-" * 60)

## 🎯 Final Recommendations for NEW Inference Providers System

Based on the tests above, here's how to use the **modern** Hugging Face system:

### 🚪 Migration from Old System
1. **Old URL**: `https://api-inference.huggingface.co/models/{model}` ❌ DEPRECATED
2. **New URL**: `https://router.huggingface.co/{provider}/v1/chat/completions` ✅ CURRENT
3. **New Format**: OpenAI-compatible chat completion API

### 💰 Billing & Access
1. **Free Tier**: Very limited quota for exploration
2. **PRO Subscription**: $9/month includes $2 worth of inference credits
3. **Provider Keys**: Use your own Together AI, Sambanova, etc. keys for direct billing

### 🔧 Implementation Options

**Option 1: InferenceClient (Recommended)**
```python
from huggingface_hub import InferenceClient
client = InferenceClient(api_key="your_hf_token")
completion = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

**Option 2: Direct HTTP (OpenAI-compatible)**
```python
requests.post("https://router.huggingface.co/together/v1/chat/completions", ...)
```

### 🎯 Best Practices
1. **Use InferenceClient** for easiest integration
2. **Handle provider fallbacks** (try multiple providers)
3. **Implement proper error handling** for quota/billing issues
4. **Use OpenAI chat completion format** for messages
5. **Consider getting PRO subscription** for reliable access

## 🔄 Migration Steps
1. Replace old API calls with new InferenceClient
2. Update payload format to OpenAI chat completion
3. Add provider fallback logic
4. Update error handling for new response format

In [15]:
# Step 10: Export working configuration for main app (NEW SYSTEM)

# Use the provider/model pairs found earlier in this session when there are
# any; otherwise fall back to an empty list and a default recommendation.
if 'working_configs' in locals() and working_configs:
    working_providers = working_configs
    recommended_config = {
        "provider": working_configs[0][0],
        "model": working_configs[0][1],
    }
else:
    working_providers = []
    recommended_config = {
        "provider": "auto",  # Let HF choose best provider
        "model": "deepseek-ai/DeepSeek-V3-0324",
    }

# Full configuration for the new Inference Providers system. Key order
# determines the layout of the printed and saved JSON.
config = {
    "system": "inference_providers",
    "migration_notes": {
        "old_url": "https://api-inference.huggingface.co/models/{model_id} (DEPRECATED)",
        "new_url": "https://router.huggingface.co/{provider}/v1/chat/completions",
        "format_change": "Now uses OpenAI-compatible chat completion API",
    },
    "recommended_approach": "huggingface_hub.InferenceClient",
    "billing": {
        "free_tier": "Very limited quota",
        "pro_subscription": "$9/month with $2 inference credits",
        "direct_provider": "Use provider API keys for direct billing",
    },
    "dependencies": ["huggingface_hub", "requests", "python-dotenv"],
    "environment_variables": {
        "HUGGINGFACE_API_TOKEN": "Required - get from hf.co/settings/tokens",
    },
    "working_providers": working_providers,
    "recommended_config": recommended_config,
    # Implementation examples for both access styles
    "implementation": {
        "inference_client": {
            "library": "huggingface_hub",
            "example": "InferenceClient(api_key=token).chat.completions.create(model=model, messages=messages)",
        },
        "direct_http": {
            "url_template": "https://router.huggingface.co/{provider}/v1/chat/completions",
            "method": "POST",
            "headers": {"Authorization": "Bearer {token}", "Content-Type": "application/json"},
            "payload_format": {
                "model": "{model_id}",
                "messages": [{"role": "user", "content": "{prompt}"}],
                "max_tokens": 100,
            },
        },
    },
}

# Serialize once; the same text is shown and written to disk.
config_json = json.dumps(config, indent=2)

print("📋 Updated Configuration for NEW Inference Providers system:")
print(config_json)

# Save to file for easy reference
with open("hf_inference_providers_config.json", "w") as config_file:
    config_file.write(config_json)

print(
    "\n✅ Configuration saved to hf_inference_providers_config.json",
    "\n🚨 IMPORTANT: Update your main FastAPI app to use this new system!",
    "💡 Key changes needed:",
    "  1. Install: pip install huggingface_hub",
    "  2. Replace old API calls with InferenceClient",
    "  3. Update to OpenAI chat completion format",
    "  4. Handle new error responses and billing limits",
    sep="\n",
)

📋 Updated Configuration for NEW Inference Providers system:
{
  "system": "inference_providers",
  "migration_notes": {
    "old_url": "https://api-inference.huggingface.co/models/{model_id} (DEPRECATED)",
    "new_url": "https://router.huggingface.co/{provider}/v1/chat/completions",
    "format_change": "Now uses OpenAI-compatible chat completion API"
  },
  "recommended_approach": "huggingface_hub.InferenceClient",
  "billing": {
    "free_tier": "Very limited quota",
    "pro_subscription": "$9/month with $2 inference credits",
    "direct_provider": "Use provider API keys for direct billing"
  },
  "dependencies": [
    "huggingface_hub",
    "requests",
    "python-dotenv"
  ],
  "environment_variables": {
    "HUGGINGFACE_API_TOKEN": "Required - get from hf.co/settings/tokens"
  },
  "working_providers": [],
  "recommended_config": {
    "provider": "auto",
    "model": "deepseek-ai/DeepSeek-V3-0324"
  },
  "implementation": {
    "inference_client": {
      "library": "huggingfa

## 🎆 Summary: Migration Complete!

**The old Hugging Face serverless inference API is deprecated.** This notebook now uses the **NEW Inference Providers system**.

### 🛠️ What You Need To Do:

1. **Install Dependencies**:
   ```bash
   pip install huggingface_hub python-dotenv requests
   ```

2. **Update Your FastAPI App** to use:
   - `InferenceClient` from `huggingface_hub`
   - OpenAI-compatible chat completion format
   - New error handling for billing/quota limits

3. **Consider Billing**:
   - Free tier has very limited quota
   - PRO subscription ($9/month) includes $2 inference credits
   - Or use your own provider API keys

4. **Update Environment**:
   - Your HF token should work
   - May need PRO subscription for reliable access

### 🚀 Ready for Production
Once you've run this notebook and found working configurations, update your main FastAPI application using the exported configuration file.