In [None]:
pip install scikit-learn langgraph openai


## Importing libraries

In [None]:
import json
import os
import pickle
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any, List, Literal

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from typing_extensions import TypedDict
#from sentence_transformers import SentenceTransformer


## Define the state and the result object


In [81]:
@dataclass
class RoutingResult:
    """Outcome of classifying a single customer-service query.

    Fields are stored exactly as passed in; nothing is validated or coerced
    at construction time.
    """

    # Category label selected for the query.
    route: Literal[
        "order_status",
        "product_info",
        "technical_support",
        "billing",
        "general",
    ]
    # Confidence score reported together with the route.
    confidence: float
    # Label of the technique that produced the route (the prompts in this notebook request "llm").
    method: str

## Loading API keys

In [None]:
# Load variables from .env
load_dotenv()


def _require_env(name: str) -> str:
    """Return environment variable `name`, raising if it is unset or empty.

    Passing `api_key=None` to `OpenAI(...)` makes the client fall back to the
    OPENAI_API_KEY environment variable. Without this check a missing Groq or
    Gemini key would silently send the OpenAI key to a third-party endpoint.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


# 1. OpenAI
openai_client = OpenAI(api_key=_require_env("OPENAI_API_KEY"))

# 2. Groq (OpenAI-compatible endpoint)
groq_client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=_require_env("GROQ_API_KEY"),
)

# 3. Gemini (OpenAI-compatible endpoint)
gemini_client = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=_require_env("GEMINI_API_KEY"),
)

# Reached only when all three keys were present; no request has been sent yet.
print(" All clients (OpenAI, Groq, Gemini) initialized successfully!")


✅ All clients (OpenAI, Groq, Gemini) initialized successfully!


## LLM Routing

In [None]:
def llm_based_routing_openAI(query: str) -> RoutingResult:
    """Classify a customer-service query with OpenAI's gpt-4o-mini.

    Sends a single-turn classification prompt through `openai_client`, strips
    any Markdown code fences from the reply, and parses the remaining text as
    JSON into a `RoutingResult`.

    Args:
        query: Raw customer message; interpolated verbatim into the prompt.

    Returns:
        A `RoutingResult` built from the keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
        TypeError: If the JSON keys do not match the `RoutingResult` fields.
    """
    prompt = f"""
    Analyze the following customer service query and classify it into exactly one category.
    
    Query: "{query}"
    
    Categories:
    - order_status: Questions about order tracking, delivery, shipping status
    - product_info: Questions about product specifications, availability, features
    - technical_support: Technical issues, troubleshooting, bugs, problems
    - billing: Payment, refund, billing, invoice questions
    - general: General questions or anything that doesn't fit other categories
    
    Respond JSON format: {{"route": "", "confidence": 1, "method": "llm"}}
    """

    response = openai_client.chat.completions.create(
        # The published model ID is lowercase; "GPT-4o-mini" is not a listed ID.
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # Get content and clean it
    response_content = response.choices[0].message.content

    # Handle markdown backticks if they exist
    response_content = response_content.replace("```json", "").replace("```", "").strip()

    # Parse and return
    response_data = json.loads(response_content)
    return RoutingResult(**response_data)

In [83]:
def llm_based_routing_llama(query: str) -> RoutingResult:
    """Classify a customer-service query with Llama 3.3 70B via `groq_client`.

    A system message asks for JSON-only output and the user message carries
    the classification prompt. Markdown code fences are removed from the reply
    before it is parsed as JSON and unpacked into a `RoutingResult`.

    Args:
        query: Raw customer message; interpolated verbatim into the prompt.

    Returns:
        A `RoutingResult` built from the keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
        TypeError: If the JSON keys do not match the `RoutingResult` fields.
    """
    classification_prompt = f"""
    Analyze the following customer service query and classify it into exactly one category.
    
    Query: "{query}"
    
    Categories:
    - order_status: Questions about order tracking, delivery, shipping status
    - product_info: Questions about product specifications, availability, features
    - technical_support: Technical issues, troubleshooting, bugs, problems
    - billing: Payment, refund, billing, invoice questions
    - general: General questions or anything that doesn't fit other categories
    
    Respond JSON format: {{"route": "", "confidence": 1, "method": "llm"}}
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that classifies customer service queries. Respond ONLY with JSON."},
        {"role": "user", "content": classification_prompt},
    ]
    completion = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=chat_messages,
        max_tokens=100,
        temperature=0,
    )

    raw_reply = completion.choices[0].message.content

    # Remove the opening fence first, then any remaining bare fences.
    for fence in ("```json", "```"):
        raw_reply = raw_reply.replace(fence, "")

    parsed_fields = json.loads(raw_reply.strip())
    return RoutingResult(**parsed_fields)

In [87]:
def llm_based_routing_gemini(query: str) -> RoutingResult:
    """Classify a customer-service query with gemini-2.5-flash-lite via `gemini_client`.

    Sends a single-turn classification prompt, removes Markdown code fences
    from the reply, and parses what is left as JSON into a `RoutingResult`.

    Args:
        query: Raw customer message; interpolated verbatim into the prompt.

    Returns:
        A `RoutingResult` built from the keys of the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
        TypeError: If the JSON keys do not match the `RoutingResult` fields.
    """
    routing_prompt = f"""
    Analyze the following customer service query and classify it into exactly one category.
    
    Query: "{query}"
    
    Categories:
    - order_status: Questions about order tracking, delivery, shipping status
    - product_info: Questions about product specifications, availability, features
    - technical_support: Technical issues, troubleshooting, bugs, problems
    - billing: Payment, refund, billing, invoice questions
    - general: General questions or anything that doesn't fit other categories
    
    Respond JSON format: {{"route": "", "confidence": 1, "method": "llm"}}
    """

    request_kwargs = dict(
        model="gemini-2.5-flash-lite",
        messages=[{"role": "user", "content": routing_prompt}],
        temperature=0,
    )
    completion = gemini_client.chat.completions.create(**request_kwargs)

    # Pull out the text of the first choice.
    reply_text = completion.choices[0].message.content

    # Drop Markdown fences in case the JSON was wrapped in a code block.
    without_open_fence = reply_text.replace("```json", "")
    without_fences = without_open_fence.replace("```", "")
    payload = json.loads(without_fences.strip())

    return RoutingResult(**payload)

## Smoke-testing the prompts before the full evaluation

In [88]:
# Smoke test: send one sample query through the Gemini-backed router and print the parsed result.
print(llm_based_routing_gemini("I have an issue with the checkout page?"))


RoutingResult(route='technical_support', confidence=1, method='llm')


In [86]:
# Smoke test: send the same sample query through the Groq-hosted Llama router and print the parsed result.
print(llm_based_routing_llama("I have an issue with the checkout page?"))


RoutingResult(route='technical_support', confidence=1, method='llm')


## Testing

In [89]:
# Labelled evaluation set: each entry pairs a customer query with the route
# a correct classifier is expected to return.
test_dataset = [
    {"query": query_text, "expected": expected_route}
    for query_text, expected_route in (
        ("Where is my package ORD123?", "order_status"),
        ("How do I return a broken item?", "general"),  # Or FAQ
        ("My screen is flickering when I open the app", "technical_support"),
        ("Can I pay with PayPal?", "billing"),
        ("Tell me about your latest laptop specs", "product_info"),
        ("I was charged twice for my last month", "billing"),
        ("Why is the website so slow today?", "technical_support"),
    )
]

In [None]:
class LLMRouterEvaluator:
    """Run several routing functions over a labelled dataset and compare them.

    Attributes:
        routing_functions: Mapping of display name -> callable. Each callable
            takes a query string and returns an object exposing `.route` and
            `.confidence`.
        results: List of per-call result dicts. It grows on every
            `run_evaluation` call and is never cleared automatically.
    """

    # Column order of the table returned by get_summary().
    _SUMMARY_COLUMNS = ["model", "accuracy", "avg_latency", "avg_confidence"]

    def __init__(self, routing_functions: dict):
        self.routing_functions = routing_functions
        self.results = []

    def run_evaluation(self, dataset):
        """Call every routing function on every dataset item and record the outcome.

        A call that raises is still recorded (as incorrect, with `actual=None`,
        NaN confidence and an extra `error` field), so failures lower the
        reported accuracy instead of being dropped from it.

        Args:
            dataset: Iterable of dicts with `query` and `expected` keys.

        Raises:
            KeyError: If a dataset item lacks `query` or `expected`.
        """
        for model_name, routing_fn in self.routing_functions.items():
            print(f"Testing {model_name}...")
            for item in dataset:
                query, expected = item['query'], item['expected']
                error = None
                # perf_counter is monotonic, so system clock adjustments cannot skew latency.
                start_time = time.perf_counter()
                try:
                    # Run the routing logic
                    result = routing_fn(query)
                    actual, confidence = result.route, result.confidence
                except Exception as e:
                    print(f"Error with {model_name} on query '{item['query']}': {e}")
                    actual, confidence, error = None, float("nan"), repr(e)
                latency = time.perf_counter() - start_time

                record = {
                    "timestamp": datetime.now().isoformat(),
                    "model": model_name,
                    "query": query,
                    "expected": expected,
                    "actual": actual,
                    "correct": error is None and actual == expected,
                    "latency": latency,
                    "confidence": confidence
                }
                # Only failed rows carry the extra key; successful rows keep the original schema.
                if error is not None:
                    record["error"] = error
                self.results.append(record)

    def get_summary(self):
        """Aggregate the recorded results into one row per model.

        Returns:
            A DataFrame with columns `model`, `accuracy` (mean of `correct`),
            `avg_latency` and `avg_confidence`. It is empty, with the same
            columns, when nothing has been recorded yet.
        """
        if not self.results:
            return pd.DataFrame(columns=self._SUMMARY_COLUMNS)

        df = pd.DataFrame(self.results)
        # Confidence comes from parsed model output and may not be numeric;
        # coerce unparseable values to NaN so the mean still works.
        df = df.assign(confidence=pd.to_numeric(df["confidence"], errors="coerce"))
        # Calculate metrics per model
        summary = df.groupby("model").agg(
            accuracy=("correct", "mean"),
            avg_latency=("latency", "mean"),
            avg_confidence=("confidence", "mean")
        ).reset_index()
        return summary

In [None]:
# Display label -> routing function for each model being compared.
# NOTE(review): llm_based_routing_openAI is defined above but not included here — confirm that is intentional.
models_to_test = {
    "Llama-3.3": llm_based_routing_llama,
    "gemini-2.5-flash-lite": llm_based_routing_gemini
}

# Run every routing function over the labelled queries; this issues live API calls.
evaluator = LLMRouterEvaluator(models_to_test)
evaluator.run_evaluation(test_dataset)

# Display the comparison table
summary_table = evaluator.get_summary()
print(summary_table)


Testing Llama-3.3...
Testing gemini-2.5-flash-lite...
                   model  accuracy  avg_latency  avg_confidence
0              Llama-3.3       1.0     0.130204             1.0
1  gemini-2.5-flash-lite       1.0     0.427114             1.0


In [None]:
# Build a DataFrame from the evaluator's per-call records (one row per recorded routing call).
full_df = pd.DataFrame(evaluator.results)

# Bare expression so the notebook renders the frame as the cell output.
full_df

Unnamed: 0,timestamp,model,query,expected,actual,correct,latency,confidence
0,2025-12-16T16:13:04.550421,Llama-3.3,Where is my package ORD123?,order_status,order_status,True,0.137178,1
1,2025-12-16T16:13:04.682797,Llama-3.3,How do I return a broken item?,general,general,True,0.13234,1
2,2025-12-16T16:13:04.819795,Llama-3.3,My screen is flickering when I open the app,technical_support,technical_support,True,0.136964,1
3,2025-12-16T16:13:04.935616,Llama-3.3,Can I pay with PayPal?,billing,billing,True,0.115789,1
4,2025-12-16T16:13:05.073138,Llama-3.3,Tell me about your latest laptop specs,product_info,product_info,True,0.137486,1
5,2025-12-16T16:13:05.207916,Llama-3.3,I was charged twice for my last month,billing,billing,True,0.134745,1
6,2025-12-16T16:13:05.324876,Llama-3.3,Why is the website so slow today?,technical_support,technical_support,True,0.116926,1
7,2025-12-16T16:13:05.829776,gemini-2.5-flash-lite,Where is my package ORD123?,order_status,order_status,True,0.504764,1
8,2025-12-16T16:13:06.259383,gemini-2.5-flash-lite,How do I return a broken item?,general,general,True,0.429578,1
9,2025-12-16T16:13:06.694407,gemini-2.5-flash-lite,My screen is flickering when I open the app,technical_support,technical_support,True,0.435003,1


## Conclusion & Model Recommendation

After evaluating the performance of **Llama-3.3-70b (via Groq)** and **Gemini-2.5-Flash-Lite (via Google)** across our test suite, the following conclusions were drawn:

### 🏆 The Winner: Llama-3.3-70b (on Groq)
**Llama-3.3** is the recommended model for this routing framework due to its superior balance of speed and reliability.

#### Key Findings:
1. **Unmatched Speed:** Llama-3.3 achieved an average latency of **~0.13s**, making it **3.3x faster** than Gemini-2.5-Flash-Lite (~0.43s). For a real-time customer service bot, this sub-200ms response time is critical for a "snappy" user experience.
2. **Perfect Accuracy:** Both models achieved **100% accuracy** on the provided seven-query test set, suggesting that even "lighter" or open-weights models are capable of handling intent classification across these five categories. With so few examples, treat this as a sanity check rather than a benchmark.
3. **Infrastructure Reliability:** Llama (via Groq) handled burst requests without any rate-limiting issues. In contrast, the standard Gemini-2.5-Flash reached its free-tier quota almost immediately (20 requests/day), requiring a pivot to the **Flash-Lite** version to complete testing.

### 📊 Performance Summary
| Metric | Llama-3.3 (Groq) | Gemini-2.5-Flash-Lite |
| :--- | :--- | :--- |
| **Accuracy** | 100% | 100% |
| **Avg. Latency** | **0.130s** ⚡ | 0.427s |
| **Reliability** | High (No 429 Errors) | Medium (Requires Lite version) |

### Final Recommendation
For production-level customer service routing, **Llama-3.3 on Groq** is the best choice based on these results. It provides the low-latency performance required for routing logic while matching the accuracy of the proprietary model on this test set. **Gemini-2.5-Flash-Lite** remains a high-quality alternative if your infrastructure is already built within the Google Cloud/Firebase ecosystem, provided that rate limits are managed via exponential backoff.