# LLM Performance & Energy Measurement Tool

**Compares LLM providers on token usage and response time, and estimates the energy consumed per prompt**

## Setup Instructions for Replication:

1. **Install dependencies**: `pip install -r requirements.txt`
2. **Create `.env` file** with your API keys:
   ```
   OPENAI_API_KEY=your_openai_key_here
   GROQ_API_KEY=your_groq_key_here  
   MISTRAL_API_KEY=your_mistral_key_here
   ```
3. **Run all cells** to start performance testing
4. **Results automatically saved** to `data/energy.json`

## What this measures:
- Token usage (input/output/total) from real API responses
- Response time and tokens per second
- Performance comparison across multiple LLM providers
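- Token counts and timings come from actual API responses; energy figures are estimates derived from token counts using fixed per-token constants, not measured values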

In [258]:
# Import dependencies and initialize
import pandas as pd, openai, os, json, time, requests
from dotenv import load_dotenv

# Load variables (the API keys described in the setup notes) from the local .env file
load_dotenv()
# Banner so the notebook output shows the cell ran
print("LLM Performance & Energy Measurement Tool")

LLM Performance & Energy Measurement Tool


In [259]:
# Read the prompt table; the first time through, add a 'processed' flag
# column (0 = not yet run) and write it back so progress survives restarts
data = pd.read_json("data/sample.json")
if 'processed' not in data:
    data['processed'] = 0
    data.to_json("data/sample.json", orient='records', indent=2)

# Pick up results from earlier runs; a missing file just means a fresh start
try:
    f = open("data/energy.json", "r")
except FileNotFoundError:
    results = []
else:
    with f:
        results = json.load(f)

print(f"Prompts: {data.shape[0]}, Processed: {data['processed'].sum()}")

Prompts: 100, Processed: 0


In [260]:
# Build the provider registry from whichever API keys are present.
# override=True lets values in .env replace variables already set in the process.
load_dotenv(override=True)

clients = {}
models_to_test = []

# OpenAI goes through the SDK, so a client object is stored
openai_key = os.getenv("OPENAI_API_KEY")
if openai_key:
    clients['openai'] = openai.OpenAI(api_key=openai_key)
    models_to_test.append("openai")

# Groq (llama) and Mistral are called over plain HTTP, so only the key is stored
groq_key = os.getenv("GROQ_API_KEY")
if groq_key:
    clients['llama'] = groq_key
    models_to_test.append("llama")

mistral_key = os.getenv("MISTRAL_API_KEY")
if mistral_key:
    clients['mistral'] = mistral_key
    models_to_test.append("mistral")

print(f"Available models: {models_to_test}")


Available models: ['openai', 'llama', 'mistral']


In [261]:
# Organized API data tracking with standardized structure
# Estimated energy cost in kWh per token, keyed by model label. These are
# fixed estimates, not measurements; unknown labels use the default.
_DEFAULT_ENERGY_KWH_PER_TOKEN = 0.0001
_ENERGY_KWH_PER_TOKEN = {
    "openai": 0.0001,
    "llama": 0.00005,
    "mistral": 0.00008,
}


def track_performance(model_name, prompt, api_call_func):
    """Time one API call and normalise its response into a standard record.

    Args:
        model_name: Label stored as both ``name`` and ``api_provider``; it
            also selects the per-token energy estimate.
        prompt: Prompt text, passed unchanged to ``api_call_func``.
        api_call_func: Callable that takes the prompt and returns a dict in
            the chat-completions layout (``usage``, ``choices``, ``model``,
            ``id``, ``created``). Missing or ``None`` sections are tolerated.

    Returns:
        On success, a dict with ``model_info``, ``performance``,
        ``usage_metrics``, ``response_data`` and ``raw_api_response``.
        If the call or the normalisation raises, a dict with ``model_info``
        (``success`` False) and ``error`` instead; the exception is not
        re-raised.
    """
    # Wall-clock stamps are kept for the record; the elapsed time itself is
    # taken from the monotonic perf_counter so clock adjustments cannot skew it.
    start_time = time.time()
    tick = time.perf_counter()
    try:
        response_data = api_call_func(prompt)
        duration = time.perf_counter() - tick
        end_time = time.time()

        # Standardize data structure regardless of API differences
        standardized_data = {
            "model_info": {
                "name": model_name,
                "api_provider": model_name,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "success": True
            },
            "performance": {
                "duration_seconds": round(duration, 3),
                "start_time": start_time,
                "end_time": end_time
            },
            "usage_metrics": {},
            "response_data": {},
            "raw_api_response": response_data  # Keep original for reference
        }

        # Usage field names differ between APIs: accept both the
        # prompt/completion and the input/output spellings.
        usage_data = response_data.get("usage", {})
        if usage_data:
            prompt_tokens = usage_data.get("prompt_tokens", usage_data.get("input_tokens", 0))
            completion_tokens = usage_data.get("completion_tokens", usage_data.get("output_tokens", 0))
            total_tokens = usage_data.get("total_tokens", prompt_tokens + completion_tokens)

            kwh_per_token = _ENERGY_KWH_PER_TOKEN.get(model_name, _DEFAULT_ENERGY_KWH_PER_TOKEN)
            estimated_energy = total_tokens * kwh_per_token

            standardized_data["usage_metrics"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
                "tokens_per_second": round(total_tokens / duration, 2) if duration > 0 else 0,
                "estimated_energy_kwh": round(estimated_energy, 6),
                "energy_per_token": round(estimated_energy / total_tokens, 8) if total_tokens > 0 else 0
            }

        # Only the first choice is recorded
        choices = response_data.get("choices", [])
        if choices:
            choice = choices[0]
            message = choice.get("message", {})
            standardized_data["response_data"] = {
                "content": message.get("content", ""),
                "role": message.get("role", "assistant"),
                "finish_reason": choice.get("finish_reason", "unknown"),
                "index": choice.get("index", 0)
            }

        # Copy model-specific metadata when the response carries it
        for source_key, target_key in (
            ("model", "api_model_name"),
            ("id", "request_id"),
            ("created", "api_created"),
        ):
            if source_key in response_data:
                standardized_data["model_info"][target_key] = response_data[source_key]

        return standardized_data

    except Exception as e:
        # Deliberately broad: any failure becomes a failure record so one bad
        # call does not stop the whole run.
        return {
            "model_info": {
                "name": model_name,
                "api_provider": model_name,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "success": False
            },
            "error": {
                "message": str(e),
                "type": type(e).__name__,
                "duration_seconds": round(time.perf_counter() - tick, 3)
            }
        }


In [262]:
# Fast, cheap API calls - capture all available data
def call_openai(prompt):
    """Send ``prompt`` to gpt-4o-mini and return the reply as a plain dict.

    The SDK response object is flattened into JSON-serialisable dicts using
    the same keys as the raw chat-completions payload (``usage``,
    ``choices``, ``model``, ``id``, ``created``, ``object``).

    Args:
        prompt: User message text.

    Returns:
        Dict of the fields above. ``usage`` or ``choices`` is ``None`` when
        the response does not carry that section.
    """
    response = clients['openai'].chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,  # Kept small for speed/cost
        temperature=0.3,
    )

    # Check the values, not just attribute presence: an attribute that exists
    # but is None would otherwise raise instead of yielding None.
    usage = getattr(response, 'usage', None)
    choices = getattr(response, 'choices', None)

    return {
        "usage": {
            "prompt_tokens": usage.prompt_tokens,
            "completion_tokens": usage.completion_tokens,
            "total_tokens": usage.total_tokens
        } if usage is not None else None,
        "choices": [{
            "index": choice.index,
            "message": {
                "role": choice.message.role,
                "content": choice.message.content
            },
            "finish_reason": choice.finish_reason
        } for choice in choices] if choices is not None else None,
        "model": getattr(response, 'model', None),
        "id": getattr(response, 'id', None),
        "created": getattr(response, 'created', None),
        "object": getattr(response, 'object', None)
    }

def call_llama(prompt, timeout=60):
    """Send ``prompt`` to llama-3.1-8b-instant on Groq's chat-completions endpoint.

    Args:
        prompt: User message text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the whole run.

    Returns:
        The decoded JSON response body.

    Raises:
        Exception: If the HTTP status is not 200. The message includes the
            status code and the start of the response body.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {clients['llama']}"},
        json={
            "model": "llama-3.1-8b-instant",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 50,  # Kept small for speed/cost
            "temperature": 0.3,
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        # Keep part of the body: the status code alone hides the cause
        raise Exception(f"API Error: {response.status_code} - {response.text[:200]}")
    return response.json()  # Return complete response

def call_mistral(prompt, timeout=60):
    """Send ``prompt`` to mistral-large-latest on Mistral's chat-completions endpoint.

    Args:
        prompt: User message text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the whole run.

    Returns:
        The decoded JSON response body.

    Raises:
        Exception: If the HTTP status is not 200, or the body contains an
            ``error`` key. The message includes the status code and the start
            of the response body, or the error payload.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.post(
        "https://api.mistral.ai/v1/chat/completions",
        headers={"Authorization": f"Bearer {clients['mistral']}"},
        json={
            "model": "mistral-large-latest",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 50,  # Kept small for speed/cost
            "temperature": 0.3,
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        # Keep part of the body: the status code alone hides the cause
        raise Exception(f"API Error: {response.status_code} - {response.text[:200]}")
    resp_json = response.json()
    # A 200 response can still carry an error payload
    if "error" in resp_json:
        raise Exception(f"API Error: {resp_json['error']}")
    return resp_json  # Return complete response


In [263]:
# Simplified prompt-centric data collection
def run_performance_tests(max_prompts=100):
    """Run every configured model against up to ``max_prompts`` unprocessed prompts.

    Each prompt is sent to every model in ``models_to_test`` through
    ``track_performance``. Finished prompts are flagged in ``data`` and
    appended to ``results``; both are then written to disk
    (``data/sample.json`` and ``data/energy.json``).

    A failure whose message mentions "quota" or "billing" aborts the run.
    Prompts completed before the abort are still saved; the prompt in
    progress is discarded and stays unprocessed so it is retried next time.

    Args:
        max_prompts: Maximum number of unprocessed prompts to handle.

    Returns:
        The full result list (earlier results plus this run), or ``None``
        when there was nothing to process or the run was aborted.
    """
    global data, results

    unprocessed = data[data['processed'] == 0].head(max_prompts)
    if len(unprocessed) == 0:
        print("All prompts processed")
        return None

    print(f"Processing {len(unprocessed)} prompts with {len(models_to_test)} models")

    # One experiment id tags every prompt handled in this invocation
    experiment_id = f"exp_{int(time.time())}"
    new_prompts = []
    api_calls = {"openai": call_openai, "llama": call_llama, "mistral": call_mistral}
    aborted = False

    for idx, row in unprocessed.iterrows():
        prompt_text = row['prompt_text']
        prompt_data = {
            "prompt_id": f"prompt_{idx}",
            "prompt_text": prompt_text,
            "experiment_id": experiment_id,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "models": {}
        }

        for model_name in models_to_test:
            try:
                result = track_performance(model_name, prompt_text, api_calls[model_name])
            except Exception as e:
                # track_performance reports API failures itself; this only
                # covers errors raised outside it (e.g. an unknown model name).
                result = {
                    "model_info": {"name": model_name, "success": False, "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")},
                    "error": {"message": str(e), "type": type(e).__name__}
                }
            prompt_data["models"][model_name] = result

            if result['model_info']['success']:
                usage = result.get('usage_metrics', {})
                tokens = usage.get('total_tokens', 0)
                duration = result['performance']['duration_seconds']
                print(f"{model_name}: {tokens} tokens, {duration:.1f}s")
            else:
                # Failures come back as records, not exceptions, so the
                # quota/billing check has to read the recorded message.
                message = str(result.get('error', {}).get('message', ''))
                if "quota" in message.lower() or "billing" in message.lower():
                    print(f"CRITICAL: {model_name} - {message[:50]}...")
                    aborted = True
                    break
            time.sleep(0.1)  # Short pause between consecutive API calls

        if aborted:
            break

        new_prompts.append(prompt_data)
        data.loc[idx, 'processed'] = 1

    # Persist progress. Rebinding the global keeps later calls in the same
    # session from overwriting energy.json without this batch.
    data.to_json("data/sample.json", orient='records', indent=2)
    results = results + new_prompts
    with open("data/energy.json", "w") as f:
        json.dump(results, f, indent=2)

    succeeded = [
        model_result
        for prompt in new_prompts
        for model_result in prompt["models"].values()
        if model_result.get('model_info', {}).get('success', False)
    ]
    successful = len(succeeded)
    total_tokens = sum(m.get('usage_metrics', {}).get('total_tokens', 0) for m in succeeded)

    print(f"Completed: {successful} successful calls, {total_tokens:,} total tokens")
    return None if aborted else results

# Reset all prompts to unprocessed (run this first if you want to start fresh)
def reset_all_prompts():
    """Flag every prompt as unprocessed and write the table back to data/sample.json.

    Only the 'processed' column of the global ``data`` frame is changed.
    """
    global data
    data['processed'] = 0
    data.to_json("data/sample.json", orient='records', indent=2)
    print("All prompts reset to unprocessed")

# Uncomment the line below to reset all prompts to unprocessed
# reset_all_prompts()

# Run the tests only when at least one provider was registered in models_to_test
if models_to_test:
    # max_prompts caps how many unprocessed prompts are handled in this run
    all_results = run_performance_tests(max_prompts=100)


Processing 100 prompts with 3 models
openai: 68 tokens, 1.5s
llama: 96 tokens, 0.4s
mistral: 64 tokens, 18.5s
openai: 102 tokens, 0.9s
llama: 130 tokens, 0.4s
mistral: 98 tokens, 1.8s
openai: 133 tokens, 1.0s
llama: 161 tokens, 0.4s
mistral: 130 tokens, 2.0s
openai: 90 tokens, 1.2s
llama: 118 tokens, 0.4s
mistral: 88 tokens, 2.8s
openai: 90 tokens, 0.9s
llama: 118 tokens, 0.4s
mistral: 88 tokens, 11.7s
openai: 114 tokens, 1.2s
llama: 142 tokens, 0.5s
mistral: 112 tokens, 2.9s
openai: 283 tokens, 1.1s
llama: 312 tokens, 0.4s
mistral: 285 tokens, 3.1s
openai: 119 tokens, 0.6s
llama: 176 tokens, 0.4s
mistral: 153 tokens, 2.8s
openai: 40 tokens, 0.7s
llama: 94 tokens, 0.4s
mistral: 61 tokens, 7.4s
openai: 188 tokens, 1.1s
llama: 217 tokens, 0.4s
mistral: 192 tokens, 3.6s
openai: 94 tokens, 1.6s
llama: 124 tokens, 0.4s
mistral: 97 tokens, 15.4s
openai: 48 tokens, 0.7s
llama: 95 tokens, 0.4s
mistral: 63 tokens, 4.5s
openai: 82 tokens, 1.0s
llama: 124 tokens, 0.4s
mistral: 88 tokens, 14.2s
op

In [264]:
# Summarise how many prompts have been processed so far
processed_flags = data['processed']
status_lines = [
    "Current status:",
    f"  Total prompts: {len(data)}",
    f"  Processed: {processed_flags.sum()}",
    f"  Unprocessed: {(processed_flags == 0).sum()}",
    "  To process all: uncomment reset_all_prompts() above, then run the performance tests",
]
print("\n".join(status_lines))


Current status:
  Total prompts: 100
  Processed: 100
  Unprocessed: 0
  To process all: uncomment reset_all_prompts() above, then run the performance tests


In [265]:
# Aggregate successful calls per model: call count, tokens, duration and
# estimated energy, then print a per-model and an overall summary.
if 'all_results' in locals() and all_results:
    print("=== RESEARCH ANALYSIS ===")

    # Flatten prompts -> successful per-model records
    all_model_results = [
        model_result
        for prompt in all_results
        for model_result in prompt.get("models", {}).values()
        if model_result.get('model_info', {}).get('success', False)
    ]

    if all_model_results:
        print(f"Total prompts: {len(all_results)}")
        print(f"Successful calls: {len(all_model_results)}")

        # Single pass: energy is accumulated with the other totals, so it is
        # grouped under the same key (including the 'unknown' fallback)
        # instead of rescanning every record once per model.
        model_stats = {}
        for result in all_model_results:
            model_name = result.get('model_info', {}).get('name', 'unknown')
            stats = model_stats.setdefault(
                model_name,
                {'total_tokens': 0, 'total_duration': 0, 'calls': 0, 'total_energy': 0},
            )
            usage = result.get('usage_metrics', {})
            stats['total_tokens'] += usage.get('total_tokens', 0)
            stats['total_energy'] += usage.get('estimated_energy_kwh', 0)
            stats['total_duration'] += result.get('performance', {}).get('duration_seconds', 0)
            stats['calls'] += 1

        # Display results with energy metrics
        print("\nMODEL PERFORMANCE:")
        total_energy = 0
        for model_name, stats in model_stats.items():
            # Tokens per second over the summed call durations
            avg_speed = stats['total_tokens'] / stats['total_duration'] if stats['total_duration'] > 0 else 0
            total_energy += stats['total_energy']
            print(f"  {model_name.upper()}: {stats['calls']} calls, {stats['total_tokens']:,} tokens, {avg_speed:.1f} tokens/sec, {stats['total_energy']:.6f} kWh")

        total_tokens = sum(stats['total_tokens'] for stats in model_stats.values())
        print(f"\nTOTAL: {total_tokens:,} tokens, {total_energy:.6f} kWh across {len(all_results)} prompts")


=== RESEARCH ANALYSIS ===
Total prompts: 100
Successful calls: 299

MODEL PERFORMANCE:
  OPENAI: 100 calls, 14,744 tokens, 146.6 tokens/sec, 1.474400 kWh
  LLAMA: 100 calls, 18,189 tokens, 412.8 tokens/sec, 0.909450 kWh
  MISTRAL: 99 calls, 14,944 tokens, 29.1 tokens/sec, 1.195520 kWh

TOTAL: 47,877 tokens, 3.579370 kWh across 100 prompts


In [266]:
# Show what one saved prompt record looks like, using the first result
if not ('all_results' in locals() and all_results):
    print("No results available yet. Run the performance tests first.")
else:
    print("=== ENERGY.JSON STRUCTURE ===")

    sample_prompt = all_results[0]
    print("SAMPLE PROMPT:")
    print(f"  - ID: {sample_prompt['prompt_id']}")
    print(f"  - Text: {sample_prompt['prompt_text'][:100]}...")

    print("\nMODEL RESULTS:")
    for model_name, model_result in sample_prompt['models'].items():
        label = model_name.upper()

        # Failed calls only carry an error message
        if not model_result.get('model_info', {}).get('success', False):
            failure = model_result.get('error', {}).get('message', 'Unknown error')
            print(f"  {label}: FAILED - {failure}")
            continue

        print(f"  {label}:")
        print(f"    - Duration: {model_result['performance']['duration_seconds']}s")

        usage = model_result.get('usage_metrics', {})
        if usage:
            print(f"    - Tokens: {usage.get('total_tokens', 0)}")
            print(f"    - Speed: {usage.get('tokens_per_second', 0):.1f} tokens/sec")

        response = model_result.get('response_data', {})
        if response.get('content'):
            print(f"    - Response: {response['content'][:80]}...")

    print("\nDATA STRUCTURE:")
    print(f"  - Total prompts: {len(all_results)}")
    print("  - Each prompt contains: prompt_id, prompt_text, models")
    print("  - Each model contains: model_info, performance, usage_metrics, response_data, raw_api_response")

=== ENERGY.JSON STRUCTURE ===
SAMPLE PROMPT:
  - ID: prompt_0
  - Text: how can identity protection services help protect me against identity theft...

MODEL RESULTS:
  OPENAI:
    - Duration: 1.483s
    - Tokens: 68
    - Speed: 45.9 tokens/sec
    - Response: Identity protection services can help safeguard you against identity theft in se...
  LLAMA:
    - Duration: 0.381s
    - Tokens: 96
    - Speed: 251.7 tokens/sec
    - Response: Identity protection services can help protect you against identity theft in seve...
  MISTRAL:
    - Duration: 18.522s
    - Tokens: 64
    - Speed: 3.5 tokens/sec
    - Response: Identity protection services can help safeguard you against identity theft by mo...

DATA STRUCTURE:
  - Total prompts: 100
  - Each prompt contains: prompt_id, prompt_text, models
  - Each model contains: model_info, performance, usage_metrics, response_data, raw_api_response
