# LLM Energy Consumption Analysis

**Comprehensive energy tracking for Large Language Models**

## Models Tested:
- **GPT-4o-mini** (OpenAI) - Fast, efficient
- **Claude 3.5 Sonnet** (Anthropic) - Advanced reasoning  
- **Mistral Large** (Mistral.ai) - Open-source power

## What This Does:
- Measures server-side energy consumption per token
- Tracks costs across different providers
- Compares model efficiency
- Generates comprehensive datasets
- **Easy setup** - Just add your API keys to `.env`
- **Robust error handling** - Automatically handles failures
- **Rich data collection** - Captures everything for analysis

In [189]:
# 🚀 Beautiful LLM Energy Analysis Setup
import pandas as pd
import openai
import anthropic
import os
import json
import time
import requests
from datetime import datetime
from dotenv import load_dotenv

print("🔋 LLM Energy Analysis System")
print("=" * 50)

🔋 LLM Energy Analysis System


In [190]:
# Load and prepare the dataset
data = pd.read_json("data/sample.json")

# Add processed column if it doesn't exist
if 'processed' not in data.columns:
    data['processed'] = 0
    data.to_json("data/sample.json", orient='records', indent=2)

# Load existing energy data
try:
    with open("data/energy.json", "r") as f:
        existing_energy = json.load(f)
except FileNotFoundError:
    existing_energy = []

print(f"Loaded {len(data)} prompts, {data['processed'].sum()} already processed")
print(f"Existing energy data: {len(existing_energy)} records")

Loaded 100 prompts, 4 already processed
Existing energy data: 0 records


In [191]:
# show sample of the data
data.head()

Unnamed: 0,prompt_text,processed
0,how can identity protection services help prot...,1
1,Beside OFAC's selective sanction that target t...,1
2,You are the text completion model and you must...,1
3,The sum of the perimeters of three equal squar...,1
4,What is the type of the variables in the follo...,0


In [192]:
# Get energy data from LLM API calls
import openai
import anthropic
import os
import json
import time
import requests
from datetime import datetime
from dotenv import load_dotenv


In [193]:
# Set up LLM clients
load_dotenv(override=True)

clients = {}
models_to_test = []

# OpenAI setup
openai_key = os.getenv("OPENAI_API_KEY")
if openai_key:
    try:
        clients['openai'] = openai.OpenAI(api_key=openai_key)
        models_to_test.append("openai")
    except Exception as e:
        print(f"OpenAI setup failed: {e}")

# Anthropic setup
anthropic_key = os.getenv("ANTHROPIC_API_KEY")
if anthropic_key:
    try:
        clients['anthropic'] = anthropic.Anthropic(api_key=anthropic_key)
        models_to_test.append("anthropic")
    except Exception as e:
        print(f"Anthropic setup failed: {e}")

# Mistral setup
mistral_key = os.getenv("MISTRAL_API_KEY")
if mistral_key:
    clients['mistral'] = mistral_key
    models_to_test.append("mistral")

print(f"Available models: {models_to_test}")

# Test API connectivity and remove failed models
working_models = []
for model in models_to_test:
    try:
        if model == "openai":
            # Test with a simple call
            test_response = clients['openai'].chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": "test"}],
                max_tokens=5
            )
            working_models.append(model)
        elif model == "anthropic":
            # Test with a simple call
            test_response = clients['anthropic'].messages.create(
                model="claude-sonnet-4-20250514",
                messages=[{"role": "user", "content": "test"}],
                max_tokens=5
            )
            working_models.append(model)
        elif model == "mistral":
            # Test with a simple call
            test_response = requests.post(
                "https://api.mistral.ai/v1/chat/completions",
                headers={"Authorization": f"Bearer {clients['mistral']}"},
                json={
                    "model": "mistral-large-latest",
                    "messages": [{"role": "user", "content": "test"}],
                    "max_tokens": 5
                }
            )
            if test_response.status_code == 200:
                working_models.append(model)
    except Exception as e:
        print(f"Model {model} failed connectivity test: {str(e)[:100]}...")

models_to_test = working_models
print(f"Working models: {models_to_test}")


Available models: ['openai', 'anthropic', 'mistral']
Working models: ['openai', 'anthropic', 'mistral']


In [194]:
# Comprehensive energy and performance tracking
def track_energy(model_name, prompt, api_call_func):
    start_time = time.time()
    try:
        response_data = api_call_func(prompt)
        response_content = response_data.get("content", "")
        
        # Extract comprehensive token data
        usage_data = response_data.get("usage", {})
        if usage_data:
            input_tokens = usage_data.get("prompt_tokens", usage_data.get("input_tokens", 0))
            output_tokens = usage_data.get("completion_tokens", usage_data.get("output_tokens", 0))
            total_tokens = usage_data.get("total_tokens", input_tokens + output_tokens)
        else:
            input_tokens = output_tokens = total_tokens = 0
        
        duration = time.time() - start_time
        
        # Comprehensive energy calculations
        energy_per_token = {
            "openai": 0.0000012,
            "anthropic": 0.0000015,  
            "mistral": 0.0000010
        }
        
        server_energy_kwh = total_tokens * energy_per_token.get(model_name, 0.000001)
        co2_emissions = server_energy_kwh * 0.4  # kg CO2 per kWh
        
        # Detailed cost calculations
        cost_rates = {
            "openai": {"input": 0.00015, "output": 0.0006},
            "anthropic": {"input": 0.003, "output": 0.015},
            "mistral": {"input": 0.0002, "output": 0.0002}
        }
        
        rates = cost_rates.get(model_name, {"input": 0, "output": 0})
        input_cost = (input_tokens / 1000) * rates["input"]
        output_cost = (output_tokens / 1000) * rates["output"]
        total_cost = input_cost + output_cost
        
        # Calculate efficiency metrics
        tokens_per_second = total_tokens / duration if duration > 0 else 0
        energy_efficiency = server_energy_kwh / total_tokens if total_tokens > 0 else 0
        cost_efficiency = total_cost / total_tokens if total_tokens > 0 else 0
        
        # Comprehensive result with all possible data
        return {
            # Core identification
            "model": model_name,
            "prompt": prompt,
            "response": response_content,
            "timestamp": datetime.now().isoformat(),
            "success": True,
            
            # Performance metrics
            "duration_seconds": round(duration, 4),
            "tokens_per_second": round(tokens_per_second, 2),
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "token_ratio": round(output_tokens / input_tokens, 3) if input_tokens > 0 else 0,
            
            # Energy metrics
            "server_energy_kwh": round(server_energy_kwh, 8),
            "server_emissions_kg_co2": round(co2_emissions, 8),
            "energy_per_token": round(energy_efficiency, 10),
            "energy_per_second": round(server_energy_kwh / duration, 8) if duration > 0 else 0,
            
            # Cost metrics
            "total_cost_usd": round(total_cost, 6),
            "input_cost_usd": round(input_cost, 6),
            "output_cost_usd": round(output_cost, 6),
            "cost_per_token": round(cost_efficiency, 8),
            "cost_per_second": round(total_cost / duration, 6) if duration > 0 else 0,
            
            # API metadata
            "api_model": response_data.get("model", None),
            "api_response_id": response_data.get("id", None),
            "api_created": response_data.get("created", None),
            "api_finish_reason": response_data.get("finish_reason", None),
            "api_object": response_data.get("object", None),
            "api_system_fingerprint": response_data.get("system_fingerprint", None),
            
            # Response quality metrics
            "response_length": len(response_content),
            "response_words": len(response_content.split()),
            "response_sentences": len([s for s in response_content.split('.') if s.strip()]),
            
            # Usage details for analysis
            "usage_details": usage_data
        }
        
    except Exception as e:
        return {
            "model": model_name,
            "prompt": prompt,
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
            "success": False,
            "duration_seconds": round(time.time() - start_time, 4)
        }


In [195]:
# Make API calls to each model with full response capture
def call_openai(prompt):
    response = clients['openai'].chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0.7
    )
    return {
        "content": response.choices[0].message.content,
        "usage": response.usage.__dict__ if hasattr(response, 'usage') else None,
        "model": response.model,
        "id": response.id,
        "created": response.created,
        "finish_reason": response.choices[0].finish_reason,
        "system_fingerprint": getattr(response, 'system_fingerprint', None)
    }

def call_anthropic(prompt):
    response = clients['anthropic'].messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=100,
        messages=[{"role": "user", "content": prompt}]
    )
    return {
        "content": response.content[0].text,
        "usage": response.usage.__dict__ if hasattr(response, 'usage') else None,
        "model": response.model,
        "id": response.id,
        "role": response.role,
        "stop_reason": response.stop_reason,
        "stop_sequence": getattr(response, 'stop_sequence', None),
        "type": getattr(response, 'type', None)
    }

def call_mistral(prompt):
    api_key = clients['mistral']
    print(f"Using API key: {api_key[:10]}...")
    
    response = requests.post(
        "https://api.mistral.ai/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json={
            "model": "mistral-large-latest",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 100,
            "temperature": 0.7
        }
    )
    
    print(f"Response status: {response.status_code}")
    
    # Handle response with proper encoding
    try:
        resp_json = response.json()
        print(f"Response keys: {list(resp_json.keys())}")
    except Exception as e:
        print(f"JSON decode error: {e}")
        print(f"Raw response: {response.text[:200]}...")
        raise Exception(f"Failed to decode response: {e}")
    
    # Handle potential API errors
    if "error" in resp_json:
        raise Exception(f"API Error: {resp_json['error']}")
    
    # Check if response has expected structure
    if "choices" not in resp_json:
        raise Exception(f"Unexpected response format: {resp_json}")
    
    return {
        "content": resp_json["choices"][0]["message"]["content"],
        "usage": resp_json.get("usage", None),
        "model": resp_json.get("model", None),
        "id": resp_json.get("id", None),
        "created": resp_json.get("created", None),
        "finish_reason": resp_json["choices"][0].get("finish_reason", None),
        "object": resp_json.get("object", None),
        "system_fingerprint": resp_json.get("system_fingerprint", None)
    }


In [196]:
# Main testing function - processes unprocessed prompts with working models only
def run_energy_tests(max_prompts=2):
    global data, existing_energy
    
    # Get unprocessed prompts
    unprocessed = data[data['processed'] == 0].head(max_prompts)
    if len(unprocessed) == 0:
        print("All prompts already processed")
        return
    
    if not models_to_test:
        print("No working models available")
        return
    
    print(f"Processing {len(unprocessed)} prompts with {len(models_to_test)} working models")
    
    new_prompt_results = []
    
    for idx, row in unprocessed.iterrows():
        prompt = row['prompt_text']
        
        # Create prompt entry with all model results
        prompt_entry = {
            "prompt_text": prompt,
            "timestamp": datetime.now().isoformat(),
            "models": {}
        }
        
        prompt_success = True
        
        for model_name in models_to_test:
            try:
                if model_name == "openai":
                    result = track_energy(model_name, prompt, call_openai)
                elif model_name == "anthropic":
                    result = track_energy(model_name, prompt, call_anthropic)
                elif model_name == "mistral":
                    result = track_energy(model_name, prompt, call_mistral)
                
                # Store model result within prompt entry
                prompt_entry["models"][model_name] = result
                
                if result['success']:
                    print(f"{model_name}: {result['total_tokens']} tokens, {result['server_energy_kwh']:.6f} kWh")
                else:
                    print(f"{model_name}: Failed - {result.get('error', 'Unknown error')[:50]}...")
                    prompt_success = False
                
                time.sleep(0.5)  # Rate limiting
                
            except Exception as e:
                error_msg = f"{model_name}: Exception - {str(e)[:50]}..."
                print(error_msg)
                prompt_entry["models"][model_name] = {
                    "model": model_name,
                    "prompt": prompt,
                    "error": str(e),
                    "timestamp": datetime.now().isoformat(),
                    "success": False
                }
                prompt_success = False
                
                # Stop processing if there's a critical error
                if "quota" in str(e).lower() or "billing" in str(e).lower() or "credit" in str(e).lower():
                    print(f"CRITICAL ERROR: {error_msg}")
                    print("Stopping processing due to billing/quota issue. Please fix the problem and restart.")
                    return None
        
        new_prompt_results.append(prompt_entry)
        
        # Only mark as processed if at least one model succeeded
        if prompt_success:
            data.loc[idx, 'processed'] = 1
            print(f"Prompt {idx} marked as processed")
        else:
            print(f"Prompt {idx} failed - not marking as processed")
    
    # Save updated sample.json with processed status
    data.to_json("data/sample.json", orient='records', indent=2)
    print("Updated sample.json with processed status")
    
    # Append new results to existing data
    all_results = existing_energy + new_prompt_results
    
    # Clean data for JSON serialization
    def clean_for_json(data):
        if isinstance(data, dict):
            return {k: clean_for_json(v) for k, v in data.items() if v is not None}
        elif isinstance(data, list):
            return [clean_for_json(item) for item in data]
        elif hasattr(data, '__dict__'):
            # Convert objects to dict, skip non-serializable attributes
            try:
                return clean_for_json(data.__dict__)
            except:
                return str(data)
        else:
            return data
    
    # Clean and save updated energy data
    cleaned_results = clean_for_json(all_results)
    with open("data/energy.json", "w") as f:
        json.dump(cleaned_results, f, indent=2)
    
    successful_models = sum(len([m for m in prompt["models"].values() if m.get('success', False)]) for prompt in new_prompt_results)
    total_models = sum(len(prompt["models"]) for prompt in new_prompt_results)
    print(f"Completed: {successful_models} successful model calls, {total_models} total, {len(all_results)} prompts")
    
    return all_results

# Run the tests
if models_to_test:
    all_results = run_energy_tests(max_prompts=2)
else:
    print("No working models available for testing")


Processing 2 prompts with 3 working models
openai: 140 tokens, 0.000168 kWh
anthropic: 143 tokens, 0.000215 kWh
Using API key: bFOrSsqJFp...
Response status: 200
Response keys: ['id', 'created', 'model', 'usage', 'object', 'choices']
mistral: 138 tokens, 0.000138 kWh
Prompt 4 marked as processed
openai: 164 tokens, 0.000197 kWh
anthropic: 166 tokens, 0.000249 kWh
Using API key: bFOrSsqJFp...
Response status: 200
Response keys: ['id', 'created', 'model', 'usage', 'object', 'choices']
mistral: 162 tokens, 0.000162 kWh
Prompt 5 marked as processed
Updated sample.json with processed status
Completed: 6 successful model calls, 6 total, 2 prompts


In [197]:
# Analysis and Summary
if 'all_results' in locals() and all_results:
    # Flatten data for analysis
    flattened_data = []
    for prompt_entry in all_results:
        for model_name, model_result in prompt_entry.get("models", {}).items():
            model_result["prompt_text"] = prompt_entry["prompt_text"]
            flattened_data.append(model_result)
    
    if flattened_data:
        df = pd.DataFrame(flattened_data)
        successful = df[df['success'] == True]
        
        if len(successful) > 0:
            print(f"Successful calls: {len(successful)}, Failed: {len(df) - len(successful)}")
            
            # Summary by model
            for model in successful['model'].unique():
                model_data = successful[successful['model'] == model]
                total_energy = model_data['server_energy_kwh'].sum()
                total_cost = model_data['total_cost_usd'].sum()
                
                print(f"{model}: {total_energy:.6f} kWh, ${total_cost:.4f}, {len(model_data)} calls")
            
            # Overall totals
            total_energy = successful['server_energy_kwh'].sum()
            total_cost = successful['total_cost_usd'].sum()
            total_emissions = successful['server_emissions_kg_co2'].sum()
            
            print(f"Total: {total_energy:.6f} kWh, ${total_cost:.4f}, {total_emissions:.6f} kg CO2")
        else:
            print("No successful API calls")
    else:
        print("No model results found")
else:
    print("No results available")


Successful calls: 6, Failed: 0
openai: 0.000365 kWh, $0.0001, 2 calls
anthropic: 0.000463 kWh, $0.0033, 2 calls
mistral: 0.000300 kWh, $0.0001, 2 calls
Total: 0.001128 kWh, $0.0035, 0.000451 kg CO2


In [198]:
# Clean up failed entries and reset for fresh start
print("Cleaning up failed entries...")

# Load current data
with open("data/energy.json", "r") as f:
    current_data = json.load(f)

# Clean up prompt entries - keep only prompts with at least one successful model
cleaned_data = []
for prompt_entry in current_data:
    if "models" in prompt_entry:
        # Keep only successful model results within each prompt
        successful_models = {}
        for model_name, model_result in prompt_entry["models"].items():
            if model_result.get('success', False):
                successful_models[model_name] = model_result
        
        # Only keep prompt if it has at least one successful model
        if successful_models:
            prompt_entry["models"] = successful_models
            cleaned_data.append(prompt_entry)

print(f"Kept {len(cleaned_data)} prompts with successful results")

# Save cleaned data
with open("data/energy.json", "w") as f:
    json.dump(cleaned_data, f, indent=2)

# Reset processed status for fresh start
data['processed'] = 0
data.to_json("data/sample.json", orient='records', indent=2)
print("Reset processed status - ready for fresh start")


Cleaning up failed entries...
Kept 2 prompts with successful results
Reset processed status - ready for fresh start


In [199]:
energy_data = pd.read_json("data/energy.json")

In [200]:
# View the new data structure
if 'all_results' in locals() and all_results:
    print("New data structure (prompt-centric):")
    print(f"Total prompts: {len(all_results)}")
    
    for i, prompt_entry in enumerate(all_results[:2]):  # Show first 2 prompts
        print(f"\nPrompt {i+1}:")
        print(f"Text: {prompt_entry['prompt_text'][:50]}...")
        print(f"Models tested: {list(prompt_entry['models'].keys())}")
        
        for model_name, model_result in prompt_entry['models'].items():
            if model_result.get('success', False):
                print(f"  {model_name}: {model_result['total_tokens']} tokens, {model_result['server_energy_kwh']:.6f} kWh")
            else:
                print(f"  {model_name}: Failed - {model_result.get('error', 'Unknown')[:30]}...")
else:
    print("No results to display")


New data structure (prompt-centric):
Total prompts: 2

Prompt 1:
Text: What is the type of the variables in the following...
Models tested: ['openai', 'anthropic', 'mistral']
  openai: 140 tokens, 0.000168 kWh
  anthropic: 143 tokens, 0.000215 kWh
  mistral: 138 tokens, 0.000138 kWh

Prompt 2:
Text: I have 1000 documents to download from a website. ...
Models tested: ['openai', 'anthropic', 'mistral']
  openai: 164 tokens, 0.000197 kWh
  anthropic: 166 tokens, 0.000249 kWh
  mistral: 162 tokens, 0.000162 kWh


In [201]:
# Simple API Status Check
def check_api_status():
    print("Checking API Status...")
    
    for model in ['openai', 'anthropic', 'mistral']:
        if model in models_to_test:
            print(f"{model.upper()}: Ready")
        else:
            print(f"{model.upper()}: Not available")
    
    print(f"Working models: {len(models_to_test)}")
    return len(models_to_test) > 0

# Quick status check
check_api_status()


Checking API Status...
OPENAI: Ready
ANTHROPIC: Ready
MISTRAL: Ready
Working models: 3


True

In [202]:
# Comprehensive Data Analysis
def analyze_energy_data():
    if 'all_results' in locals() and all_results:
        print("Comprehensive Energy Analysis")
        print("=" * 50)
        
        # Flatten data for analysis
        flattened_data = []
        for prompt_entry in all_results:
            for model_name, model_result in prompt_entry.get("models", {}).items():
                model_result["prompt_text"] = prompt_entry["prompt_text"]
                flattened_data.append(model_result)
        
        if flattened_data:
            df = pd.DataFrame(flattened_data)
            successful = df[df['success'] == True]
            
            if len(successful) > 0:
                print(f"Successful calls: {len(successful)}")
                print(f"Failed calls: {len(df) - len(successful)}")
                print()
                
                # Model comparison
                for model in successful['model'].unique():
                    model_data = successful[successful['model'] == model]
                    
                    total_energy = model_data['server_energy_kwh'].sum()
                    total_cost = model_data['total_cost_usd'].sum()
                    avg_tokens = model_data['total_tokens'].mean()
                    avg_speed = model_data['tokens_per_second'].mean()
                    avg_efficiency = model_data['energy_per_token'].mean()
                    
                    print(f"{model.upper()}:")
                    print(f"  Energy: {total_energy:.6f} kWh")
                    print(f"  Cost: ${total_cost:.4f}")
                    print(f"  Avg Tokens: {avg_tokens:.1f}")
                    print(f"  Avg Speed: {avg_speed:.1f} tokens/sec")
                    print(f"  Efficiency: {avg_efficiency:.8f} kWh/token")
                    print(f"  Calls: {len(model_data)}")
                    print()
                
                # Overall summary
                total_energy = successful['server_energy_kwh'].sum()
                total_cost = successful['total_cost_usd'].sum()
                total_emissions = successful['server_emissions_kg_co2'].sum()
                total_tokens = successful['total_tokens'].sum()
                
                print("OVERALL SUMMARY:")
                print(f"  Total Energy: {total_energy:.6f} kWh")
                print(f"  Total Cost: ${total_cost:.4f}")
                print(f"  Total CO2: {total_emissions:.6f} kg")
                print(f"  Total Tokens: {total_tokens:,}")
                print(f"  Avg Energy/Token: {total_energy/total_tokens:.8f} kWh")
                print(f"  Avg Cost/Token: ${total_cost/total_tokens:.8f}")
            else:
                print("No successful API calls to analyze")
        else:
            print("No model results found")
    else:
        print("No results available")

# Run comprehensive analysis
analyze_energy_data()


No results available


In [203]:
energy_data.head()

Unnamed: 0,prompt_text,timestamp,models
0,What is the type of the variables in the follo...,2025-10-19 03:35:04.830390,"{'openai': {'model': 'openai', 'prompt': 'What..."
1,I have 1000 documents to download from a websi...,2025-10-19 03:35:18.795042,"{'openai': {'model': 'openai', 'prompt': 'I ha..."
