# Energy Estimation Model for LLMs

This notebook builds a robust predictive model to estimate energy consumption (Wh) based on prompt characteristics and API response metrics.

In [44]:
# Import required libraries for energy prediction modeling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Read the JSON-lines energy log and parse its timestamp column as datetimes
df = pd.read_json(path_or_buf='data/energy.jsonl', lines=True)
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

In [46]:
# Extract comprehensive features from prompts and API responses
def extract_prompt_features(df):
    """Return a copy of ``df`` with derived prompt, response and throughput columns.

    Reads the ``prompt``, ``response``, ``total_tokens``, ``duration``,
    ``energy_consumed_wh`` and ``model`` columns. The input frame is not
    modified; all new columns are written to a copy.

    Added columns:
        prompt_length, prompt_word_count, prompt_sentence_count,
        prompt_uppercase_ratio, prompt_digit_ratio, prompt_special_char_ratio,
        prompt_question_marks, prompt_exclamation_marks, is_english,
        response_length, response_word_count, tokens_per_second,
        energy_per_token, energy_per_word, model_encoded.
    """
    out = df.copy()
    prompt = out['prompt'].str

    # Size of the prompt in characters, whitespace-separated words and
    # runs of sentence-ending punctuation.
    out['prompt_length'] = prompt.len()
    out['prompt_word_count'] = prompt.split().str.len()
    out['prompt_sentence_count'] = prompt.count(r'[.!?]+')

    # The three character-class ratios share one denominator; an empty
    # prompt divides by 1 instead of 0.
    safe_length = out['prompt_length'].replace(0, 1)
    ratio_patterns = (
        ('prompt_uppercase_ratio', r'[A-Z]'),
        ('prompt_digit_ratio', r'[0-9]'),
        ('prompt_special_char_ratio', r'[^a-zA-Z0-9\s]'),
    )
    for column, pattern in ratio_patterns:
        out[column] = prompt.count(pattern) / safe_length

    out['prompt_question_marks'] = prompt.count(r'\?')
    out['prompt_exclamation_marks'] = prompt.count(r'\!')

    # 1 when the whole prompt matches the pattern below, otherwise 0.
    matches_pattern = prompt.contains(r'^[a-zA-Z\s\d\W]*$', regex=True)
    out['is_english'] = matches_pattern.astype(int)

    response = out['response'].str
    out['response_length'] = response.len()
    out['response_word_count'] = response.split().str.len()

    # Throughput and per-unit energy; zero denominators are replaced by 1.
    energy = out['energy_consumed_wh']
    total_tokens = out['total_tokens']
    out['tokens_per_second'] = total_tokens / out['duration'].replace(0, 1)
    out['energy_per_token'] = energy / total_tokens.replace(0, 1)
    out['energy_per_word'] = energy / out['prompt_word_count'].replace(0, 1)

    # Integer code per model name, fitted on this frame's own values.
    out['model_encoded'] = LabelEncoder().fit_transform(out['model'])

    return out


df = extract_prompt_features(df)

In [47]:
# Select the model inputs, then keep only complete rows with positive measured energy
feature_columns = [
    # prompt text statistics
    'prompt_length',
    'prompt_word_count',
    'prompt_sentence_count',
    'prompt_uppercase_ratio',
    'prompt_digit_ratio',
    'prompt_special_char_ratio',
    'prompt_question_marks',
    'prompt_exclamation_marks',
    'is_english',
    # response text statistics
    'response_length',
    'response_word_count',
    # token counts
    'prompt_tokens',
    'completion_tokens',
    'total_tokens',
    # timing
    'duration',
    'time_to_first_token',
    'tokens_per_second',
    # encoded model identifier
    'model_encoded',
]

# Drop rows with any missing feature/target value before filtering on the target.
df_clean = df[[*feature_columns, 'energy_consumed_wh']].dropna()
df_clean = df_clean.loc[df_clean['energy_consumed_wh'] > 0]

X, y = df_clean[feature_columns], df_clean['energy_consumed_wh']

In [48]:
# Hold out 20% of the rows (stratified by encoded model) and scale both parts
# with a RobustScaler fitted on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df_clean['model_encoded'],
)

scaler = RobustScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [49]:
# Fit every candidate regressor and record its metrics on the held-out split
forest_settings = dict(n_estimators=200, max_depth=20, min_samples_split=5, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5),
    'Random Forest': RandomForestRegressor(**forest_settings),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42
    ),
    'Extra Trees': ExtraTreesRegressor(**forest_settings),
}

results = {}
trained_models = {}

# These models are fit on the scaled matrices; every other model gets the raw features.
scaled_input_models = ('Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Elastic Net')

for name, model in models.items():
    if name in scaled_input_models:
        train_matrix, test_matrix = X_train_scaled, X_test_scaled
    else:
        train_matrix, test_matrix = X_train, X_test

    model.fit(train_matrix, y_train)
    y_pred = model.predict(test_matrix)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    results[name] = dict(MSE=mse, RMSE=rmse, R2=r2, MAE=mae, MAPE=mape)
    trained_models[name] = model

In [50]:
# Rank the models by test-set R2 (best first) and keep the top one
results_df = pd.DataFrame(results).T.sort_values(by='R2', ascending=False)

best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

# Only the tree ensembles get a feature-importance table.
tree_model_names = ('Random Forest', 'Gradient Boosting', 'Extra Trees')
if best_model_name in tree_model_names:
    importance_table = pd.DataFrame(
        {'feature': feature_columns, 'importance': best_model.feature_importances_}
    )
    feature_importance = importance_table.sort_values('importance', ascending=False)

In [51]:
# Create energy prediction function and generate predictions
def predict_energy(prompt_text, model_name, expected_tokens=50, expected_duration=1.0):
    """Estimate the energy (Wh) of one request with the selected best model.

    The prompt statistics are computed with the same regular expressions that
    ``extract_prompt_features`` applies to the training data, so the model is
    queried with features defined the same way as the ones it was fitted on.
    Response-side features are not known in advance and are approximated from
    ``expected_tokens`` and ``expected_duration`` (see the inline comments).

    Args:
        prompt_text: The prompt string.
        model_name: Name of the LLM. Names missing from the built-in encoding
            table fall back to code 0.
        expected_tokens: Expected number of completion tokens.
        expected_duration: Expected request duration, in the same unit as the
            ``duration`` column used for training.

    Returns:
        The predicted energy, clipped so that it is never negative.

    Relies on the module-level ``best_model_name``, ``best_model`` and
    ``scaler`` created by the earlier cells.
    """
    import re  # local import: the notebook's import cell does not load `re`

    prompt_length = len(prompt_text)
    safe_length = max(prompt_length, 1)  # empty prompt: divide by 1, as in training
    prompt_word_count = len(prompt_text.split())

    # Same patterns as the training-time feature extraction: a run such as
    # "..." is one sentence end, and the character classes are ASCII-only.
    prompt_sentence_count = len(re.findall(r'[.!?]+', prompt_text))
    prompt_uppercase_ratio = len(re.findall(r'[A-Z]', prompt_text)) / safe_length
    prompt_digit_ratio = len(re.findall(r'[0-9]', prompt_text)) / safe_length
    prompt_special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\s]', prompt_text)) / safe_length
    prompt_question_marks = prompt_text.count('?')
    prompt_exclamation_marks = prompt_text.count('!')
    is_english = 1 if re.search(r'^[a-zA-Z\s\d\W]*$', prompt_text) else 0

    # Must agree with the integer codes LabelEncoder assigned during training
    # (it numbers the sorted class labels). Unknown names silently map to 0.
    model_encoding = {
        'gpt-4o-mini-2024-07-18': 0,
        'llama-3.1-8b-instant': 1,
        'mistral-large-latest': 2
    }
    model_encoded = model_encoding.get(model_name, 0)

    # Heuristic stand-ins for values only known after the request has run.
    estimated_prompt_tokens = prompt_word_count * 1.3
    estimated_total_tokens = estimated_prompt_tokens + expected_tokens
    # Training divides by the duration with 0 replaced by 1; do the same here
    # and also guard against non-positive estimates.
    duration_divisor = expected_duration if expected_duration > 0 else 1

    # Order must match `feature_columns`.
    features = np.array([
        prompt_length, prompt_word_count, prompt_sentence_count,
        prompt_uppercase_ratio, prompt_digit_ratio, prompt_special_char_ratio,
        prompt_question_marks, prompt_exclamation_marks, is_english,
        expected_tokens * 10,                       # response_length
        expected_tokens * 2,                        # response_word_count
        estimated_prompt_tokens,                    # prompt_tokens
        expected_tokens,                            # completion_tokens
        estimated_total_tokens,                     # total_tokens
        expected_duration,                          # duration
        expected_duration * 0.8,                    # time_to_first_token
        estimated_total_tokens / duration_divisor,  # tokens_per_second
        model_encoded
    ]).reshape(1, -1)

    # The linear models were fitted on scaled inputs, the tree ensembles on raw ones.
    if best_model_name in ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Elastic Net']:
        features_scaled = scaler.transform(features)
        prediction = best_model.predict(features_scaled)[0]
    else:
        prediction = best_model.predict(features)[0]

    return max(0, prediction)

# Score every logged request with the selected model and export the results
predictions = []
for _, row in df.iterrows():
    # predict_energy uses its third argument as the completion-token feature
    # and adds its own prompt-token estimate to obtain the total, so it must
    # receive the completion count; passing total_tokens would count the
    # prompt tokens twice.
    predicted_energy = predict_energy(
        row['prompt'], row['model'], row['completion_tokens'], row['duration']
    )

    # One output record per request: the logged fields plus the rounded prediction.
    prediction_record = {
        'prompt': row['prompt'],
        'model': row['model'],
        'timestamp': row['timestamp'],
        'duration': row['duration'],
        'time_to_first_token': row['time_to_first_token'],
        'prompt_tokens': row['prompt_tokens'],
        'completion_tokens': row['completion_tokens'],
        'total_tokens': row['total_tokens'],
        'tokens_per_second': row['tokens_per_second'],
        'energy_consumed_wh': row['energy_consumed_wh'],
        'predicted_energy_wh': round(predicted_energy, 6),
        'response': row['response']
    }

    predictions.append(prediction_record)

# Written as JSON lines, one record per row.
predictions_df = pd.DataFrame(predictions)
predictions_df.to_json('data/predictions.jsonl', orient='records', lines=True)