# CSCN8010 Final Project - Energy Estimator

## Group #8
* Eris Leksi
* Erica Holden
* Reham Abuarquob

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the server measurements and discard the two 'Unnamed' columns in one step.
# NOTE(review): these look like index columns left over from earlier CSV exports - confirm.
df = (
    pd.read_csv('../data/alpaca_llama3_70b_server.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

In [4]:
# Train a random-forest regressor on the energy data and report hold-out metrics.

# Target: the total energy consumption column. Features: every other column of df.
# NOTE(review): if any remaining column is itself an energy measurement that feeds
# into the total, the near-perfect R^2 would be target leakage - confirm the feature list.
y = df['energy_consumption_llm_total']
X = df.drop(columns=['energy_consumption_llm_total'])

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): a free-text column (e.g. a prompt) would get one dummy column per
# distinct value - confirm that is intended.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; fixed seeds keep the run reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
# "Accuracy" here is the share of test predictions within 10% of the actual value.
# The error is compared against 0.1 * |actual| rather than divided by the actual
# value, so a zero target cannot cause a division by zero and a negative target
# cannot make every prediction count as a hit.
accuracy = (np.abs(y_test - y_pred) < 0.1 * np.abs(y_test)).mean()
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {model.score(X_test, y_test)}')
print(f'Accuracy: {accuracy}')


Mean Squared Error: 8.604060410085752e-10
R^2 Score: 0.9997555045203818
Accuracy: 0.9854368932038835


In [5]:
# Save the fitted model for future use.
# Only the estimator is persisted: the one-hot encoding is a plain pd.get_dummies
# call, not a fitted object, so code that loads this file must rebuild the dummy
# columns itself and align them with model.feature_names_in_.
import os
import joblib

model_path = '../models/energy_estimator_model.pkl'
# Make sure the target directory exists so the dump also works on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['../models/energy_estimator_model.pkl']

# How to use the saved model

import pandas as pd
import joblib

## 1. Load the saved model
## (joblib files are pickles: only load model files you created or trust.)
model = joblib.load('../models/energy_estimator_model.pkl')

## 2. Prepare your new prompt as a DataFrame with the same columns as your original X
## Example: Suppose your original X had columns ['prompt', 'model_name', ...]
new_data = pd.DataFrame([{
    'prompt': "Your new prompt here",
    'model_name': "llama3:70b",
    # ...add other required columns with appropriate values...
}])

## 3. One-hot encode the new rows
## Do NOT pass drop_first=True here. With only one (or a few) rows, the "first"
## level of each categorical column is the value you actually supplied, so
## drop_first would discard it and every dummy would end up 0 after alignment.
## Encode every level instead and let step 4 keep only the columns the model knows.
new_data_encoded = pd.get_dummies(new_data)

## 4. Align columns with the training data
## reindex keeps the training columns in training order, fills dummies that are
## absent from the new rows with 0, and drops columns the model never saw
## (the baseline level removed by drop_first at training time, or an unseen value;
## both are then represented by all-zero dummies).
new_data_encoded = new_data_encoded.reindex(columns=model.feature_names_in_, fill_value=0)

## 5. Predict
prediction = model.predict(new_data_encoded)
print(f'Predicted energy_consumption_llm_total: {prediction[0]}')