# CSCN8010 Final Project - Energy Estimator

## Group #8
* Eris Leksi
* Erica Holden
* Reham Abuarquob

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from util.Vectorizer import Vectorizer

# Read the measurements and, in the same chain, discard the two 'Unnamed'
# columns (presumably index columns left over from earlier CSV exports —
# verify against the data file).
df = (
    pd.read_csv('./data/alpaca_llama3_70b_server.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

# Regression target used by the modelling cell below.
y = df['energy_consumption_llm']


In [None]:
# Tunable settings for this cell, named so a reader can change them in one place.
TEST_FRACTION = 0.2    # share of samples held out for evaluation
RANDOM_SEED = 42       # makes the train/test split reproducible
N_ESTIMATORS = 100     # number of boosting rounds
LEARNING_RATE = 0.1    # shrinkage applied to each boosting round
REL_TOLERANCE = 0.1    # a prediction is a "hit" if within 10% of the actual value

# Build the project vectorizer from every prompt, then embed each prompt.
# NOTE(review): the vectorizer is constructed from ALL prompts before the
# train/test split. If Vectorizer learns anything from its input, test prompts
# leak into the features — confirm against util/Vectorizer.
vectorizer = Vectorizer(df['prompt'].tolist())

df['prompt_vector'] = df['prompt'].apply(vectorizer.sentence_vector)
X_vec = np.vstack(df['prompt_vector'].values)  # shape: (num_samples, vector_size)

# Hold out a fixed-seed test set and fit the gradient-boosted regressor.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=TEST_FRACTION, random_state=RANDOM_SEED
)
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# "Accuracy" here is the fraction of predictions whose absolute error is below
# REL_TOLERANCE of the actual value. The comparison is written as
# |error| < tol * |actual| instead of |error| / actual < tol so that a zero
# target cannot cause a division by zero and a negative target cannot make
# the ratio negative (which would always count as a hit).
y_true = np.asarray(y_test, dtype=float)
abs_error = np.abs(y_true - y_pred)
accuracy = (abs_error < REL_TOLERANCE * np.abs(y_true)).mean()

print(f'Mean Squared Error: {mse}')
# model.score returns the coefficient of determination (R^2) on the held-out set.
print(f'R^2 Score: {model.score(X_test, y_test)}')
print(f'Accuracy: {accuracy}')


0    [-0.21221407, 0.20534322, 0.036475558, -0.0463...
1    [-0.24852364, 0.25339648, 0.03967554, -0.05737...
2    [-0.3752487, 0.36678365, 0.06098565, -0.085890...
3    [-0.14536364, 0.13336667, 0.023797568, -0.0336...
4    [-0.32648405, 0.32264075, 0.049987968, -0.0756...
Name: prompt_vector, dtype: object
Mean Squared Error: 3.4114237173773934e-06
R^2 Score: 0.030599928164560808
Accuracy: 0.08737864077669903


In [None]:
# Save the fitted model and the vectorizer for future use.
# - Both files are written relative to the current working directory.
# - The vectorizer is an instance of util.Vectorizer; pickle stores classes by
#   reference, so that module must be importable wherever
#   text_vectorizer.pkl is loaded.
# - Only load these .pkl files from a trusted source: unpickling can execute
#   arbitrary code.
joblib.dump(model, 'xgboost_model.pkl')
joblib.dump(vectorizer, 'text_vectorizer.pkl')