In [6]:
%pip install -r requirements.txt

Collecting pandas==2.0.3
  Using cached pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl (11.8 MB)
Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl (19.8 MB)
Collecting scikit-learn==1.3.2
  Using cached scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl (10.2 MB)
Collecting pyvis==0.3.2
  Using cached pyvis-0.3.2-py3-none-any.whl (756 kB)
Collecting networkx==3.1
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting matplotlib==3.7.2
  Using cached matplotlib-3.7.2-cp39-cp39-macosx_10_12_x86_64.whl (7.4 MB)
Collecting geopandas==0.14.4
  Using cached geopandas-0.14.4-py3-none-any.whl (1.1 MB)
Collecting fiona==1.9.6
  Using cached fiona-1.9.6-cp39-cp39-macosx_10_15_x86_64.whl (18.7 MB)
Collecting streamlit_folium==0.23.2
  Using cached streamlit_folium-0.23.2-py3-none-any.whl (328 kB)
Collecting seaborn==0.13.2
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting lightgbm==4.5.0
  Using cached lightgbm-4.5.0-py3-none-macosx_1

In [4]:
import os 
import pandas as pd
import numpy as np
import pickle
import json
import zipfile
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from datetime import datetime
from pages.visualizations import extract_zip


In [8]:

def compress_to_zip(filename, df):
    csv_path = f"./data/{filename}.csv"
    zip_path = f"./data/{filename}.zip"
    df.to_csv(csv_path, index=True) 
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(csv_path, os.path.basename(csv_path))

    print(f"DataFrame saved and compressed into: {zip_path}")
    os.remove(csv_path)  


In [9]:
  
import fiona
import geopandas as gpd
from shapely.geometry import shape

file_path = "./data/us-states.json"
    
    # Read the GeoJSON data using geopandas
with open(file_path, 'r') as f:
        geojson_data = json.load(f)
features = geojson_data["features"]
    
# Create a list of geometries (Polygons)
geometries = [shape(feature["geometry"]) for feature in features]

# Create a list of state names
state_names = [feature["properties"]["name"] for feature in features]

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame({'state': state_names, 'geometry': geometries})

# Calculate centroids (latitude and longitude)
gdf['centroid'] = gdf.geometry.centroid
gdf['latitude'] = gdf['centroid'].apply(lambda x: x.y)
gdf['longitude'] = gdf['centroid'].apply(lambda x: x.x)

# Extract relevant columns

state_coordinates = gdf[['state', 'latitude', 'longitude', 'geometry']]
print(state_coordinates.head())

compress_to_zip("state_coordinates",state_coordinates)


        state   latitude   longitude  \
0     Alabama  32.789907  -86.827783   
1      Alaska  64.220419 -152.542689   
2     Arizona  34.293393 -111.663296   
3    Arkansas  34.898249  -92.440920   
4  California  37.253895 -119.614389   

                                            geometry  
0  POLYGON ((-87.359 35.001, -85.607 34.985, -85....  
1  MULTIPOLYGON (((-131.602 55.118, -131.569 55.2...  
2  POLYGON ((-109.043 37.000, -109.048 31.332, -1...  
3  POLYGON ((-94.474 36.502, -90.153 36.496, -90....  
4  POLYGON ((-123.233 42.006, -122.379 42.012, -1...  
DataFrame saved and compressed into: ./data/state_coordinates.zip


In [10]:


# Hand mapped respondent to state
respondent_to_state = {
    'BANC': 'California', 'PSEI': 'California', 'SW': 'Arizona', 'WACM': 'Arizona', 'MISO': 'Michigan', 'SCEG': 'South Carolina',
    'SPA': 'Texas', 'NY': 'New York', 'GVL': 'Georgia', 'FPL': 'Florida', 'PSCO': 'Colorado', 'DUK': 'North Carolina', 
    'ISNE': 'Massachusetts', 'HST': 'Texas', 'DOPD': 'Texas', 'US48': 'North America', 'PJM': 'Pennsylvania', 'AZPS': 'Arizona', 
    'CHPD': 'Texas', 'LDWP': 'California', 'SC': 'South Carolina', 'PNM': 'New Mexico', 'FMPP': 'Florida', 'FLA': 'Florida', 
    'SCL': 'California', 'IID': 'California', 'SWPP': 'Arkansas', 'WAUW': 'Washington', 'TEX': 'Texas', 'MIDA': 'Michigan', 
    'SOCO': 'Georgia', 'NEVP': 'Nevada', 'BPAT': 'Washington', 'ERCO': 'Texas', 'NW': 'Montana', 'CAR': 'North Carolina', 
    'FPC': 'Florida', 'GCPD': 'Texas', 'AECI': 'Missouri', 'PACW': 'California', 'MIDW': 'Wisconsin', 'CPLE': 'Florida', 
    'JEA': 'Florida', 'SRP': 'Arizona', 'PGE': 'California', 'TEN': 'Tennessee', 'CAL': 'California', 'IPCO': 'Oklahoma', 
    'AVA': 'Georgia', 'SEC': 'Texas', 'CISO': 'California', 'LGEE': 'Florida', 'TAL': 'Florida', 'TEC': 'Texas', 
    'NYIS': 'New York', 'TVA': 'Tennessee', 'CPLW': 'Texas', 'TPWR': 'Texas', 'CENT': 'Texas', 'TIDC': 'Texas', 
    'SE': 'Texas', 'WALC': 'Arizona', 'PACE': 'Utah', 'EPE': 'Texas', 'TEPC': 'Texas', 'NWMT': 'Montana', 
    'NE': 'Nebraska'
}

# Load data
data = extract_zip("EIA930LoadAndForecast")

data["state"] = data["respondent"].map(respondent_to_state)

# Save the updated dataset
compress_to_zip("EIA930LoadAndForecast_with_states",data)
# Data cleaning and transformation
data['value'] = pd.to_numeric(data['value'], errors='coerce')
data['period'] = pd.to_datetime(data['period'])
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Sunday']
for day in days:
    data[day] = (data['period'].dt.day_name() == day).astype(int)
data = data.dropna().query("period.dt.year >= 2022")
print(data)


DataFrame saved and compressed into: ./data/EIA930LoadAndForecast_with_states.zip
        respondent type type_name              period     value  revision_id  \
0             BANC    D    Demand 2022-01-02 02:00:00   2091.00       302352   
1             PSEI    D    Demand 2022-01-02 02:00:00   4437.00       302352   
2               SW    D    Demand 2022-01-02 02:00:00  12142.00       302352   
3             WACM    D    Demand 2022-01-02 02:00:00   3212.00       302352   
4             MISO    D    Demand 2022-01-02 02:00:00  74428.00       302352   
...            ...  ...       ...                 ...       ...          ...   
7715101        TEN    D    Demand 2024-07-04 06:00:00  20502.36       579043   
7715102        GVL    D    Demand 2024-07-04 06:00:00    256.00       579043   
7715103       PACW    D    Demand 2024-07-04 06:00:00   2495.00       579043   
7715104       NEVP    D    Demand 2024-07-04 06:00:00   6843.00       579043   
7715105       AZPS    D    Demand 2024

In [11]:



# Mark anomalies
def mark_anomalies(data):
    data['is_zero'] = (data['value'] == 0).astype(int)
    data['is_negative'] = (data['value'] < 0).astype(int)
    
    data['is_spike'] = data.groupby(['respondent', 'type_name'])['value'].transform(lambda x: (x > x.quantile(0.999)).astype(int))
    data['is_spike'] = data['is_spike'].fillna(0)
    return data

# Impute data
def impute_data(data):
    # Mark impute column
    data['impute'] = (data['is_zero'] + data['is_negative'] + data['is_spike'] > 0).astype(int)
    
    # Split into actuals and forecast
    actuals = data[data['type_name'] == "Demand"]
    forecast = data[data['type_name'] == "Day-ahead demand forecast"]
    
    # Merge actuals with forecast
    joined = pd.merge(
        actuals,
        forecast.rename(columns={'value': 'forecast'}),
        on=['respondent', 'period'],
        how='left'
    )
    
    # Rename columns to avoid suffixes like `_x` and `_y`
    joined = joined.rename(columns={
        'impute_x': 'impute',
        'type_x': 'type',
        'type_name_x': 'type_name',
    })
    
    # Add imputed values
    joined['imputed'] = np.where(
        (joined['impute'] == 1) & ~joined['forecast'].isna(),
        joined['forecast'],
        np.where(
            (joined['impute'] == 1) & joined['forecast'].isna() & ~joined['forecast'].shift(1).isna(),
            joined['forecast'].shift(1),
            np.where(
                (joined['impute'] == 1) & joined['forecast'].isna() & joined['forecast'].shift(1).isna(),
                joined['value'].shift(1),
                joined['value']
            )
        )
    )
    
    # Return cleaned data
    return joined[['respondent', 'period', 'type', 'type_name', 'imputed']].rename(
        columns={'imputed': 'value'}
    ).drop_duplicates()
data_marked = mark_anomalies(data)
data_imputed = impute_data(data_marked)

# Save the updated dataset
compress_to_zip("data_imputed",data_imputed)

for i in range(5):
    print(f"{i+1} of 5")
    data_marked = mark_anomalies(data_marked)
    data_imputed = impute_data(data_marked)

raw_imputed = pd.merge(data, data_imputed.rename(columns={'value': 'imputed'}),
                       on=['respondent', 'type', 'type_name', 'period'], how='left')
raw_imputed['is_imputed'] = (raw_imputed['value'] != raw_imputed['imputed']).astype(int)

compress_to_zip("raw_imputed",raw_imputed)

print(raw_imputed['is_imputed'].sum())


DataFrame saved and compressed into: ./data/data_imputed.zip
1 of 5
2 of 5
3 of 5
4 of 5
5 of 5
DataFrame saved and compressed into: ./data/raw_imputed.zip
1471658


In [12]:

# Calculate MAPEs
actuals = raw_imputed[raw_imputed['type_name'] == "Demand"]
forecast = raw_imputed[raw_imputed['type_name'] == "Day-ahead demand forecast"]
joined = pd.merge(actuals, forecast[['respondent', 'period', 'value']].rename(columns={'value': 'forecast'}),
                  on=['respondent', 'period'], how='left')
joined['abs_error'] = np.abs(joined['value'] - joined['forecast']) / np.abs(joined['value'])

MAPE = joined[joined['abs_error'] != np.inf].groupby('respondent')['abs_error'].mean().reset_index(name='MAPE')
compress_to_zip("MAPE",MAPE)

# Load edges and calculate correlations
edges = extract_zip("eia_930_edges")
exclude = ["CISO", "ERCO", "SWPP", "MISO", "NYIS", "ISNE", "CAL", "PJM"]

edges = edges.merge(MAPE, left_on="node1", right_on="respondent").rename(columns={"MAPE": "MAPE_node1"})
edges = edges.merge(MAPE, left_on="node2", right_on="respondent").rename(columns={"MAPE": "MAPE_node2"})
edges['abs_diff'] = np.abs(edges['MAPE_node1'] - edges['MAPE_node2'])
edges = edges.query("~node1.isin(@exclude) & ~node2.isin(@exclude)").sort_values('abs_diff', ascending=False)
compress_to_zip("edges_with_MAPE",edges)
print(edges)

# Wide format and correlation matrix
duplicates = actuals[actuals.duplicated(subset=['period', 'respondent'], keep=False)]
if not duplicates.empty:
    print("Duplicates found in actuals before pivot:")
    print(duplicates)
    # dedup
    actuals = actuals.drop_duplicates(subset=['period', 'respondent'])



DataFrame saved and compressed into: ./data/MAPE.zip
DataFrame saved and compressed into: ./data/edges_with_MAPE.zip
    node1 node2 respondent_x  MAPE_node1 respondent_y  MAPE_node2  abs_diff
7    AECI   SPA         AECI    0.036209          SPA    1.044349  1.008140
239   SPA  AECI          SPA    1.044349         AECI    0.036209  1.008140
95   PSCO   PNM         PSCO    0.723519          PNM    0.063648  0.659871
252   PNM  PSCO          PNM    0.063648         PSCO    0.723519  0.659871
113  PSCO  WACM         PSCO    0.723519         WACM    0.142075  0.581444
..    ...   ...          ...         ...          ...         ...       ...
31   GCPD  BPAT         GCPD    0.023107         BPAT    0.020857  0.002250
127  BPAT   AVA         BPAT    0.020857          AVA    0.019132  0.001726
26    AVA  BPAT          AVA    0.019132         BPAT    0.020857  0.001726
226   TEC  FMPP          TEC    0.046723         FMPP    0.045600  0.001123
219  FMPP   TEC         FMPP    0.045600       

In [13]:

# Perform pivot operation
actuals_wide = actuals.pivot(index='period', columns='respondent', values='imputed')
correlation_matrix = actuals_wide.corr(method='pearson', min_periods=1)
correlation_matrix.to_csv("./data/correlation_matrix.csv", index=True) 
print(correlation_matrix)

respondent      AECI       AVA      AZPS      BANC      BPAT       CAL  \
respondent                                                               
AECI        1.000000  0.557731  0.308503  0.373136  0.512310  0.251463   
AVA         0.557731  1.000000  0.071045  0.356518  0.942282  0.239249   
AZPS        0.308503  0.071045  1.000000  0.815741  0.003894  0.791049   
BANC        0.373136  0.356518  0.815741  1.000000  0.293236  0.878408   
BPAT        0.512310  0.942282  0.003894  0.293236  1.000000  0.176209   
...              ...       ...       ...       ...       ...       ...   
TVA         0.797252  0.404350  0.489897  0.512612  0.357459  0.369879   
US48        0.717378  0.431817  0.695393  0.708961  0.360759  0.582142   
WACM        0.289553  0.468788  0.308211  0.338997  0.406905  0.244657   
WALC        0.124428 -0.128594  0.688994  0.570709 -0.168939  0.512349   
WAUW        0.642405  0.736001  0.338746  0.484163  0.676043  0.417565   

respondent       CAR      CENT      C

In [14]:

# Simple LDWP Model
relevant_cols = ['CISO', 'BPAT', 'LDWP', 'PACE', 'NEVP', 'AZPS', 'WALC']
reg_data = actuals_wide[relevant_cols].dropna()
reg_data['LDWP_lag1'] = reg_data['LDWP'].shift(1)
reg_data['LDWP_lag24'] = reg_data['LDWP'].shift(24)

reg_data = reg_data.dropna()
X = reg_data[['LDWP_lag1', 'LDWP_lag24', 'CISO', 'BPAT', 'PACE', 'NEVP', 'AZPS', 'WALC']]
y = reg_data['LDWP']

# Define hyperparameter grids for GridSearchCV
linear_param_grid = {
    'fit_intercept': [True, False]
}

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'random_state': [42]
}

gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 10],
    'random_state': [42]
}

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=True, random_state=614
        )


In [110]:


# Model 1: Linear Regression with GridSearchCV and Cross-Validation
linear_model = LinearRegression()
linear_grid_search = GridSearchCV(linear_model, linear_param_grid, cv=5, scoring='neg_mean_absolute_error')
linear_grid_search.fit(X_train, y_train)

# Best Linear Regression Model
best_linear_model = linear_grid_search.best_estimator_
linear_predictions = best_linear_model.predict(X_test)
linear_mape = mean_absolute_percentage_error(y_test, linear_predictions)
with open("./models/linear_regression_model.pkl", 'wb') as f:
    pickle.dump(best_linear_model, f)
print("Best Linear Regression Model saved successfully.")

# Model 2: Random Forest Regressor with GridSearchCV and Cross-Validation
rf_model = RandomForestRegressor(random_state=42)
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='neg_mean_absolute_error')
rf_grid_search.fit(X_train, y_train)

# Perform cross-validation for Random Forest
rf_cv_results = cross_validate(rf_grid_search.best_estimator_, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Best Random Forest Model
best_rf_model = rf_grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_test)
rf_mape = mean_absolute_percentage_error(y_test, rf_predictions)
with open("./models/random_forest_model.pkl", 'wb') as f:
    pickle.dump(best_rf_model, f)
print("Best Random Forest Model saved successfully.")

# Model 3: Gradient Boosting Regressor with GridSearchCV and Cross-Validation
gb_model = GradientBoostingRegressor(random_state=42)
gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=5, scoring='neg_mean_absolute_error')
gb_grid_search.fit(X_train, y_train)

# Perform cross-validation for Gradient Boosting
gb_cv_results = cross_validate(gb_grid_search.best_estimator_, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Best Gradient Boosting Model
best_gb_model = gb_grid_search.best_estimator_
gb_predictions = best_gb_model.predict(X_test)
gb_mape = mean_absolute_percentage_error(y_test, gb_predictions)
with open("./models/gradient_boosting_model.pkl", 'wb') as f:
    pickle.dump(best_gb_model, f)
print("Best Gradient Boosting Model saved successfully.")

# Compare model accuracy
print("Model Performance (MAPE):")
print(f"Linear Regression: {linear_mape:.4f}")
print(f"Random Forest Regressor: {rf_mape:.4f}")
print(f"Gradient Boosting Regressor: {gb_mape:.4f}")

# Evaluation results dictionary
evaluation_results = {
    "Linear Regression": {"MAPE": linear_mape},
    "Random Forest": {"MAPE": rf_mape},
    "Gradient Boosting": {"MAPE": gb_mape},
}

# Save evaluation results to JSON
with open("./data/evaluation_results.json", "w") as file:
    json.dump(evaluation_results, file)

print("Evaluation results saved successfully.")

# Print model coefficients or feature importances
print("Linear Model Coefficients:", best_linear_model.coef_)
print("Random Forest Feature Importances:", best_rf_model.feature_importances_)
print("Gradient Boosting Feature Importances:", best_gb_model.feature_importances_)

print("Results saved successfully.")

Best Linear Regression Model saved successfully.
Best Random Forest Model saved successfully.
Best Gradient Boosting Model saved successfully.
Model Performance (MAPE):
Linear Regression: 0.0430
Random Forest Regressor: 0.0322
Gradient Boosting Regressor: 0.0327
Evaluation results saved successfully.
Linear Model Coefficients: [ 0.86237141  0.12472476 -0.00631999  0.01101798  0.05736264 -0.0310797
  0.01150089  0.07930391]
Random Forest Feature Importances: [0.94671055 0.01579027 0.01025258 0.00575556 0.00784952 0.00471721
 0.00330078 0.00562353]
Gradient Boosting Feature Importances: [0.94347466 0.01569727 0.0119251  0.00652526 0.00788263 0.00463366
 0.00462161 0.00523981]
Results saved successfully.


In [112]:
base_models_1 = [
    ('linear', best_linear_model),
    ('rf', best_rf_model)
]

base_models_2 = [
    ('linear', best_linear_model),
    ('gb', best_gb_model)
]

base_models_3 = [
    ('rf', best_rf_model),
    ('gb', best_gb_model)
]

# Linear Regression Models with different hyperparameters (create multiple models)
linear_model_1 = LinearRegression(fit_intercept=True)
linear_model_2 = LinearRegression(fit_intercept=False)


# Create a dictionary to hold the models
stacking_results = {}

# Stacking Regressors
stacking_model_1 = StackingRegressor(estimators=base_models_1, final_estimator=linear_model_1)
stacking_model_2 = StackingRegressor(estimators=base_models_2, final_estimator=linear_model_2)
# Train each stacking model
stacking_model_1.fit(X_train, y_train)
stacking_model_2.fit(X_train, y_train)

# Predictions for each model
stacking_predictions_1 = stacking_model_1.predict(X_test)
stacking_predictions_2 = stacking_model_2.predict(X_test)

# Calculate MAPE for each stacking model
stacking_mape_1 = mean_absolute_percentage_error(y_test, stacking_predictions_1)
stacking_mape_2 = mean_absolute_percentage_error(y_test, stacking_predictions_2)

# Save stacking models
with open("./models/stacking_model_1.pkl", 'wb') as f:
    pickle.dump(stacking_model_1, f)
with open("./models/stacking_model_2.pkl", 'wb') as f:
    pickle.dump(stacking_model_2, f)


# Store results
stacking_results["Stacking Model 1"] = {"MAPE": stacking_mape_1}
stacking_results["Stacking Model 2"] = {"MAPE": stacking_mape_2}

evaluation_results.update(stacking_results)
# Save evaluation results to JSON
with open("./data/evaluation_results.json", "w") as file:
    json.dump(evaluation_results, file)

# Output performance results
print("Stacking Models Performance (MAPE):")
print(f"Stacking Model 1 (Linear + RF): {stacking_mape_1:.4f}")
print(f"Stacking Model 2 (Linear + GB): {stacking_mape_2:.4f}")


Stacking Models Performance (MAPE):
Stacking Model 1 (Linear + RF): 0.0333
Stacking Model 2 (Linear + GB): 0.0344


In [21]:
with open("./data/evaluation_results.json", "r") as file:
    evaluation_results = json.load(file) 
    
ridge = Ridge(alpha=1.0) 
ridge.fit(X_train, y_train)
ridge_predictions = ridge.predict(X_test)
ridge_mape = mean_absolute_percentage_error(y_test, ridge_predictions)
with open("./models/ridge.pkl", 'wb') as f:
    pickle.dump(ridge, f)


param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],  
    'activation': ['relu', 'tanh', 'logistic'],    
    'solver': ['adam', 'sgd'],                    
    'alpha': [0.0001, 0.001, 0.01],                
    'learning_rate': ['constant', 'adaptive'],     
    'max_iter': [500, 1000, 1500],                  
}

mlp = MLPRegressor(random_state=42)

grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

grid_search.fit(X_train, y_train)
best_mlp = grid_search.best_estimator_
mlp_predictions = best_mlp.predict(X_test)
mlp_mape = mean_absolute_percentage_error(y_test, mlp_predictions)
with open("./models/mlp.pkl", 'wb') as f:
    pickle.dump(mlp, f)




Linear Performance (MAPE):
Ridge: 0.0430
MLPRegressor: 0.0419
SVR: 0.3543


In [None]:

param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [0.01, 0.1, 1, 'scale'],
    'epsilon': [0.01, 0.1, 0.5, 1.0] 
}

svr = SVR(kernel='rbf')

grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_percentage_error')
grid_search.fit(X_train, y_train)

best_svr = grid_search.best_estimator_
svr_predictions = best_svr.predict(X_test)
svr_mape = mean_absolute_percentage_error(y_test, svr_predictions)
with open("./models/svr.pkl", 'wb') as f:
    pickle.dump(svr, f)


In [None]:

# Store results
results = {}
results["Ridge"] = {"MAPE": ridge_mape}
results["MLP"] = {"MAPE": mlp_mape}
results["SVR"] = {"MAPE": svr_mape}

evaluation_results.update(results)
# Save evaluation results to JSON
with open("./data/evaluation_results.json", "w") as file:
    json.dump(evaluation_results, file)

# Output performance results
print("Linear Performance (MAPE):")
print(f"Ridge: {ridge_mape:.4f}")
print(f"MLPRegressor: {mlp_mape:.4f}")
print(f"SVR: {svr_mape:.4f}")


In [22]:

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'max_iter': 500, 'solver': 'adam'}


In [19]:

knn = KNeighborsRegressor()
param_grid = {
    'n_neighbors': np.arange(1, 68)
}

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error')
grid_search.fit(X_train, y_train)

best_n_neighbors = grid_search.best_params_['n_neighbors']
print(f"Best number of neighbors: {best_n_neighbors}")
best_knn = grid_search.best_estimator_
knn_predictions = best_knn.predict(X_test)
knn_mape = mean_absolute_percentage_error(y_test, knn_predictions)
print(f"Best KNN Model MAPE: {knn_mape:.4f}")
results = {}
results["KNN"] = {"MAPE": knn_mape}
evaluation_results.update(results)
# Save evaluation results to JSON
with open("./data/evaluation_results.json", "w") as file:
    json.dump(evaluation_results, file)
with open("./models/knn.pkl", 'wb') as f:
    pickle.dump(best_knn, f)

Best number of neighbors: 3
Best KNN Model MAPE: 0.0451


In [24]:
with open("./data/predictions.json", "r") as file:
    predictions = json.load(file) 
updates = {}
updates["Ridge"] = ridge_predictions.tolist()
updates["MLP"] = mlp_predictions.tolist()
updates["SVR"] = svr_predictions.tolist()
updates["KNN"] = knn_predictions.tolist()
predictions.update(updates)
with open("./data/predictions.json", "w") as file:
    json.dump(predictions, file)

In [23]:
import joblib
import gzip

# with zipfile.ZipFile('./models/stacking_model_1.zip', 'r') as zipf:
#     zipf.extractall('./')
#     print(zipf.namelist())
folder_path = "./models"
for file_name in os.listdir(folder_path):
    # Check if the file is a pickle file
    if file_name.endswith('.pkl'):
        file_path = os.path.join(folder_path, file_name)
        compressed_file_path = file_path + '.gz'  # Add .gz extension
        
        # Read the pickle file
        with open(file_path, 'rb') as f_in:
            data = pickle.load(f_in)
        
        with gzip.open(compressed_file_path, 'wb') as f_out:
            pickle.dump(data, f_out)
        
        print(f"Compressed: {file_name} -> {compressed_file_path}")
        os.remove(file_path)
        print(f"Deleted original file: {file_name}")
    

Compressed: knn.pkl -> ./models/knn.pkl.gz
Deleted original file: knn.pkl
Compressed: ridge.pkl -> ./models/ridge.pkl.gz
Deleted original file: ridge.pkl
Compressed: mlp.pkl -> ./models/mlp.pkl.gz
Deleted original file: mlp.pkl
Compressed: svr.pkl -> ./models/svr.pkl.gz
Deleted original file: svr.pkl


In [148]:
file_path = './models/random_forest_model.pkl.gz'

# Get file size in bytes
file_size = os.path.getsize(file_path)

# Convert to KB or MB for readability
file_size_kb = file_size / 1024  # Convert to KB
file_size_mb = file_size_kb / 1024  # Convert to MB

print(f"File size: {file_size} bytes")
print(f"File size: {file_size_kb:.2f} KB")
print(f"File size: {file_size_mb:.2f} MB")

file_path = './models/random_forest_model.pkl'

# Get file size in bytes
file_size = os.path.getsize(file_path)

# Convert to KB or MB for readability
file_size_kb = file_size / 1024  # Convert to KB
file_size_mb = file_size_kb / 1024  # Convert to MB

print(f"File size: {file_size} bytes")
print(f"File size: {file_size_kb:.2f} KB")
print(f"File size: {file_size_mb:.2f} MB")



File size: 25308793 bytes
File size: 24715.62 KB
File size: 24.14 MB
File size: 112608325 bytes
File size: 109969.07 KB
File size: 107.39 MB


In [28]:
with open("./data/predictions.json", "r") as file:
    predictions = json.load(file) 
print(predictions.keys())

dict_keys(['Actual', 'Linear Regression', 'Random Forest', 'Gradient Boosting', 'Stacking Model 1', 'Stacking Model 2', 'Ridge', 'MLP', 'SVR', 'KNN'])
