<a href="https://colab.research.google.com/github/emwebaze/PUE-prediction/blob/main/notebooks/predict_PUE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict PUE
Build a model or formula for predicting PUE from features extracted from the geographical characteristics of villages.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import pickle

## Load and preprocess data

Load data of existing mini grids.

In [None]:
# Load the training data for existing mini grids.
# The CSV is read from the notebook's current working directory.
data = pd.read_csv("existing_minigrid_trainingdata.csv")

# Selected features (excluding specified ones)
features = ['building_count', 'permanent_building_count', 'educational_facilities',
            'health_facilities', 'social_facilities', 'services', 'primary_roads', 'secondary_roads',
            'tertiary_roads', 'unclassified_roads', 'percentage_crop_land', 'percentage_built_area',
            'mean_pvout_solar_radiation', 'mean_wind_speed']
X = data[features]
# NOTE(review): the target column is named 'winch_prob' while the notebook labels
# its predictions "PUE" — confirm this is the intended label column.
y = data['winch_prob']

# Hold out a random 30% of the rows for evaluation.
np.random.seed(42)  # For reproducibility
eval_indices = np.random.choice(X.index, size=int(0.3 * len(X)), replace=False)
X_eval = X.loc[eval_indices]
y_eval = y.loc[eval_indices]

# Train only on the rows that were NOT held out. Fitting on the full dataset and
# then scoring on a subset of it reports in-sample error, which overstates how
# well the models generalise to unseen villages.
X_train = X.drop(index=eval_indices)
y_train = y.drop(index=eval_indices)

# Sanity checks: the two sets are disjoint and together cover every row.
assert X_train.index.intersection(X_eval.index).empty, "evaluation rows leaked into training set"
assert len(X_train) + len(X_eval) == len(X)

# Helper function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print the mean squared error and R² of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label placed in the report header.

    Returns:
        None. The report is written to stdout only.
    """
    # Compute both metrics before printing anything, so a failure in either
    # leaves no partial report behind.
    mse_value = mean_squared_error(y_true, y_pred)
    r2_value = r2_score(y_true, y_pred)

    report_lines = (
        f"\n{model_name} Performance on 30% Evaluation Set:",
        f"Mean Squared Error: {mse_value:.4f}",
        f"R² Score: {r2_value:.4f}",
    )
    print(*report_lines, sep="\n")



In [None]:
# Fit four regressors on (X_train, y_train), score each on (X_eval, y_eval),
# and pickle every fitted object into the current working directory.
# NOTE(review): the scores below are only out-of-sample if X_eval is disjoint
# from X_train — confirm against the cell that builds the split.


def save_pickle(obj, path):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def print_feature_values(title, values):
    """Print ``title`` followed by one ``feature: value`` line per entry in ``features``."""
    print(title)
    for feature, value in zip(features, values):
        print(f"{feature}: {value:.4f}")


# 1. Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_eval)

# Save Linear Regression model
save_pickle(lin_reg, 'lin_reg.pkl')

# Coefficients
print_feature_values("Linear Regression Coefficients:", lin_reg.coef_)
print(f"Intercept: {lin_reg.intercept_:.4f}")

evaluate_model(y_eval, y_pred_lin, "Linear Regression")

# Sample PUE formula (Linear)
# The sign of each coefficient becomes the operator, so negative terms read
# "- 0.0743 * x" rather than "+ -0.0743 * x". Six significant digits are kept
# because fixed 4-decimal rounding collapses small coefficients to 0.0000,
# which makes the printed formula unusable on its own.
print("\nLinear PUE Formula:")
formula = f"PUE = {lin_reg.intercept_:.6g}"
for feature, coef in zip(features, lin_reg.coef_):
    operator = '-' if coef < 0 else '+'
    formula += f" {operator} {abs(coef):.6g} * {feature}"
print(formula)

# 2. Polynomial Regression (degree 2)
# The expansion is fitted on the training features and only applied to the
# evaluation features, so both go through the same transformation.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_eval_poly = poly.transform(X_eval)

poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_eval_poly)

# Save Polynomial Regression model and PolynomialFeatures object
# (both are needed later: new inputs must be expanded by `poly` before
# being passed to `poly_reg`).
save_pickle(poly_reg, 'poly_reg.pkl')
save_pickle(poly, 'poly_features.pkl')

evaluate_model(y_eval, y_pred_poly, "Polynomial Regression (degree 2)")

# 3. Random Forest Regression
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_eval)

# Save Random Forest model
save_pickle(rf_reg, 'rf_reg.pkl')

# Feature importance
print_feature_values("\nRandom Forest Feature Importances:", rf_reg.feature_importances_)

evaluate_model(y_eval, y_pred_rf, "Random Forest Regression")

# 4. XGBoost Regression
xgb_reg = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42, objective='reg:squarederror')
xgb_reg.fit(X_train, y_train)
y_pred_xgb = xgb_reg.predict(X_eval)

# Save XGBoost model
save_pickle(xgb_reg, 'xgb_reg.pkl')

# Feature importance
print_feature_values("\nXGBoost Feature Importances:", xgb_reg.feature_importances_)

evaluate_model(y_eval, y_pred_xgb, "XGBoost Regression")

Linear Regression Coefficients:
building_count: 0.0001
permanent_building_count: 0.0008
educational_facilities: -0.0743
health_facilities: 0.0472
social_facilities: -0.0000
services: -0.0000
primary_roads: 0.0000
secondary_roads: -0.0610
tertiary_roads: -0.0279
unclassified_roads: -0.0167
percentage_crop_land: 0.0017
percentage_built_area: 0.0164
mean_pvout_solar_radiation: 0.0004
mean_wind_speed: 0.1129
Intercept: -0.2687

Linear Regression Performance on 30% Evaluation Set:
Mean Squared Error: 0.0143
R² Score: 0.6570

Linear PUE Formula:
PUE = -0.2687 + 0.0001 * building_count + 0.0008 * permanent_building_count + -0.0743 * educational_facilities + 0.0472 * health_facilities + -0.0000 * social_facilities + -0.0000 * services + 0.0000 * primary_roads + -0.0610 * secondary_roads + -0.0279 * tertiary_roads + -0.0167 * unclassified_roads + 0.0017 * percentage_crop_land + 0.0164 * percentage_built_area + 0.0004 * mean_pvout_solar_radiation + 0.1129 * mean_wind_speed

Polynomial Regression

In [None]:
# Bundle the four pickled models and the PolynomialFeatures transformer into a single archive.
!zip -r pue_models.zip lin_reg.pkl poly_reg.pkl poly_features.pkl rf_reg.pkl xgb_reg.pkl

  adding: lin_reg.pkl (deflated 24%)
  adding: poly_reg.pkl (deflated 21%)
  adding: poly_features.pkl (deflated 36%)
  adding: rf_reg.pkl (deflated 84%)
  adding: xgb_reg.pkl (deflated 87%)
