In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the Benin (Malanville) CSV with 'Timestamp' parsed as datetimes, then
# discard the 'Comments' column in the same expression.
# NOTE(review): the path is an absolute, machine-specific location; the
# notebook will only run where this exact file exists.
benin_df = pd.read_csv(
    r'C:\Users\Eyor.G\Pictures\Projects\solar-site-optimizer\data\raw\benin-malanville.csv',
    parse_dates=["Timestamp"],
).drop(columns='Comments')

In [3]:
# Scaling Numerical features
scaler = StandardScaler()
numerical_cols = benin_df.select_dtypes(include=['float64', 'int64']).columns
benin_df[numerical_cols] = scaler.fit_transform(benin_df[numerical_cols])

In [4]:
# Spliting data
selected_features = ['GHI', 'DNI', 'DHI', 'Tamb', 'TModA', 'TModB']

x = benin_df[selected_features] # Best correlated features
y = benin_df['ModA'] # Target variable (Solar power output)

# ModA is highly correlated with GHI (0.994), DNI (0.892), and DHI (0.829).
# Including irrelevant features may introduce noise and reduce model performance.

In [5]:
# Split the dataset into 80-20
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Model Building**

In [6]:
# Candidate regressors to compare; the tree-based models get a fixed seed so
# repeated runs produce the same scores.
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Display name -> estimator. The display name is printed with the scores and
# is also used to build the file name of the saved model.
models = {'Linear Regression': lr,
          'Decision Tree': dt,
          'Random Forest': rf}

# Filled during cross-validation: display name -> {'RMSE': ..., 'R Squared': ...}
results = {}

In [8]:
# Perform cross-validation and collect results
for model_name, model in models.items():
    # One 3-fold pass per model that scores both metrics on the same fits,
    # instead of running a separate cross-validation for each metric.
    cv_scores = cross_validate(
        model,
        X_train,
        y_train,
        cv=3,
        scoring={'r2': 'r2', 'neg_mse': 'neg_mean_squared_error'},
    )

    # Mean R-squared across the folds
    average_r2 = cv_scores['test_r2'].mean()

    # The scorer reports negated MSE; flip the sign of the fold mean and take
    # the square root to obtain RMSE.
    average_rmse = np.sqrt(-cv_scores['test_neg_mse'].mean())

    results[model_name] = {'RMSE': average_rmse, 'R Squared': average_r2}

# Initial state for the best-model search: nothing chosen yet, and any finite
# RMSE beats infinity.
best_model_name = None
best_model = None
best_rmse = float('inf')

In [None]:
# Report every model's cross-validated metrics and remember the one with the
# smallest RMSE. The comparison is strict, so on a tie the earlier entry wins.
for name in results:
    metrics = results[name]
    rmse, r_squared = metrics['RMSE'], metrics['R Squared']
    print(f'{name} - RMSE: {rmse:.4f}, R Squared: {r_squared:.4f}')
    if rmse < best_rmse:
        best_rmse, best_model_name, best_model = rmse, name, models[name]

Liner Regression - RMSE: 0.0895, R Squared: 0.9920
Decision Tree - RMSE: 0.0621, R Squared: 0.9961
Random Forest - RMSE: 0.0428, R Squared: 0.9982


In [11]:
# Save the best model
if best_model is not None:
    # Cross-validation only fits internal clones of each estimator, so the
    # selected estimator object is still unfitted at this point. Train it on
    # the training split before persisting; otherwise the pickle would hold a
    # model that cannot predict.
    best_model.fit(X_train, y_train)

    # File name is derived from the display name, e.g. "random_forest_model.pkl".
    joblib.dump(best_model, f'{best_model_name.replace(" ", "_").lower()}_model.pkl')
    print(f'Saved the best model: {best_model_name} with RMSE: {best_rmse:.4f}')

Saved the best model: Random Forest with RMSE: 0.0428
