In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score

# Read the cleaned solar dataset; every later cell works off this frame.
df = pd.read_csv('/content/cleaned_solar_dataset.csv')

# Normalise headers: lower-case, with spaces replaced by underscores.
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

# - installation_date: parsed to datetime; values that fail to parse are coerced to NaT.
# - subsidy_%: subsidy as a share of system cost, clipped to [0, 1], then scaled to 0-100.
df = df.assign(
    installation_date=pd.to_datetime(df['installation_date'], errors='coerce'),
    **{'subsidy_%': df['govt_subsidy_availed'].div(df['system_cost']).clip(0, 1).mul(100)},
)


In [None]:

# Columns selected from `df` as model inputs, in the order the models see them.
features = [
    'household_size',
    'house_area_sqft',
    'household_income',
    'monthly_consumption_kwh',
    'solar_generation_kwh',
    'battery_storage_kwh',
    'net_energy_sent_to_grid_kwh',
    'system_cost',
    'govt_subsidy_availed',
    'loan_amount',
    'interest_rate_%',
    'emi_per_month',
    'payback_period_years',
    'maintenance_cost_per_year',
    'annual_output_kwh',
    'subsidy_%',
]


In [None]:
# Overlay the density curves of monthly consumption and solar generation on one Axes.
ax = sns.kdeplot(df['monthly_consumption_kwh'], label='Consumption', fill=True)
sns.kdeplot(df['solar_generation_kwh'], label='Generation', fill=True, ax=ax)
ax.legend()
ax.set_title('Consumption vs Generation')
plt.show()

In [None]:
# Pairwise correlations of the numeric columns only, drawn without cell annotations.
corr_matrix = df.corr(numeric_only=True)
ax = sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
ax.set_title('Correlation')
plt.show()

In [None]:
# Scatter of monthly savings against system cost, one point per row of df.
ax = sns.scatterplot(data=df, x='system_cost', y='monthly_savings_rs')
ax.set_title('Savings vs Cost')
plt.show()

In [None]:
# Histogram of the payback period with a KDE curve drawn on top.
ax = sns.histplot(df['payback_period_years'], kde=True)
ax.set_title('Payback Distribution')
plt.show()

In [None]:
# Scatter of ROI against the derived subsidy percentage column.
ax = sns.scatterplot(data=df, x='subsidy_%', y='roi_%')
ax.set_title('Subsidy vs ROI')
plt.show()

In [None]:
# Scatter of yearly CO2 saved against annual energy output.
ax = sns.scatterplot(data=df, x='annual_output_kwh', y='co2_saved_kg_per_year')
ax.set_title('CO2 Saved vs Output')
plt.show()

In [None]:
# Feature matrix (reused by later modelling cells) and the ROI regression target.
X = df[features]
y = df['roi_%']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest and predict on the held-out rows.
model_roi = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
pred = model_roi.predict(X_test)

# Report RMSE (square root of the MSE) and R2 on the test split.
roi_rmse = np.sqrt(mean_squared_error(y_test, pred))
roi_r2 = r2_score(y_test, pred)
print('ROI RMSE:', roi_rmse, '| R2:', roi_r2)

In [None]:
y_pp = df['payback_period_years']
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X, y_pp, test_size=0.2, random_state=42)
model_pp = RandomForestRegressor(n_estimators=100, random_state=42)
model_pp.fit(X_train_pp, y_train_pp)
pred_pp = model_pp.predict(X_test_pp)
print('Payback RMSE:', np.sqrt(mean_squared_error(y_test_pp, pred_pp)), '| R2:', r2_score(y_test_pp, pred_pp))

In [None]:
y_ms = df['monthly_savings_rs']
X_train_ms, X_test_ms, y_train_ms, y_test_ms = train_test_split(X, y_ms, test_size=0.2, random_state=42)
model_ms = RandomForestRegressor(n_estimators=100, random_state=42)
model_ms.fit(X_train_ms, y_train_ms)
pred_ms = model_ms.predict(X_test_ms)
print('Savings RMSE:', np.sqrt(mean_squared_error(y_test_ms, pred_ms)), '| R2:', r2_score(y_test_ms, pred_ms))

In [None]:
df['high_roi'] = (df['roi_%'] >= 30).astype(int)
y_clf = df['high_roi']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)
model_clf = RandomForestClassifier(n_estimators=100, random_state=42)
model_clf.fit(X_train_c, y_train_c)
pred_c = model_clf.predict(X_test_c)
proba_c = model_clf.predict_proba(X_test_c)[:, 1]
print('Classifier Accuracy:', accuracy_score(y_test_c, pred_c), '| ROC AUC:', roc_auc_score(y_test_c, proba_c))

In [None]:

for model, name in zip([model_roi, model_pp, model_ms, model_clf], ['ROI', 'Payback', 'Savings', 'Classifier']):
    importances = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_}).sort_values(by='Importance', ascending=False)
    sns.barplot(x='Importance', y='Feature', data=importances)
    plt.title(f'Feature Importance - {name}')
    plt.tight_layout()
    plt.show()
