# Solar Installation Analysis Notebook
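This notebook loads the cleaned solar installation dataset, engineers features (data cleaning), visualizes consumption versus generation, and trains random-forest ML models for ROI, payback period, monthly savings, and high-ROI classification.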

In [None]:

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score

# Read the cleaned dataset from CSV into the working DataFrame `df`,
# which every later cell in this notebook reads from.
df = pd.read_csv(filepath_or_buffer='cleaned_solar_dataset.csv')


## 2. Data Cleaning

In [None]:

# Convert date and engineer features

# Normalize column names FIRST (lowercase, spaces -> underscores) so that every
# lookup below uses the same names regardless of how the CSV header was written.
# Doing this after the lookups would raise KeyError on a non-normalized header.
df.columns = [c.lower().replace(' ', '_') for c in df.columns]

# Parse installation dates; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['installation_date'] = pd.to_datetime(df['installation_date'], errors='coerce')

# Subsidy as a percentage of system cost. A zero system cost is masked to NaN
# so the division yields NaN rather than +/-inf (scikit-learn estimators
# reject infinite feature values).
system_cost_nonzero = df['system_cost'].where(df['system_cost'] != 0)
df['subsidy_%'] = (df['govt_subsidy_availed'] / system_cost_nonzero) * 100

# Preview the first rows to confirm the new columns.
df.head()


## 3. Visualizations
### 3.1 Distribution of Consumption vs Generation

In [None]:

# Overlay kernel density estimates of the consumption and generation columns
# on one explicit Axes so the two distributions can be compared directly.
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(df['monthly_consumption_kwh'], label='Consumption', ax=ax)
sns.kdeplot(df['solar_generation_kwh'], label='Generation', ax=ax)

# Title and axis labels let the figure stand alone when the notebook is skimmed.
ax.set_title('Distribution of Consumption vs Generation')
ax.set_xlabel('Energy (kWh)')
ax.set_ylabel('Density')
ax.legend()
plt.show()


## 4. Machine Learning Models
### 4.1 ROI Prediction

In [None]:

# ROI Prediction
# Feature columns shared by the modeling cells in this notebook.
# NOTE(review): 'payback_period_years' may be computed from the same inputs as
# ROI; if so it leaks the target here -- confirm against the data dictionary.
features = [
    'household_size','house_area_sqft','household_income',
    'monthly_consumption_kwh','solar_generation_kwh','battery_storage_kwh',
    'net_energy_sent_to_grid_kwh','system_cost','govt_subsidy_availed',
    'loan_amount','interest_rate_%','emi_per_month','payback_period_years',
    'maintenance_cost_per_year','annual_output_kwh','subsidy_%'
]
target = 'roi_%'

X = df[features]
y = df[target]

# 80/20 hold-out split; fixed random_state keeps the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new versions.
print('ROI RMSE:', np.sqrt(mean_squared_error(y_test, pred)))
print('ROI R2:', r2_score(y_test, pred))


### 4.2 Payback Period Prediction

In [None]:

# Payback Period Prediction
target_pp = 'payback_period_years'

# The shared `features` list contains 'payback_period_years' itself. Training
# on it would hand the model the answer (target leakage) and produce a
# meaninglessly perfect score, so the target is dropped from the inputs here.
features_pp = [col for col in features if col != target_pp]

X_pp = df[features_pp]
y_pp = df[target_pp]

# 80/20 hold-out split; fixed random_state keeps the split and the forest reproducible.
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_pp, y_pp, test_size=0.2, random_state=42)
model_pp = RandomForestRegressor(n_estimators=100, random_state=42)
model_pp.fit(X_train_pp, y_train_pp)
pred_pp = model_pp.predict(X_test_pp)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print('Payback RMSE:', np.sqrt(mean_squared_error(y_test_pp, pred_pp)))
print('Payback R2:', r2_score(y_test_pp, pred_pp))


### 4.3 Monthly Savings Prediction

In [None]:

# Monthly Savings Prediction
# Regress 'monthly_savings_rs' on the shared `features` list with a random forest.
target_ms = 'monthly_savings_rs'
X_ms = df[features]
y_ms = df[target_ms]

# 80/20 hold-out split; fixed random_state keeps the split and the forest reproducible.
X_train_ms, X_test_ms, y_train_ms, y_test_ms = train_test_split(X_ms, y_ms, test_size=0.2, random_state=42)
model_ms = RandomForestRegressor(n_estimators=100, random_state=42)
model_ms.fit(X_train_ms, y_train_ms)
pred_ms = model_ms.predict(X_test_ms)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print('Savings RMSE:', np.sqrt(mean_squared_error(y_test_ms, pred_ms)))
print('Savings R2:', r2_score(y_test_ms, pred_ms))


### 4.4 High ROI Classification

In [None]:

# High ROI Classification
# Binary label: 1 when 'roi_%' is at or above the threshold, otherwise 0.
HIGH_ROI_THRESHOLD = 30  # cut-off applied to the 'roi_%' column
df['high_roi'] = (df['roi_%'] >= HIGH_ROI_THRESHOLD).astype(int)

# NOTE(review): some inputs (e.g. 'payback_period_years') may be closely tied
# to ROI and make this task trivially easy -- confirm they are legitimate features.
X_clf = df[features]
y_clf = df['high_roi']

# 80/20 hold-out split; fixed random_state keeps the split and the forest reproducible.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_c, y_train_c)
pred_c = clf.predict(X_test_c)
print('Accuracy:', accuracy_score(y_test_c, pred_c))

# ROC AUC is only defined when the test labels contain both classes, and the
# positive-class probability column exists only if class 1 was seen in training.
# Look the column up via `clf.classes_` instead of assuming it is index 1.
if y_test_c.nunique() == 2 and 1 in clf.classes_:
    positive_col = list(clf.classes_).index(1)
    print('ROC AUC:', roc_auc_score(y_test_c, clf.predict_proba(X_test_c)[:, positive_col]))
else:
    print('ROC AUC: undefined (only one class present in the train or test split)')
