# EV vs ICE Decision Support System - ML Training

## Layer 2: ML Intelligence Layer

This notebook implements PCA and K-Means clustering to categorize Indian cities based on their EV infrastructure readiness.

### Objectives:
1. Load and preprocess the ML-ready dataset
2. Apply PCA for dimensionality reduction
3. Use K-Means clustering to group cities
4. Export cluster assignments for frontend consumption

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session to keep cell
# output compact. NOTE(review): this also hides warnings raised by later cells.
warnings.filterwarnings('ignore')

# Notebook-wide display configuration: never truncate DataFrame columns when
# rendering, and draw all figures with the seaborn white-grid style sheet.
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load and Explore Data

In [None]:
# Load the ML-ready dataset
# Columns: State, City, Vehicle Class, Energy Charge (₹/kWh), Cost_per_km_₹, CO2_per_km_kg, Maintenance Cost (₹/km), Charging Stations Count
df = pd.read_csv('../public/data/EV_ICE_ML_READY.csv')

# Fail fast with a readable message if the CSV schema has drifted: every column
# listed here is read by name in the later cells of this notebook, so a missing
# one would otherwise surface as a bare KeyError far from its cause.
required_columns = [
    'State', 'City',
    'Energy Charge (₹/kWh)', 'Cost_per_km_₹', 'CO2_per_km_kg',
    'Maintenance Cost (₹/km)', 'Charging Stations Count',
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"EV_ICE_ML_READY.csv is missing expected columns: {missing_columns}")

print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nFirst 5 rows:")
df.head()

In [None]:
# Quick data-quality overview: per-column dtypes, per-column null counts, and
# the number of distinct cities and states present in the dataset.
print("Data Types:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")
print("\nUnique Cities:", df['City'].nunique())
print("Unique States:", df['State'].nunique())

## 2. Feature Engineering for Clustering

We'll aggregate city-level metrics for clustering. Each city will have:
- Average Energy Charge
- Average Cost per km
- Average CO2 per km
- Average Maintenance Cost
- Charging Stations Count

In [None]:
# Collapse the rows of each (State, City) pair into a single city-level record.
# Named aggregation gives every output column its final name directly, so no
# positional renaming step is needed afterwards.
city_metrics = (
    df.groupby(['State', 'City'])
    .agg(
        Avg_Energy_Charge=('Energy Charge (₹/kWh)', 'mean'),
        Avg_Cost_per_km=('Cost_per_km_₹', 'mean'),
        Avg_CO2_per_km=('CO2_per_km_kg', 'mean'),
        Avg_Maintenance_Cost=('Maintenance Cost (₹/km)', 'mean'),
        # Not averaged: the count is assumed to be identical for every vehicle
        # class within a city, so the first value per group is taken.
        Charging_Stations=('Charging Stations Count', 'first'),
    )
    .reset_index()
)

print("City-level aggregated data:")
city_metrics

In [None]:
# Derive two composite indices per city, each roughly in the 0-1 range where a
# higher value is better:
#   Economic_Index      - based solely on the average cost per km (inverted,
#                         because a lower cost is better)
#   Environmental_Index - mean of the normalized charging-station count and the
#                         inverted normalized CO2 per km


def _min_max(values):
    """Min-max rescale `values`; the 1e-10 term guards against a zero range."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest + 1e-10)


# Standardize the raw city metrics; only the cost-per-km column of the result
# is consumed below.
scaler = StandardScaler()
features_for_index = ['Avg_Energy_Charge', 'Avg_Cost_per_km', 'Avg_CO2_per_km',
                      'Avg_Maintenance_Cost', 'Charging_Stations']
city_metrics_scaled = scaler.fit_transform(city_metrics[features_for_index])

# Look the column up by name instead of hard-coding its position (index 1).
cost_column = features_for_index.index('Avg_Cost_per_km')
city_metrics['Economic_Index'] = 1 - _min_max(city_metrics_scaled[:, cost_column])

# More charging stations is better (kept as-is); more CO2 is worse (inverted).
charging_norm = _min_max(city_metrics['Charging_Stations'])
co2_norm = 1 - _min_max(city_metrics['Avg_CO2_per_km'])

city_metrics['Environmental_Index'] = (charging_norm + co2_norm) / 2

print("City metrics with indices:")
city_metrics

## 3. PCA for Dimensionality Reduction

In [None]:
# Reduce the five city-level metrics to two principal components.
features_for_pca = [
    'Avg_Energy_Charge',
    'Avg_Cost_per_km',
    'Avg_CO2_per_km',
    'Avg_Maintenance_Cost',
    'Charging_Stations',
]

# PCA is sensitive to feature scale, so standardize each column (zero mean,
# unit variance) before fitting.
X = city_metrics[features_for_pca].to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

explained = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained)
print("Total Variance Explained:", sum(explained))

# Store each city's 2-D coordinates alongside its metrics; the clustering and
# plotting cells read them from here.
city_metrics['PC1'], city_metrics['PC2'] = X_pca[:, 0], X_pca[:, 1]

In [None]:
# Heatmap of the PCA loadings: one row per input feature, one column per
# principal component, showing how strongly each feature contributes.
loadings = pd.DataFrame(
    pca.components_,
    index=['PC1', 'PC2'],
    columns=features_for_pca
).T

fig, ax = plt.subplots(figsize=(10, 6))
# center=0 keeps the diverging colormap symmetric around zero loading.
sns.heatmap(loadings, annot=True, cmap='RdBu_r', center=0, ax=ax)
ax.set_title('PCA Component Loadings')
fig.tight_layout()
fig.savefig('pca_loadings.png', dpi=150)
plt.show()

## 4. K-Means Clustering

In [None]:
# Determine optimal number of clusters using the Elbow Method (inertia) and
# silhouette analysis, both evaluated on the 2-D PCA projection.
inertias = []
silhouette_scores = []
# Candidate K values: 2 up to at most 7, and never more than (number of
# cities - 1), because the silhouette score is only defined when the number of
# clusters is strictly smaller than the number of samples.
K_range = range(2, min(8, len(city_metrics)))

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_pca, kmeans.labels_))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(K_range, inertias, 'bo-')
ax1.set_xlabel('Number of Clusters (K)')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow Method for Optimal K')

ax2.plot(K_range, silhouette_scores, 'go-')
ax2.set_xlabel('Number of Clusters (K)')
ax2.set_ylabel('Silhouette Score')
ax2.set_title('Silhouette Analysis')

# K is an integer quantity: pin the ticks to the evaluated values so the axis
# never shows fractional cluster counts.
for ax in (ax1, ax2):
    ax.set_xticks(list(K_range))

# Match the other figure cells: tighten the layout before saving so labels are
# not clipped in the exported PNG.
fig.tight_layout()
plt.savefig('elbow_method.png', dpi=150)
plt.show()

print("\nSilhouette Scores:")
for k, score in zip(K_range, silhouette_scores):
    print(f"  K={k}: {score:.4f}")

In [None]:
# Apply K-Means with 3 clusters (EV-Ready, Moderate, Low-Infra) on the 2-D PCA
# projection and store the raw integer label of each city.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
city_metrics['Cluster_ID'] = kmeans.fit_predict(X_pca)

# K-Means label numbers are arbitrary, so rank the clusters by their mean
# Economic and Environmental indices to attach meaningful names.
cluster_stats = city_metrics.groupby('Cluster_ID').agg({
    'Economic_Index': 'mean',
    'Environmental_Index': 'mean',
    'Charging_Stations': 'mean'
}).reset_index()

print("Cluster Statistics:")
print(cluster_stats)

# Best cluster first: highest combined (economic + environmental) score.
cluster_stats['Combined_Score'] = cluster_stats['Economic_Index'] + cluster_stats['Environmental_Index']
cluster_stats = cluster_stats.sort_values('Combined_Score', ascending=False)

# Pair the ranked cluster IDs with the names, best to worst. `.tolist()` yields
# plain Python ints; iterating with iterrows() would upcast each ID to float
# and key the map by 0.0 / 1.0 / 2.0. zip() stops at the shorter sequence, so
# any cluster beyond the available names stays unmapped.
cluster_names = ['EV-Ready', 'Moderate', 'Low-Infra']
cluster_name_map = dict(zip(cluster_stats['Cluster_ID'].tolist(), cluster_names))

city_metrics['Cluster'] = city_metrics['Cluster_ID'].map(cluster_name_map)

print("\nCluster Name Mapping:")
print(cluster_name_map)
print("\nCities by Cluster:")
print(city_metrics[['State', 'City', 'Cluster', 'Economic_Index', 'Environmental_Index']].sort_values('Cluster'))

In [None]:
# Scatter plot of every city in PCA space, colored by its named cluster and
# annotated with the city name.
fig, ax = plt.subplots(figsize=(12, 8))
# Cluster name -> hex color; this mapping is also read by the summary
# visualization cell further down.
colors = {'EV-Ready': '#22c55e', 'Moderate': '#eab308', 'Low-Infra': '#ef4444'}

for cluster in city_metrics['Cluster'].unique():
    mask = city_metrics['Cluster'] == cluster
    members = city_metrics.loc[mask]

    # One scatter call per cluster so each gets its own legend entry; clusters
    # without a configured color fall back to grey.
    ax.scatter(
        members['PC1'],
        members['PC2'],
        c=colors.get(cluster, '#888888'),
        label=cluster,
        s=150,
        alpha=0.7
    )

    # Label every point with its city name, offset slightly from the marker.
    for city, pc1, pc2 in zip(members['City'], members['PC1'], members['PC2']):
        ax.annotate(
            city,
            (pc1, pc2),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9
        )

ax.set(
    xlabel='Principal Component 1 (Economic Factors)',
    ylabel='Principal Component 2 (Environmental Factors)',
    title='City Clusters: EV Infrastructure Readiness',
)
ax.legend(title='Cluster')
fig.tight_layout()
fig.savefig('city_clusters.png', dpi=150)
plt.show()

## 5. Export Cluster Assignments

In [None]:
# Rescale both indices so each spans the full 0-1 range for the frontend.
# The rescaled values are written back into city_metrics, so the cells that
# follow see them too.
index_columns = ['Economic_Index', 'Environmental_Index']
scaler_minmax = MinMaxScaler()
city_metrics[index_columns] = scaler_minmax.fit_transform(city_metrics[index_columns])

# Source column -> column name expected by the frontend, in output order.
# Note that the "Advantage" and "Density" columns carry the raw city averages
# and station counts, not derived scores.
frontend_columns = {
    'State': 'State',
    'City': 'City',
    'Cluster': 'Cluster',
    'Economic_Index': 'Economic_Index',
    'Environmental_Index': 'Environmental_Index',
    'Avg_Cost_per_km': 'Cost_Advantage',
    'Avg_CO2_per_km': 'CO2_Advantage',
    'Charging_Stations': 'Charging_Density',
    'Avg_Maintenance_Cost': 'Maintenance_Cost',
}
output_df = city_metrics[list(frontend_columns)].rename(columns=frontend_columns)

# Round values for cleaner output (decimal places per column).
decimals = {
    'Economic_Index': 4,
    'Environmental_Index': 4,
    'Cost_Advantage': 2,
    'CO2_Advantage': 4,
    'Charging_Density': 2,
    'Maintenance_Cost': 2,
}
for column, places in decimals.items():
    output_df[column] = output_df[column].round(places)

# Export to CSV
output_df.to_csv('../public/data/CITY_CLUSTERS.csv', index=False)

print("Exported cluster assignments to ../public/data/CITY_CLUSTERS.csv")
print("\nFinal Output:")
output_df

## 6. Summary Statistics

In [None]:
# Per-cluster summary table: number of cities plus the mean of each headline
# metric and index, rounded to three decimals. Named aggregation assigns the
# display-ready column titles directly.
summary = (
    city_metrics.groupby('Cluster')
    .agg(**{
        'Cities Count': ('City', 'count'),
        'Avg Cost/km': ('Avg_Cost_per_km', 'mean'),
        'Avg CO2/km': ('Avg_CO2_per_km', 'mean'),
        'Avg Charging Stations': ('Charging_Stations', 'mean'),
        'Avg Economic Index': ('Economic_Index', 'mean'),
        'Avg Environmental Index': ('Environmental_Index', 'mean'),
    })
    .round(3)
)

print("Cluster Summary:")
summary

In [None]:
# Two-panel summary figure: cities positioned by their two indices (left) and
# the share of cities falling in each cluster (right). Relies on the `colors`
# mapping defined in the cluster visualization cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, pie_ax = axes

# Left panel: one scatter series per cluster so each gets a legend entry.
for cluster in city_metrics['Cluster'].unique():
    mask = city_metrics['Cluster'] == cluster
    scatter_ax.scatter(
        city_metrics.loc[mask, 'Economic_Index'],
        city_metrics.loc[mask, 'Environmental_Index'],
        c=colors.get(cluster, '#888888'),
        label=cluster,
        s=150,
        alpha=0.7
    )

scatter_ax.set(
    xlabel='Economic Index',
    ylabel='Environmental Index',
    title='City Positioning: Economic vs Environmental',
)
scatter_ax.legend(title='Cluster')

# Right panel: pie chart of cluster sizes, colored consistently with the
# scatter plot (grey for any cluster without a configured color).
cluster_counts = city_metrics['Cluster'].value_counts()
pie_colors = [colors.get(name, '#888888') for name in cluster_counts.index]
pie_ax.pie(
    cluster_counts.values,
    labels=cluster_counts.index,
    colors=pie_colors,
    autopct='%1.0f%%',
    startangle=90
)
pie_ax.set_title('Cluster Distribution')

fig.tight_layout()
fig.savefig('cluster_summary.png', dpi=150)
plt.show()