# Notebook 7: Phase 3 (Version 2) - K-Means with Tuned Hyperparameters

**Goal**: Combine the "Divide & Conquer" strategy (K-Means) with the **Optimized Hyperparameters** found in Phase 2.
**Hypothesis**: Does tuning the individual cluster models provide further performance gains?
- **Comparison**: Phase 3 v1 (Default Params) vs Phase 3 v2 (Tuned Params).

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import sys
import os

# Add src to path
sys.path.append(os.path.abspath(os.path.join('../src')))
from preprocessing import load_data

# Rows with Year < SPLIT_YEAR are used for training, the rest for testing.
SPLIT_YEAR = 2015
# Name of the regression target column.
TARGET = 'Value_co2_emissions_kt_by_country'

# Tuned hyperparameters (found in Phase 2), keyed by algorithm name.
with open('../data/results/best_hyperparameters.json') as f:
    best_params = json.load(f)

print("Loaded Best Params:")
print(best_params)

# Three views of the data: the common table plus one prepared table per model.
df_common = load_data('../data/processed/common_preprocessed.csv')
df_lr = load_data('../data/processed/lr_final_prep.csv')
df_xgb = load_data('../data/processed/xgb_final_prep.csv')


def _restrict_to_common(frame):
    """Return ``frame`` limited to index labels shared with ``df_common``,
    together with those shared labels."""
    shared = df_common.index.intersection(frame.index)
    return frame.loc[shared], shared


# Keep only rows whose index label also exists in the common table.
df_lr, common_idx_lr = _restrict_to_common(df_lr)
df_xgb, common_idx_xgb = _restrict_to_common(df_xgb)

# The train/test split needs a 'Year' column; copy it over from the common
# table (matched on index label) when a prepared table does not carry it.
for _frame in (df_lr, df_xgb):
    if 'Year' not in _frame.columns:
        _frame['Year'] = df_common.loc[_frame.index, 'Year']

Loaded Best Params:
{'Linear Regression': {'alpha': 10.0}, 'XGBoost': {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}}
Loaded data from ../data/processed/common_preprocessed.csv: (3473, 25)
Loaded data from ../data/processed/lr_final_prep.csv: (2190, 193)
Loaded data from ../data/processed/xgb_final_prep.csv: (3473, 25)


## 1. Clustering (Same as Phase 3 v1)

In [2]:
# Indicators used to build one profile per entity for clustering.
cluster_cols = [
    'gdp_per_capita',
    'Access to electricity (% of population)',
    'Renewable energy share in the total final energy consumption (%)',
    'Primary energy consumption per capita (kWh/person)',
]

# Per-entity mean of each indicator, computed from training years only
# (Year < SPLIT_YEAR) so test-period rows do not influence the clusters.
train_rows = df_common[df_common['Year'] < SPLIT_YEAR]
entity_means = train_rows.groupby('Entity')[cluster_cols].mean()
# Entities with a missing mean in any indicator are left out of the profile.
df_profile = entity_means.dropna()

# Standardise the profile columns before clustering.
scaler_cluster = StandardScaler()
X_cluster = scaler_cluster.fit_transform(df_profile)

# K-Means with K=3 and a fixed seed.
# NOTE(review): n_init is left at the library default, which is believed to
# differ between scikit-learn releases - confirm the environment matches the
# one used for Phase 3 v1 before comparing results.
kmeans = KMeans(n_clusters=3, random_state=42)
df_profile['Cluster'] = kmeans.fit_predict(X_cluster)

# Attach each row's cluster via its entity name (looked up in the common table
# by index label). Entities absent from the profile map to NaN and those rows
# are dropped from the model tables.
entity_to_cluster = df_profile['Cluster']
for _frame in (df_lr, df_xgb):
    _frame['Cluster'] = df_common.loc[_frame.index, 'Entity'].map(entity_to_cluster)
for _frame in (df_lr, df_xgb):
    _frame.dropna(subset=['Cluster'], inplace=True)

In [3]:
# Collects one dict per (algorithm, cluster) with keys 'Algorithm', 'Cluster'
# and 'R2', plus one 'Weighted_Avg' row per algorithm; written to CSV at the
# end of this cell.
results = []

def eval_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
            called on the instance that is passed in.
        X_train: Training features.
        y_train: Training target values.
        X_test: Test features.
        y_test: Test target values.

    Returns:
        The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return score

def prepare(df, split_year=None, target=None):
    """Split ``df`` chronologically into train/test features and targets.

    Rows with ``Year < split_year`` form the training set and rows with
    ``Year >= split_year`` form the test set. The target column, 'Year' and
    'Cluster' are removed from the feature frames when they are present.

    Args:
        df: DataFrame containing a 'Year' column and the target column.
        split_year: First year of the test period. Defaults to the
            module-level ``SPLIT_YEAR`` when omitted.
        target: Name of the target column. Defaults to the module-level
            ``TARGET`` when omitted.

    Returns:
        A 5-tuple ``(X_train, y_train, X_test, y_test, n_test)`` where
        ``n_test`` is the number of test rows.

    Raises:
        KeyError: If 'Year' or the target column is missing from ``df``.
    """
    # Fall back to the notebook-wide settings so existing one-argument calls
    # behave exactly as before.
    if split_year is None:
        split_year = SPLIT_YEAR
    if target is None:
        target = TARGET

    train = df[df['Year'] < split_year]
    test = df[df['Year'] >= split_year]

    # Only drop columns that actually exist so DataFrame.drop does not raise.
    drop_cols = [c for c in (target, 'Year', 'Cluster') if c in df.columns]

    return (
        train.drop(columns=drop_cols),
        train[target],
        test.drop(columns=drop_cols),
        test[target],
        len(test),
    )

def _run_tuned_per_cluster(df, short_name, params, make_model):
    """Train and score one tuned model per cluster and record the scores.

    For every cluster value in ``df['Cluster']`` the cluster's rows are split
    with ``prepare``, a fresh model from ``make_model`` is fitted and scored
    with ``eval_model``, and a row is appended to the module-level ``results``
    list. A final 'Weighted_Avg' row is appended when at least one cluster was
    evaluated.

    Args:
        df: Prepared table with a 'Cluster' column.
        short_name: Algorithm tag used in printed messages and result rows
            (e.g. 'LR' or 'XGB').
        params: Hyperparameter dict, used only for the printed banner.
        make_model: Zero-argument callable returning a new, unfitted model.

    Returns:
        ``(weighted_sum, total_n, weighted_avg)``: the sum of ``R2 * n_test``
        over evaluated clusters, the total number of test rows in those
        clusters, and their ratio (``None`` when no cluster was evaluated).
    """
    print(f"\nrunning {short_name} with Params: {params}")
    label = f"{short_name} (Tuned)"
    weighted_sum = 0
    total_n = 0

    for c in sorted(df['Cluster'].unique()):
        data_c = df[df['Cluster'] == c].copy()
        X_tr, y_tr, X_te, y_te, n = prepare(data_c)
        if n == 0:
            # Nothing to score for this cluster.
            continue
        if len(X_tr) == 0:
            # Fitting on an empty training set would raise; skip the cluster
            # and say so, so the gap in the results is visible.
            print(f"Skipping {short_name} cluster {int(c)}: no training rows.")
            continue
        r2 = eval_model(make_model(), X_tr, y_tr, X_te, y_te)
        weighted_sum += r2 * n
        total_n += n
        results.append({'Algorithm': label, 'Cluster': int(c), 'R2': r2})

    if total_n == 0:
        return weighted_sum, total_n, None

    # Test-size-weighted mean of the per-cluster R2 values. This is not the
    # same quantity as the R2 of all test predictions pooled together.
    weighted_avg = weighted_sum / total_n
    results.append({'Algorithm': label, 'Cluster': 'Weighted_Avg', 'R2': weighted_avg})
    print(f"Tuned {short_name} Weighted R2: {weighted_avg:.4f}")
    return weighted_sum, total_n, weighted_avg


# --- 1. Tuned Linear Regression ---
params_lr = best_params.get('Linear Regression', {})
weighted_r2_lr, total_n_lr, avg_lr = _run_tuned_per_cluster(
    df_lr, 'LR', params_lr, lambda: Ridge(**params_lr)
)

# --- 2. Tuned XGBoost ---
params_xgb = best_params.get('XGBoost', {})
weighted_r2_xgb, total_n_xgb, avg_xgb = _run_tuned_per_cluster(
    df_xgb,
    'XGB',
    params_xgb,
    lambda: XGBRegressor(**params_xgb, random_state=42, n_jobs=-1),
)

# Save Results
pd.DataFrame(results).to_csv('../data/results/phase3_v2_tuned_results.csv', index=False)
print("Saved Phase 3 v2 results.")


running LR with Params: {'alpha': 10.0}


Tuned LR Weighted R2: 0.9608

running XGB with Params: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}


Tuned XGB Weighted R2: 0.7985
Saved Phase 3 v2 results.
