# Notebook 9: Phase 5 - Real-World Evaluation (2020-2023)

**Goal**: Test the **Cluster-Based Linear Regression (Tuned)** model on fresh, unseen data from 2020-2023 fetched directly from the World Bank API.
**Challenge**: WB data has lag (CO2 often stops at 2020). We will check what is available and use GDP/Energy inputs to predict CO2, then compare where possible.

In [1]:
import wbdata
import pandas as pd
import numpy as np
import datetime
import json
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from sklearn.metrics import r2_score, mean_absolute_error

# Add src to path
sys.path.append(os.path.abspath(os.path.join('../src')))
from preprocessing import load_data

TARGET = 'Value_co2_emissions_kt_by_country'

# 1. Define Indicators 
potential_indicators = {
    'NY.GDP.PCAP.PP.KD': 'gdp_per_capita',  
    'EG.ELC.ACCS.ZS': 'Access to electricity (% of population)', 
    'EG.FEC.RNEW.ZS': 'Renewable energy share in the total final energy consumption (%)', 
    'EG.USE.PCAP.KG.OE': 'Primary energy consumption per capita', 
    'EN.GHG.CO2.MT.CE.AR5': 'CO2_Mt_AR5', 
    'SP.POP.TOTL': 'Population'           
}

# 2. Fetch Data (2015-2023)
data_date = (datetime.datetime(2015, 1, 1), datetime.datetime(2023, 12, 31))
print("Fetching World Bank Data...")

dfs = []
successful_indicators = {}

for code, name in potential_indicators.items():
    try:
        data = wbdata.get_dataframe({code: name}, country='all', date=data_date)
        if data is not None and not data.empty:
            dfs.append(data)
            successful_indicators[code] = name
    except Exception as e:
        print(f"Failed to fetch {name} ({code}): {e}")
        try:
             data = wbdata.get_dataframe({code: name}, country='all', data_date=data_date)
             if data is not None and not data.empty:
                dfs.append(data)
                successful_indicators[code] = name
        except:
             pass

if not dfs:
    raise ValueError("No data could be fetched!")

df_wb = pd.concat(dfs, axis=1)
df_wb = df_wb.loc[:,~df_wb.columns.duplicated()]
df_wb = df_wb.reset_index().rename(columns={'country': 'Entity', 'date': 'Year'})
df_wb['Year'] = df_wb['Year'].astype(int)
df_wb = df_wb.sort_values(by=['Entity', 'Year'])

# Unit Conversions
if 'Primary energy consumption per capita' in df_wb.columns:
    df_wb['Primary energy consumption per capita (kWh/person)'] = df_wb['Primary energy consumption per capita'] * 11.63
    df_wb.drop(columns=['Primary energy consumption per capita'], inplace=True)

if 'CO2_Mt_AR5' in df_wb.columns:
    df_wb[TARGET] = df_wb['CO2_Mt_AR5'] * 1000
    df_wb.drop(columns=['CO2_Mt_AR5'], inplace=True)

# Standardize Country Names
name_map = {
    'Viet Nam': 'Vietnam',
    'Egypt, Arab Rep.': 'Egypt',
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. People\'s Rep.': 'North Korea',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
    'Slovak Republic': 'Slovakia',
    'Russian Federation': 'Russia',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Kygrgyz Republic': 'Kyrgyzstan',
    'Lao PDR': 'Laos'
}
df_wb['Entity'] = df_wb['Entity'].replace(name_map)

df_new = df_wb[df_wb['Year'] >= 2015].copy()
df_new = df_new.groupby('Entity', group_keys=False).apply(lambda x: x.ffill().bfill())
df_new.fillna(df_new.median(numeric_only=True), inplace=True)


Fetching World Bank Data...


  df_new = df_new.groupby('Entity', group_keys=False).apply(lambda x: x.ffill().bfill())


## 2. Assign Clusters

In [2]:
df_old = load_data('../data/processed/common_preprocessed.csv')

# Robust Construction of Population
# Find Density and Land Area cols regardless of exact name
density_col = next((c for c in df_old.columns if 'Density' in c), None)
land_col = next((c for c in df_old.columns if 'Land Area' in c), None)

if density_col and land_col:
    print(f"Constructing Population using {density_col} and {land_col}")
    df_old['Population'] = df_old[density_col] * df_old[land_col]
else:
    print("WARNING: Could not find Density or Land Area columns to construct Population. Scale will be wrong.")

cluster_cols_candidate = ['gdp_per_capita', 'Access to electricity (% of population)', 
                          'Renewable energy share in the total final energy consumption (%)', 
                          'Primary energy consumption per capita (kWh/person)']

cluster_cols = [c for c in cluster_cols_candidate if c in df_new.columns]

# Fit Scaler & KMeans on Old Data
df_profile_train = df_old[df_old['Year'] < 2015].groupby('Entity')[cluster_cols].mean().dropna()
scaler_cluster = StandardScaler()
X_cluster_train = scaler_cluster.fit_transform(df_profile_train)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_cluster_train)

# Predict Clusters for New Data
df_profile_new = df_new.groupby('Entity')[cluster_cols].mean().dropna()
X_cluster_new = scaler_cluster.transform(df_profile_new)
new_labels = kmeans.predict(X_cluster_new)
entity_cluster_map_new = pd.Series(new_labels, index=df_profile_new.index)

df_new['Cluster'] = df_new['Entity'].map(entity_cluster_map_new)
df_new = df_new.dropna(subset=['Cluster'])

Loaded data from ../data/processed/common_preprocessed.csv: (3473, 25)
Constructing Population using Density\n(P/Km2) and Land Area(Km2)


## 3. Retrain & Evaluate (With Population)

In [3]:
with open('../data/results/best_hyperparameters.json', 'r') as f:
    best_params = json.load(f)
params_lr = best_params.get('Linear Regression', {})

entity_cluster_map_old = pd.Series(kmeans.labels_, index=df_profile_train.index)

# Features: Cluster Cols + Population
model_features = cluster_cols[:] # Copy
if 'Population' in df_old.columns and 'Population' in df_new.columns:
     model_features.append('Population')
else:
     print("Population not available in both datasets. Model will lack scale.")

print(f"Training with features: {model_features}")

models = {}
for c in range(3):
    df_old_c = df_old[df_old['Entity'].map(entity_cluster_map_old) == c].dropna(subset=model_features + [TARGET])
    if len(df_old_c) == 0: continue
    X_train = df_old_c[model_features]
    y_train = df_old_c[TARGET]
    model = Ridge(**params_lr)
    model.fit(X_train, y_train)
    models[c] = model

preds = []
for idx, row in df_new.iterrows():
    c = row['Cluster']
    if c in models and not row[model_features].isna().any():
        X_input = row[model_features].values.reshape(1, -1)
        preds.append(models[c].predict(X_input)[0])
    else:
        preds.append(np.nan)

df_new['Predicted_CO2'] = preds

# Evaluate
if TARGET in df_new.columns:
    df_eval = df_new.dropna(subset=[TARGET, 'Predicted_CO2'])
    print(f"Eval Records: {len(df_eval)}")
    if len(df_eval) > 0:
        r2 = r2_score(df_eval[TARGET], df_eval['Predicted_CO2'])
        mae = mean_absolute_error(df_eval[TARGET], df_eval['Predicted_CO2'])
        print(f"R2 Score (Real-World WB Data): {r2:.4f}")
        print(f"MAE: {mae:.2f} kt")
        
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=df_eval[TARGET], y=df_eval['Predicted_CO2'], hue=df_eval['Cluster'])
        plt.plot([df_eval[TARGET].min(), df_eval[TARGET].max()], [df_eval[TARGET].min(), df_eval[TARGET].max()], 'r--')
        plt.title(f'Evaluation on World Bank Data (Predicted vs Real CO2)\nR2={r2:.4f}')
        plt.xlabel('Real CO2 (kt)')
        plt.ylabel('Predicted CO2 (kt)')
        plt.xscale('log'); plt.yscale('log')
        plt.savefig('../reports/figures/realworld_eval_metrics.png')
        plt.close()

result_cols = ['Year'] + model_features + ['Predicted_CO2']
if TARGET in df_new.columns:
    result_cols.append(TARGET)

print("\nPredictions for Vietnam (2020-2023):")
print(df_new[(df_new['Entity'] == 'Vietnam') & (df_new['Year'] >= 2020)][result_cols])

df_new.to_csv('../data/results/phase5_realworld_predictions_2023.csv', index=False)

Training with features: ['gdp_per_capita', 'Access to electricity (% of population)', 'Renewable energy share in the total final energy consumption (%)', 'Primary energy consumption per capita (kWh/person)', 'Population']


  return f(*arrays, *other_args, **kwargs)






Eval Records: 2394
R2 Score (Real-World WB Data): 0.8235
MAE: 561348.67 kt



Predictions for Vietnam (2020-2023):
      Year  gdp_per_capita  Access to electricity (% of population)  \
2343  2020    11851.396569                                     99.8   
2342  2021    12048.901994                                    100.0   
2341  2022    12979.763905                                    100.0   
2340  2023    13545.934662                                     99.8   

      Renewable energy share in the total final energy consumption (%)  \
2343                                               18.9                  
2342                                               24.2                  
2341                                               24.2                  
2340                                               24.2                  

      Primary energy consumption per capita (kWh/person)   Population  \
2343                                       11528.600734    98079191.0   
2342                                       11190.380474    98935098.0   
2341            