# Notebook 3: Phase 0.5 - Danh gia mo hinh voi Random Split

**Muc tieu:**
- So sanh hieu suat 3 thuat toan: Linear Regression (cai dat bang Ridge, alpha=1.0), SVR, XGBoost
- Tren 2 cach tiep can du lieu: Pooled Data va Panel Data
- Su dung Random Split (80/20) de chia train/test
- Luu y: Random Split co the gay Data Leakage voi du lieu chuoi thoi gian, vi cac nam lien ke cua cung mot quoc gia co the bi chia vao ca tap train lan tap test (se so sanh voi Phase 1)

**Ket qua mong doi:** Bang so sanh 6 thi nghiem (3 models x 2 approaches)

## 1. Thiet lap moi truong va tai du lieu

In [None]:
# 1.1 Import thu vien
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from xgboost import XGBRegressor

import sys
import os

# Them src vao path
sys.path.append(os.path.abspath(os.path.join('../src')))
from evaluation import evaluate_model, compare_models
from preprocessing import load_data

# Notebook-wide configuration
pd.options.display.max_columns = 30  # show up to 30 columns when displaying a frame
RANDOM_STATE, TEST_SIZE = 42, 0.2  # shared seed and hold-out fraction (80/20 split)
TARGET = 'Value_co2_emissions_kt_by_country'  # name of the column to predict

print('Thu vien da duoc import thanh cong.')

In [None]:
# 1.2 Load the preprocessed dataset prepared for each model
# (one file each for Linear Regression, SVR and XGBoost).
df_lr, df_svr, df_xgb = (
    load_data(csv_path)
    for csv_path in (
        '../data/processed/lr_final_prep.csv',
        '../data/processed/svr_final_prep.csv',
        '../data/processed/xgb_final_prep.csv',
    )
)

In [None]:
# 1.3 Dinh nghia cac ham tach features theo Pooled/Panel approach

def get_pooled_features(df, target, has_entity_onehot=True):
    """
    Build the feature matrix and target for the pooled-data approach.

    Every observation is treated as independent, so lag features, the
    entity (country) encoding and the Year column are removed together
    with the target column.

    Args:
        df: Input frame. It is copied, so the caller's frame is not modified.
        target: Name of the target column.
        has_entity_onehot: If True, the entity encoding is taken to be every
            column whose name starts with 'Entity_'; otherwise it is the
            single 'Entity' column, when present.

    Returns:
        Tuple (X, y): the remaining feature columns and the target column.
    """
    frame = df.copy()

    def _is_entity_col(name):
        # One-hot encodings are spread over 'Entity_*' columns; otherwise the
        # encoding lives in a single 'Entity' column.
        return name.startswith('Entity_') if has_entity_onehot else name == 'Entity'

    # A column is excluded when it is a lag feature (case-insensitive 'lag'
    # substring), part of the entity encoding, the Year column or the target.
    excluded = [
        name
        for name in frame.columns
        if 'lag' in name.lower() or _is_entity_col(name) or name in ('Year', target)
    ]

    return frame.drop(columns=excluded), frame[target]


def get_panel_features(df, target, has_entity_onehot=True):
    """
    Build the feature matrix and target for the panel-data approach.

    Lag features and the entity encoding are kept so a model can use
    per-entity fixed effects and historical trends; only the Year column
    (when present) and the target column are removed.

    Args:
        df: Input frame. It is copied, so the caller's frame is not modified.
        target: Name of the target column.
        has_entity_onehot: Not used by this function; accepted so the call
            signature matches get_pooled_features.

    Returns:
        Tuple (X, y): the remaining feature columns and the target column.
    """
    frame = df.copy()

    # Only drop names that actually exist in the frame.
    removable = [name for name in ('Year', target) if name in frame.columns]
    features = frame.drop(columns=removable)

    return features, frame[target]


# Confirmation message printed after the two feature-extraction helpers above are defined.
print('Cac ham tach features da duoc dinh nghia.')

In [None]:
# 1.4 Check how many features each approach yields for every model

# Linear Regression dataset
X_lr_pooled, _ = get_pooled_features(df_lr, TARGET, has_entity_onehot=True)
X_lr_panel, _ = get_panel_features(df_lr, TARGET, has_entity_onehot=True)

# SVR dataset
X_svr_pooled, _ = get_pooled_features(df_svr, TARGET, has_entity_onehot=True)
X_svr_panel, _ = get_panel_features(df_svr, TARGET, has_entity_onehot=True)

# XGBoost dataset (called with has_entity_onehot=False)
X_xgb_pooled, _ = get_pooled_features(df_xgb, TARGET, has_entity_onehot=False)
X_xgb_panel, _ = get_panel_features(df_xgb, TARGET, has_entity_onehot=False)

# Report the column counts once all six feature matrices are built.
print(f'LR - Pooled features: {X_lr_pooled.shape[1]}, Panel features: {X_lr_panel.shape[1]}')
print(f'SVR - Pooled features: {X_svr_pooled.shape[1]}, Panel features: {X_svr_panel.shape[1]}')
print(f'XGBoost - Pooled features: {X_xgb_pooled.shape[1]}, Panel features: {X_xgb_panel.shape[1]}')

## 2. Pooled Data Approach

**Dac diem:** Coi moi quan sat la doc lap, khong su dung thong tin ve quoc gia (Entity) va lich su (Lag features).

In [None]:
# 2.1 Linear Regression - Pooled
print('=== 2.1 LINEAR REGRESSION - POOLED ===')

# Pooled feature set of the LR dataset, followed by a random hold-out split.
X, y = get_pooled_features(df_lr, TARGET, has_entity_onehot=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]}')

# The "Linear Regression" entry is fitted as a Ridge model with alpha=1.0.
model_lr_pooled = Ridge(alpha=1.0, random_state=RANDOM_STATE).fit(X_train, y_train)
y_pred = model_lr_pooled.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_lr_pooled = evaluate_model(y_test, y_pred, 'Linear Regression - Pooled')

In [None]:
# 2.2 SVR - Pooled
# Note (from the original author): SVR needs a scaled target because the CO2
# values are very large (hundreds of thousands to millions).
# TransformedTargetRegressor scales the target and inverts the scaling automatically.
print('=== 2.2 SVR - POOLED ===')

# Pooled feature set of the SVR dataset, followed by a random hold-out split.
X, y = get_pooled_features(df_svr, TARGET, has_entity_onehot=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]}')

# SVR wrapped so that the target goes through a StandardScaler.
# C=10 was chosen by the author as a balance between under- and overfitting.
model_svr_pooled = TransformedTargetRegressor(
    regressor=SVR(kernel='rbf', C=10, epsilon=0.1),
    transformer=StandardScaler(),
).fit(X_train, y_train)
y_pred = model_svr_pooled.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_svr_pooled = evaluate_model(y_test, y_pred, 'SVR - Pooled')

In [None]:
# 2.3 XGBoost - Pooled
print('=== 2.3 XGBOOST - POOLED ===')

# Pooled feature set of the XGBoost dataset, followed by a random hold-out split.
X, y = get_pooled_features(df_xgb, TARGET, has_entity_onehot=False)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]}')

# Fixed hyper-parameters (no tuning in this phase); fit silently.
model_xgb_pooled = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train, y_train, verbose=False)
y_pred = model_xgb_pooled.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_xgb_pooled = evaluate_model(y_test, y_pred, 'XGBoost - Pooled')

## 3. Panel Data Approach

**Dac diem:** Su dung Fixed Effects (Entity encoding) va Lag Features de hoc xu huong lich su cua tung quoc gia.

In [None]:
# 3.1 Linear Regression - Panel
print('=== 3.1 LINEAR REGRESSION - PANEL ===')

# Panel feature set of the LR dataset (entity and lag columns kept),
# followed by a random hold-out split.
X, y = get_panel_features(df_lr, TARGET, has_entity_onehot=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]} (bao gom Entity One-Hot va Lag features)')

# The "Linear Regression" entry is fitted as a Ridge model with alpha=1.0.
model_lr_panel = Ridge(alpha=1.0, random_state=RANDOM_STATE).fit(X_train, y_train)
y_pred = model_lr_panel.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_lr_panel = evaluate_model(y_test, y_pred, 'Linear Regression - Panel')

In [None]:
# 3.2 SVR - Panel
# Note (from the original author): SVR needs a scaled target because the CO2
# values are very large.
print('=== 3.2 SVR - PANEL ===')

# Panel feature set of the SVR dataset (entity and lag columns kept),
# followed by a random hold-out split.
X, y = get_panel_features(df_svr, TARGET, has_entity_onehot=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]} (bao gom Entity One-Hot va Lag features)')

# SVR wrapped so that the target goes through a StandardScaler.
# C=10 was chosen by the author as a balance between under- and overfitting.
model_svr_panel = TransformedTargetRegressor(
    regressor=SVR(kernel='rbf', C=10, epsilon=0.1),
    transformer=StandardScaler(),
).fit(X_train, y_train)
y_pred = model_svr_panel.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_svr_panel = evaluate_model(y_test, y_pred, 'SVR - Panel')

In [None]:
# 3.3 XGBoost - Panel
print('=== 3.3 XGBOOST - PANEL ===')

# Panel feature set of the XGBoost dataset (entity and lag columns kept),
# followed by a random hold-out split.
X, y = get_panel_features(df_xgb, TARGET, has_entity_onehot=False)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
print(f'So luong features: {X_train.shape[1]} (bao gom Entity Ordinal va Lag features)')

# Same fixed hyper-parameters as the pooled XGBoost run; fit silently.
model_xgb_panel = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train, y_train, verbose=False)
y_pred = model_xgb_panel.predict(X_test)

# Score the hold-out predictions with the project's evaluate_model helper.
result_xgb_panel = evaluate_model(y_test, y_pred, 'XGBoost - Panel')

## 4. Tong hop va so sanh ket qua

In [None]:
# 4.1 Build the combined comparison table
# Order matters here: three pooled runs first, then the three panel runs.
all_results = [
    result_lr_pooled, result_svr_pooled, result_xgb_pooled,
    result_lr_panel, result_svr_panel, result_xgb_panel,
]
df_results = compare_models(all_results)

# NOTE(review): the labels below are attached by position, which assumes
# compare_models returns one row per result in the same order as the input
# list - confirm against its implementation.
df_results['Approach'] = ['Pooled'] * 3 + ['Panel'] * 3
df_results['Algorithm'] = [
    'Linear Regression', 'SVR', 'XGBoost',
    'Linear Regression', 'SVR', 'XGBoost',
]

# Keep only the reporting columns, in presentation order.
report_columns = ['Algorithm', 'Approach', 'RMSE', 'MAE', 'R2']
df_results = df_results[report_columns]

print('\n=== BANG TONG HOP KET QUA PHASE 0.5 ===')
print(df_results.to_string(index=False))

In [None]:
# 4.2 Visualise the results: one grouped bar chart per metric
import matplotlib.pyplot as plt

metrics = ['RMSE', 'MAE', 'R2']
colors = ['#2ecc71', '#3498db']  # [pooled, panel]
x = np.arange(3)  # one slot per algorithm
width = 0.35

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, metric in zip(axes, metrics):
    # Metric values per approach, in the row order of df_results.
    pooled_vals = df_results.loc[df_results['Approach'] == 'Pooled', metric].values
    panel_vals = df_results.loc[df_results['Approach'] == 'Panel', metric].values

    # Side-by-side bars: pooled to the left of each tick, panel to the right.
    bars1 = ax.bar(x - width/2, pooled_vals, width, label='Pooled', color=colors[0])
    bars2 = ax.bar(x + width/2, panel_vals, width, label='Panel', color=colors[1])

    ax.set_title(f'So sanh {metric}')
    ax.set_xlabel('Thuat toan')
    ax.set_ylabel(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(['LR', 'SVR', 'XGBoost'])
    ax.legend()

    # Write each bar's value just above its top edge.
    for bar in [*bars1, *bars2]:
        height = bar.get_height()
        ax.annotate(
            f'{height:.2f}',
            xy=(bar.get_x() + bar.get_width()/2, height),
            xytext=(0, 3),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=8,
        )

plt.suptitle('Phase 0.5: So sanh Pooled vs Panel Data (Random Split)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# 4.3 Commentary on the results
print('\n=== NHAN XET KET QUA PHASE 0.5 ===')

# Best run = the row with the highest R2.
best_idx = df_results['R2'].idxmax()
best_model = df_results.loc[best_idx]

print(f'\n1. Model tot nhat: {best_model["Algorithm"]} - {best_model["Approach"]}')
print(f'   R2 = {best_model["R2"]:.4f}, RMSE = {best_model["RMSE"]:.2f}')

# Average R2 across the three algorithms for each approach.
pooled_r2_mean = df_results.loc[df_results['Approach'] == 'Pooled', 'R2'].mean()
panel_r2_mean = df_results.loc[df_results['Approach'] == 'Panel', 'R2'].mean()

print('\n2. So sanh Approach:')
print(f'   - Pooled R2 trung binh: {pooled_r2_mean:.4f}')
print(f'   - Panel R2 trung binh: {panel_r2_mean:.4f}')

# Panel is reported as better only when its mean R2 is strictly higher.
print(
    '   => Panel Data cho ket qua tot hon, cho thay Lag features va Entity encoding co ich.'
    if panel_r2_mean > pooled_r2_mean
    else '   => Pooled Data cho ket qua tuong duong hoac tot hon.'
)

# Reminder that the random split may leak and must be checked against Phase 1.
print('\n3. Luu y:')
print('   - Random Split co the gay Data Leakage voi du lieu chuoi thoi gian.')
print('   - Can so sanh voi Phase 1 (Time-Series Split) de danh gia chinh xac hon.')

In [None]:
# 4.4 Save the results table as CSV for use in the report
import os

results_path = '../data/results/phase05_results.csv'

# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
df_results.to_csv(results_path, index=False)
print(f'Da luu ket qua vao: {results_path}')