In [19]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from itertools import product
import xgboost as xgb

In [20]:
# Read the feature matrix and the targets from disk. Column index 1 of the
# feature matrix is removed right away, before anything else looks at it.
X_train_input = np.delete(np.load("Data/X_train.npy"), 1, axis=1)
Y_train_input = np.load("Data/Y_train.npy")
print("New shapes:", X_train_input.shape, Y_train_input.shape)

# Hold out 200 rows (an integer test_size is an absolute row count) as the
# final test set; the fixed seed keeps the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train_input,
    Y_train_input,
    shuffle=True,
    random_state=42,
    test_size=200,
)

# Standardise the features. The scaler learns its statistics from the training
# rows only and the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

New shapes: (700, 5) (700,)


In [21]:
# Search space for the XGBoost regressor: one list of candidate values per
# hyperparameter (4 depths x 4 learning rates x 3 ensemble sizes).
n_estimators_list = [100, 200, 300]
learning_rate_list = [0.01, 0.05, 0.1, 0.2]
max_depth_list = list(range(3, 7))  # 3, 4, 5, 6

# 5-fold splitter; shuffling uses a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Search bookkeeping: one summary record per evaluated configuration, plus the
# best configuration seen so far and its mean R2 (starts at -inf so that the
# first evaluated configuration always replaces it).
results = []
best_params = None
best_r2 = float("-inf")

In [22]:
# Exhaustive grid search with K-fold cross-validation.
#
# The search state is (re)initialised here so that re-running this cell on its
# own starts from a clean slate, instead of appending duplicate rows to
# `results` or comparing against a `best_r2` left over from an earlier run.
results = []
best_r2 = -np.inf
best_params = None

for max_depth, lr, n_estimators in product(max_depth_list, learning_rate_list, n_estimators_list):
    r2_scores = []

    # `kf` is seeded, so every configuration is scored on the same folds.
    for train_idx, val_idx in kf.split(X_train):
        # Fit the scaler on the training fold only and reuse it for the
        # validation fold. Slicing an array that was scaled with statistics of
        # the whole training set would leak validation-fold information into
        # the preprocessing step.
        fold_scaler = StandardScaler()
        X_tr = fold_scaler.fit_transform(X_train[train_idx])
        X_val = fold_scaler.transform(X_train[val_idx])
        Y_tr, Y_val = Y_train[train_idx], Y_train[val_idx]

        model = xgb.XGBRegressor(
            max_depth=max_depth,
            learning_rate=lr,
            n_estimators=n_estimators,
            objective='reg:squarederror',
            random_state=42,
            verbosity=0
        )
        model.fit(X_tr, Y_tr)

        Y_val_pred = model.predict(X_val)
        r2_scores.append(r2_score(Y_val, Y_val_pred))

    # A configuration's score is its R2 averaged over the folds.
    mean_r2 = np.mean(r2_scores)
    results.append({'max_depth': max_depth, 'learning_rate': lr, 'n_estimators': n_estimators, 'mean_r2': mean_r2})

    # Strict '>' keeps the first configuration encountered when scores tie.
    if mean_r2 > best_r2:
        best_r2 = mean_r2
        best_params = {'max_depth': max_depth, 'learning_rate': lr, 'n_estimators': n_estimators}

print("Best Cross Validation R2:", best_r2)
print("Best Hyperparameters:", best_params)

Best Cross Validation R2: 0.9749150418454399
Best Hyperparameters: {'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 300}


In [23]:
# Refit on the whole scaled training partition with the configuration that won
# the cross-validated search. `best_params` carries max_depth, learning_rate
# and n_estimators; the remaining settings match those used during the search.
final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    verbosity=0,
    random_state=42,
    **best_params,
)
final_model.fit(X_train_scaled, Y_train)

# Score the refitted model on the held-out partition, which took no part in
# fitting the scaler, the search, or this final fit.
Y_test_pred = final_model.predict(X_test_scaled)
test_r2 = r2_score(y_true=Y_test, y_pred=Y_test_pred)
print("R2 on held-out 200 test samples:", test_r2)

R2 on held-out 200 test samples: 0.9739853178480371
