# 4. Advanced Models - Gold Price Forecasting

**Models:** SVR, AdaBoost, Extra Trees, Bagging Regressor

**References:**
- Drucker et al. (1997) - SVR
- Freund & Schapire (1997) - AdaBoost
- Geurts et al. (2006) - Extra Trees
- Breiman (1996) - Bagging

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

print('Libraries imported successfully!')

In [None]:
# Load the preprocessed feature/target arrays and the matching train/test tables.
PROCESSED_DIR = '../data/processed'

# Feature matrices and target vectors saved as .npy files
X_train = np.load(f'{PROCESSED_DIR}/X_train_scaled.npy')
X_test = np.load(f'{PROCESSED_DIR}/X_test_scaled.npy')
y_train = np.load(f'{PROCESSED_DIR}/y_train.npy')
y_test = np.load(f'{PROCESSED_DIR}/y_test.npy')

# Tabular versions of the splits; the 'Date' column is parsed as datetimes
train_data = pd.read_csv(f'{PROCESSED_DIR}/train_data.csv', parse_dates=['Date'])
test_data = pd.read_csv(f'{PROCESSED_DIR}/test_data.csv', parse_dates=['Date'])

# Report the shapes of both feature matrices as a quick sanity check
for split_label, split_features in (('Training set', X_train), ('Test set', X_test)):
    print(f'{split_label}: {split_features.shape}')

In [None]:
# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Compute, print and return regression metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, one per entry of ``y_true``.
    model_name : str
        Label used in the printed summary and stored under ``'Model'``.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'RMSE'``, ``'MAE'``, ``'MAPE'`` (in percent) and
        ``'R2'``. ``'MAPE'`` is averaged over the non-zero targets only and is
        ``nan`` when every target is zero.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Flatten both inputs before the hand-written MAPE: subtracting a
    # column vector (n, 1) from a 1-D vector (n,) would otherwise broadcast to
    # an (n, n) matrix and silently produce a meaningless percentage.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # A zero target makes the percentage error undefined (division by zero),
    # so those entries are left out of the average instead of yielding inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    print(f'{model_name}: RMSE=${rmse:.2f}, MAE=${mae:.2f}, MAPE={mape:.2f}%, R2={r2:.4f}')
    return {'Model': model_name, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2}

# Accumulates one metrics dict per model (the return value of evaluate_model)
results = []
# Shared 5-split time-series cross-validator, passed as `cv` to each GridSearchCV
tscv = TimeSeriesSplit(n_splits=5)

## 4.1 SVR (Support Vector Regression)

SVR uses kernel functions to implicitly map the inputs into a higher-dimensional feature space, where it fits a regression function that tolerates errors smaller than a margin ε.

In [None]:
print('Training SVR...')

# Hyperparameter search runs on the first 3000 training rows only,
# because SVR is slow on large datasets.
svr_params = {
    'C': [1, 10, 100],
    'kernel': ['rbf'],
    'epsilon': [0.1, 0.2],
}
svr_search = GridSearchCV(
    estimator=SVR(),
    param_grid=svr_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
svr_search.fit(X_train[:3000], y_train[:3000])

print(f'Best SVR params: {svr_search.best_params_}')

# Build a fresh SVR with the winning parameters and fit it on all training rows
svr_model = SVR(**svr_search.best_params_)
svr_model.fit(X_train, y_train)

# Score the final model on the held-out test set and record its metrics
y_pred_svr = svr_model.predict(X_test)
svr_metrics = evaluate_model(y_test, y_pred_svr, 'SVR')
results.append(svr_metrics)

## 4.2 AdaBoost Regressor

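AdaBoost fits weak learners sequentially, re-weighting the training samples so that each new learner focuses on the examples the previous ones predicted poorly, and then combines their predictions.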

In [None]:
print('Training AdaBoost...')

# Candidate ensemble sizes and learning rates, tuned on the full training set
ada_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
}
ada_search = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=42),
    param_grid=ada_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
ada_search.fit(X_train, y_train)

print(f'Best AdaBoost params: {ada_search.best_params_}')

# Keep the best estimator found by the search and score it on the test set
ada_model = ada_search.best_estimator_
y_pred_ada = ada_model.predict(X_test)
ada_metrics = evaluate_model(y_test, y_pred_ada, 'AdaBoost')
results.append(ada_metrics)

## 4.3 Extra Trees Regressor

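Extra Trees (Extremely Randomized Trees) adds more randomization than Random Forest by drawing split thresholds at random instead of searching for the optimal threshold of each candidate feature.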

In [None]:
print('Training Extra Trees...')

# Grid over forest size, tree depth (None = unlimited) and split threshold
et_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
et_search = GridSearchCV(
    estimator=ExtraTreesRegressor(random_state=42),
    param_grid=et_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
et_search.fit(X_train, y_train)

print(f'Best Extra Trees params: {et_search.best_params_}')

# Keep the best estimator found by the search and score it on the test set
et_model = et_search.best_estimator_
y_pred_et = et_model.predict(X_test)
et_metrics = evaluate_model(y_test, y_pred_et, 'Extra Trees')
results.append(et_metrics)

## 4.4 Bagging Regressor

Bagging reduces variance by training multiple models on bootstrap samples of the training data and averaging their predictions.

In [None]:
print('Training Bagging Regressor...')

# Grid over the number of base estimators and the fraction of rows each one sees
bag_params = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.8, 1.0],
}
bag_search = GridSearchCV(
    estimator=BaggingRegressor(random_state=42),
    param_grid=bag_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
bag_search.fit(X_train, y_train)

print(f'Best Bagging params: {bag_search.best_params_}')

# Keep the best estimator found by the search and score it on the test set
bag_model = bag_search.best_estimator_
y_pred_bag = bag_model.predict(X_test)
bag_metrics = evaluate_model(y_test, y_pred_bag, 'Bagging')
results.append(bag_metrics)

## 4.5 Results Comparison

In [None]:
# Build the comparison table, ordered from lowest to highest RMSE
results_df = pd.DataFrame(results).sort_values(by='RMSE')

# Print the table framed by 60-character rules
banner_rule = '=' * 60
print('\n' + banner_rule)
print('ADVANCED MODELS RESULTS')
print(banner_rule)
print(results_df.to_string(index=False))
print(banner_rule)

In [None]:
# Visualization: one horizontal bar chart per metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (metric column, bar colour, x-axis label, panel title), listed in row-major
# panel order: top-left, top-right, bottom-left, bottom-right
metric_panels = (
    ('RMSE', 'steelblue', 'RMSE ($)', 'RMSE by Model'),
    ('MAE', 'darkgreen', 'MAE ($)', 'MAE by Model'),
    ('MAPE', 'darkorange', 'MAPE (%)', 'MAPE by Model'),
    ('R2', 'purple', 'R2 Score', 'R2 by Model'),
)
for panel_ax, (metric, bar_color, x_label, panel_title) in zip(axes.flat, metric_panels):
    panel_ax.barh(results_df['Model'], results_df[metric], color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
# Write the figure to disk before showing it
plt.savefig(
    '../reports/figures/model_comparison_advanced.png',
    dpi=150,
    bbox_inches='tight',
)
plt.show()

## 4.6 Save Models

In [None]:
# Save each fitted model to its own pickle file under ../models/
model_files = (
    (svr_model, '../models/svr.pkl'),
    (ada_model, '../models/adaboost.pkl'),
    (et_model, '../models/extra_trees.pkl'),
    (bag_model, '../models/bagging.pkl'),
)
for fitted_model, pickle_path in model_files:
    joblib.dump(fitted_model, pickle_path)

# Save the metrics table next to the other reports
results_df.to_csv('../reports/advanced_results.csv', index=False)

print('All models saved!')

# results_df is sorted by ascending RMSE, so its first row is the best model
best_row = results_df.iloc[0]
print(f'\nBest Advanced Model: {best_row["Model"]}')
print(f'Best RMSE: ${best_row["RMSE"]:.2f}')