In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the GDPNow CSV, using its 'Unnamed: 0' column as the index and asking
# pandas to parse that index as dates.
gdpnow = pd.read_csv(
    'gdpnow_daily_gdp_interp.csv',
    index_col='Unnamed: 0',
    parse_dates=True,
)
# Explicitly coerce the index to datetimes so it is a DatetimeIndex even if
# the parse above left it as plain strings.
gdpnow.index = pd.to_datetime(gdpnow.index)

In [3]:
# Name of the column the models are trained to predict.
Target = 'Final_GDP_Interp'

# Columns removed from the feature matrix; the target itself is one of them.
Drop = [
    'GDP Nowcast',
    Target,
    'Quarter being forecasted',
    'Advance Estimate From BEA',
    'Publication Date of Advance Estimate',
    'Days until advance estimate',
    'Forecast Error',
    'Data releases',
]

In [4]:
# From here on, let's try two different ways of handling the NAs:
# 1. forward-filling (ffill)
# 2. dropping rows that have too many NAs

# DataFrame.ffill() gives the same result as the deprecated
# fillna(method='ffill'), which emits a FutureWarning on pandas 2.x.
gdpnow_filled = gdpnow.ffill()
# thresh=34: keep only the rows that have at least 34 non-NA values.
gdpnow_dropped = gdpnow.dropna(axis=0, thresh=34)

In [5]:
# Compare (rows, columns) of the two variants to see how many rows the NA drop removed.
gdpnow_filled.shape, gdpnow_dropped.shape

((2537, 36), (1496, 36))

In [6]:
# Build the feature matrix (X) and the target vector (y) for each NA-handling
# variant: X is every column not listed in `Drop`, y is the `Target` column.
X_filled, y_filled = gdpnow_filled.drop(columns=Drop), gdpnow_filled[Target]
X_dropped, y_dropped = gdpnow_dropped.drop(columns=Drop), gdpnow_dropped[Target]

# Sanity check: features and target must have the same number of rows.
assert len(X_filled) == len(y_filled)
assert len(X_dropped) == len(y_dropped)

In [7]:
# Splitting into test and train sets (80% train / 20% test).
# random_state is fixed so the split -- and therefore every score reported
# below -- is reproducible under Restart Kernel -> Run All.
# NOTE(review): the rows are indexed by date and the split is shuffled; if
# neighbouring days are strongly correlated this can leak information into the
# test set. Confirm whether a chronological split (shuffle=False) is intended.

X_filled_train, X_filled_test, y_filled_train, y_filled_test = train_test_split(
    X_filled, y_filled, test_size=0.20, random_state=42
)

X_dropped_train, X_dropped_test, y_dropped_train, y_dropped_test = train_test_split(
    X_dropped, y_dropped, test_size=0.20, random_state=42
)


In [8]:
# Base preprocessing: impute missing values with a 5-neighbour KNN imputer,
# then scale the features with RobustScaler.
preproc = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler()),
])

# Preprocessing followed by univariate feature selection (there is no PCA step).
# Features are ranked by mutual information with the target and the top 90%
# are kept. This pipeline is not used by the model cells shown in this section.
preproc_selector = Pipeline(steps=[
    ('preprocessing', preproc),
    ('feature_selection', SelectPercentile(
        score_func=mutual_info_regression,
        percentile=90,
    )),
])

In [12]:
# Hyper-parameter search for an XGBoost regressor on the forward-filled data.
model = XGBRegressor(random_state=42)

# Every entry is a finite list, so the search space is a small discrete grid
# (1 x 1 x 2 = 2 combinations).
param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': [0.1],
    'model__max_depth': [16, 20],
}

# Number of distinct combinations in the grid above. Requesting more
# iterations than this only makes scikit-learn emit a warning before it falls
# back to evaluating the whole grid, so n_iter is capped at the grid size.
n_candidates = 1
for values in param_distributions.values():
    n_candidates *= len(values)

pipe = Pipeline([
    ('preprocessor', preproc),
    ('model', model)
])

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=min(100, n_candidates),  # Number of parameter settings sampled (at most 100).
    scoring='neg_mean_absolute_error',  # Negated MAE: values closer to 0 are better.
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search.fit(X_filled_train, y_filled_train)

# best_score_ is the mean cross-validated *negative* MAE of the best candidate.
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)



Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.6s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.7s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.4s
[CV] END model__learning_rate=0.1, model

In [13]:
# Hyper-parameter search for an XGBoost regressor on the NA-dropped data.
# NOTE(review): this cell rebinds `model`, `pipe` and `random_search`, so the
# search fitted on the forward-filled data is no longer reachable afterwards.
model = XGBRegressor(random_state=42)

# Every entry is a finite list, so the search space is a small discrete grid
# (1 x 1 x 2 = 2 combinations).
param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': [0.1],
    'model__max_depth': [16, 20],
}

# Number of distinct combinations in the grid above. Requesting more
# iterations than this only makes scikit-learn emit a warning before it falls
# back to evaluating the whole grid, so n_iter is capped at the grid size.
n_candidates = 1
for values in param_distributions.values():
    n_candidates *= len(values)

pipe = Pipeline([
    ('preprocessor', preproc),
    ('model', model)
])

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=min(100, n_candidates),  # Number of parameter settings sampled (at most 100).
    scoring='neg_mean_absolute_error',  # Negated MAE: values closer to 0 are better.
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search.fit(X_dropped_train, y_dropped_train)

# best_score_ is the mean cross-validated *negative* MAE of the best candidate.
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)



Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.4s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.4s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.4s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.5s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.6s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.7s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.4s
[CV] END model__learning_rate=0.1, model