In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

# Import data

In [2]:
gdpnow = pd.read_csv('gdpnow_daily_gdp_interp.csv', index_col='Unnamed: 0', parse_dates=True) #date_parser=dateparse)
# Ensure that load_df index is in the same date format
gdpnow.index = pd.to_datetime(gdpnow.index)

## Let's define X and y

In [3]:
Target = 'Final_GDP_Interp'
Drop = ['GDP Nowcast', 'Final_GDP_Interp', 'Quarter being forecasted', 'Advance Estimate From BEA', 'Publication Date of Advance Estimate',
       'Days until advance estimate', 'Forecast Error', 'Data releases']

# How to deal with all the NAs ?

From here on lets try two different datasets as in how to work with the NAs:
1. ffil
2. dropping NAs

In [4]:
gdpnow_filled = gdpnow.fillna(method='ffill')
gdpnow_dropped = gdpnow.dropna(axis=0, thresh=34)

In [5]:
gdpnow_filled.shape, gdpnow_dropped.shape

((2537, 36), (1496, 36))

# Defining X and y

We now need the X and Y for the three different data sets

In [21]:
# defining X and y for all three data sets (initial, dropped and filled)
X = gdpnow.drop(columns=Drop)
y = gdpnow[Target]

X_filled = gdpnow_filled.drop(columns=Drop)
y_filled = gdpnow_filled[Target]

X_dropped = gdpnow_dropped.drop(columns=Drop)
y_dropped = gdpnow_dropped[Target]

assert(X.shape[0]==y.shape[0])
assert(X_filled.shape[0]==y_filled.shape[0])
assert(X_dropped.shape[0]==y_dropped.shape[0])

In [22]:
# Splitting into test and train sets
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20)

X_filled_train, X_filled_test, y_filled_train, y_filled_test = \
    train_test_split(X_filled, y_filled, test_size=0.20)

X_dropped_train, X_dropped_test, y_dropped_train, y_dropped_test = \
    train_test_split(X_dropped, y_dropped, test_size=0.20)


# Pipeline

In [59]:
preproc = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

preproc_selector = Pipeline([
    ('preprocessing', preproc),  # Include the preprocessing steps with PCA
    ('feature_selection', SelectPercentile(
        mutual_info_regression,
        percentile=90 # Keep 90% of all features
    ))
])

# Modeling

In [45]:
#Modeling with initial dataset

model = XGBRegressor(random_state=42)

param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': [0.1],
    'model__max_depth': [16, 20],
}

pipe = Pipeline([
    ('preprocessor', preproc),
    ('model', model)
])

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
    scoring='r2',  # Assuming MSE is the metric of interest; adjust as needed.
    cv=5,
    verbose=2,
    #random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.8s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   3.0s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   3.1s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   3.1s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.1s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.6s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.7s


In [46]:
random_search.score(X_test, y_test)

0.6881729311142857

In [71]:
#Modeling with filled dataset

model = XGBRegressor(random_state=42)

param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': stats.norm(0.1, 2),
    'model__max_depth': [16, 20],
}

pipe = Pipeline([
    ('preprocessor', preproc),
    ('model', model)
])

random_search_filled = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
    scoring='neg_mean_absolute_error',  # Assuming MSE is the metric of interest; adjust as needed.
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search_filled.fit(X_filled_train, y_filled_train)

print("Best parameters found: ", random_search_filled.best_params_)
print("Best score found: ", random_search_filled.best_score_)

NameError: name 'stats' is not defined

In [55]:
random_search_filled.score(X_filled_test, y_filled_test)

-0.18529869131963153

In [74]:
#Modeling with 'dropped' dataset

model = XGBRegressor(random_state=42)

param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': [0.1],
    'model__max_depth': [16, 20],
}

pipe = Pipeline([
    ('preprocessor', preproc),
    ('model', model)
])

random_search_dropped = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
    scoring='neg_mean_absolute_error',  # Assuming MSE is the metric of interest; adjust as needed.
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search_dropped.fit(X_dropped_train, y_dropped_train)

print("Best parameters found: ", random_search_dropped.best_params_)
print("Best score found: ", random_search_dropped.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.2s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.3s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.4s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.5s
[CV] END model__learning_rate=0.1, model__max_depth=16, model__n_estimators=200; total time=   2.6s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.6s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.7s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   2.7s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.5s
[CV] END model__learning_rate=0.1, model__max_depth=20, model__n_estimators=200; total time=   1.5s


In [57]:
random_search_dropped.score(X_dropped_test, y_dropped_test)

-0.328145426030583

In [76]:
y_dropped_pred = random_search_dropped.predict(X_dropped_test)

In [77]:
from sklearn.metrics import mean_absolute_error, r2_score

# Calculate evaluation metrics
mae_train = mean_absolute_error(y_dropped_train, random_search_dropped.predict(X_dropped_train))
mae_test = mean_absolute_error(y_dropped_test, random_search_dropped.predict(X_dropped_test))
r2_train = r2_score(y_dropped_train, random_search_dropped.predict(X_dropped_train))
r2_test = r2_score(y_dropped_test, random_search_dropped.predict(X_dropped_test))

print("Train MAE:", mae_train)
print("Test MAE:", mae_test)
print("Train R^2:", r2_train)
print("Test R^2:", r2_test)

Train MAE: 0.0007511268750933231
Test MAE: 0.328145426030583
Train R^2: 0.9999999417227854
Test R^2: 0.9004843275268284


# Finetuning

### Using a feature selector

We have shown that either filling in or dropping gives way more promising results than just imputing

Thus, we will focus on these approaches

In [73]:
#Modeling with filled dataset

model = XGBRegressor(random_state=42)

param_distributions = {
    'model__n_estimators': [200],
    'model__learning_rate': stats.norm(0.1 , 0.1),
    'model__max_depth': [16, 20],
     'preprocessor_selector__feature_selection__percentile': [60, 70, 80, 90],
}

pipe = Pipeline([
    ('preprocessor_selector', preproc_selector),
    ('model', model)
])

random_search_filled = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
    scoring='neg_mean_absolute_error',  # Assuming MSE is the metric of interest; adjust as needed.
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

random_search_filled.fit(X_filled_train, y_filled_train)

print("Best parameters found: ", random_search_filled.best_params_)
print("Best score found: ", random_search_filled.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END model__learning_rate=0.14967141530112327, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   1.7s
[CV] END model__learning_rate=0.14967141530112327, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   1.9s
[CV] END model__learning_rate=0.14967141530112327, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   1.9s
[CV] END model__learning_rate=0.14967141530112327, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   1.9s
[CV] END model__learning_rate=0.14967141530112327, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   2.2s
[CV] END model__learning_rate=0.08617356988288154, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selecti



[CV] END model__learning_rate=0.05208257621547101, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=70; total time=   2.7s
[CV] END model__learning_rate=0.06907876241487855, model__max_depth=20, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   3.8s
[CV] END model__learning_rate=0.06907876241487855, model__max_depth=20, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=90; total time=   4.4s
[CV] END model__learning_rate=0.05208257621547101, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=70; total time=   2.4s
[CV] END model__learning_rate=0.05208257621547101, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selection__percentile=70; total time=   2.6s
[CV] END model__learning_rate=0.1812525822394198, model__max_depth=16, model__n_estimators=200, preprocessor_selector__feature_selectio

55 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/die_gregette/.pyenv/versions/3.10.6/envs/project-btm/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/die_gregette/.pyenv/versions/3.10.6/envs/project-btm/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/die_gregette/.pyenv/versions/3.10.6/envs/project-btm/lib/python3.10/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_para

Best parameters found:  {'model__learning_rate': 0.2564643655814006, 'model__max_depth': 16, 'model__n_estimators': 200, 'preprocessor_selector__feature_selection__percentile': 70}
Best score found:  -0.16224548873478714


In [67]:
pipe.get_params()

{'memory': None,
 'steps': [('preprocessor_selector',
   Pipeline(steps=[('preprocessing',
                    Pipeline(steps=[('imputer', KNNImputer()),
                                    ('scaler', RobustScaler())])),
                   ('feature_selection',
                    SelectPercentile(percentile=80,
                                     score_func=<function mutual_info_regression at 0x1514a2170>))])),
  ('model',
   XGBRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
   