In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time

from sklearn.model_selection import  cross_validate, learning_curve, GridSearchCV, TimeSeriesSplit, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 

In [4]:
#Preprocessing 
DATA_PATH = r"index.csv" 
df = pd.read_csv(DATA_PATH, sep=",")

df.insert(0, "Date", pd.to_datetime(df[["Year", "Month", "Day"]]))      # create datetime column
df.set_index(["Date"], inplace=True)                                    # set index to date, easy to handle 
df.drop(columns=["Year", "Month", "Day"], inplace=True)                 # deop redundant columns
df.drop(columns=["Federal Funds Target Rate", "Federal Funds Upper Target", "Federal Funds Lower Target"], inplace=True)        # these columns will not be used

df.drop(df.loc["2017-01-01":].index, inplace=True)

df = df.dropna(subset=['Inflation Rate']) #Drop NaN values (includes mid month entries) 

df['Real GDP (Percent Change)'] = df['Real GDP (Percent Change)'].ffill() #Forward fill GDP instead of going quaterly

#Add lag fetures to input variables
lags = [1, 2, 3]
col = ["Effective Federal Funds Rate", "Inflation Rate", "Unemployment Rate", "Real GDP (Percent Change)"]

for i in col:
    if i not in df.columns:
        raise KeyError(f"Column missing: {i}")
    for j in lags:
        df[f"{i} lag{j}"] = df[i].shift(j)

df = df.dropna(subset=[f"{i} lag{j}" for i in col for j in lags]) # remove rows with any NaNs created by shifting

targ_df = pd.DataFrame(df["Real GDP (Percent Change)"]) #create target 

X = df.drop(columns=["Real GDP (Percent Change)"])
y = targ_df

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print(df.shape)
df.head()

(705, 16)


Unnamed: 0_level_0,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,Effective Federal Funds Rate lag1,Effective Federal Funds Rate lag2,Effective Federal Funds Rate lag3,Inflation Rate lag1,Inflation Rate lag2,Inflation Rate lag3,Unemployment Rate lag1,Unemployment Rate lag2,Unemployment Rate lag3,Real GDP (Percent Change) lag1,Real GDP (Percent Change) lag2,Real GDP (Percent Change) lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1958-04-01,1.26,2.6,7.4,2.4,1.2,1.67,2.72,2.8,3.2,3.2,6.7,6.4,5.8,-10.0,-10.0,-10.0
1958-05-01,0.63,2.6,7.4,2.4,1.26,1.2,1.67,2.4,2.8,3.2,7.4,6.7,6.4,2.6,-10.0,-10.0
1958-06-01,0.93,2.6,7.3,2.1,0.63,1.26,1.2,2.4,2.4,2.8,7.4,7.4,6.7,2.6,2.6,-10.0
1958-07-01,0.68,9.6,7.5,2.4,0.93,0.63,1.26,2.1,2.4,2.4,7.3,7.4,7.4,2.6,2.6,2.6
1958-08-01,1.53,9.6,7.4,2.1,0.68,0.93,0.63,2.4,2.1,2.4,7.5,7.3,7.4,9.6,2.6,2.6


In [7]:
X_train2 = X_train.drop(columns=[c for c in X_train.columns if "GDP" in c])

abs(X_train.corrwith(y_train["Real GDP (Percent Change)"])).sort_values(ascending=False)
rand=42

In [8]:
_ = StandardScaler().fit_transform(X_train)
pca = PCA(n_components=0.95, random_state=rand)     # PCs explain 95% of the variance 
X_pca = pca.fit_transform(_)
X_pca.shape
names = pca.get_feature_names_out()
X_pca_df = pd.DataFrame({names[0] : X_pca[:,0], 
                         names[1] : X_pca[:,1], 
                         names[2] : X_pca[:,2], 
                         names[3] : X_pca[:,3], 
                         names[4] : X_pca[:,4]})
X_pca_df.set_index(X_train.index, inplace=True)
X_pca_df.head(5)

Unnamed: 0_level_0,pca0,pca1,pca2,pca3,pca4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1958-04-01,-0.431238,0.104763,-7.048191,-0.455776,0.306868
1958-05-01,-0.928656,1.395528,-5.534547,0.21636,-2.093056
1958-06-01,-1.440439,2.291753,-3.613657,0.538588,-2.046057
1958-07-01,-1.832944,2.931455,-1.82981,0.470061,0.342342
1958-08-01,-2.087232,3.295746,-0.870203,0.78626,-1.030927


In [9]:
pipe_PCA = Pipeline([("PCA", PCA(n_components=0.95, random_state=rand)),("model", RandomForestRegressor(n_estimators=100, random_state=rand))])

cv_reg = cross_validate(
    pipe_PCA, X_train, y_train["Real GDP (Percent Change)"], cv=tss, n_jobs=-1,
    scoring=("r2", "neg_mean_squared_error", "neg_mean_absolute_error"),
    return_train_score=True)


print(f"CV Mean R2 (train):  {cv_reg['train_r2'].mean():.3f} ± {cv_reg['train_r2'].std():.3f}")
print(f"CV Mean R2 (val):    {cv_reg['test_r2'].mean():.3f}  ± {cv_reg['test_r2'].std():.3f}")
print(f"CV Mean RMSE (val):  {np.mean(np.sqrt(-cv_reg['test_neg_mean_squared_error'])):.3f}")
print(f"CV Mean MAE (val):   {-cv_reg['test_neg_mean_absolute_error'].mean():.3f}")

NameError: name 'tss' is not defined

In [None]:
pipe = Pipeline([("model", RandomForestRegressor(n_estimators=100, random_state=rand))])

cv_reg = cross_validate(
    pipe, X_train, y_train["Real GDP (Percent Change)"], cv=tss, n_jobs=-1,
    scoring=("r2", "neg_mean_squared_error", "neg_mean_absolute_error"),
    return_train_score=True)


print(f"CV Mean R2 (train):  {cv_reg['train_r2'].mean():.3f} ± {cv_reg['train_r2'].std():.3f}")
print(f"CV Mean R2 (val):    {cv_reg['test_r2'].mean():.3f}  ± {cv_reg['test_r2'].std():.3f}")
print(f"CV Mean RMSE (val):  {np.mean(np.sqrt(-cv_reg['test_neg_mean_squared_error'])):.3f}")
print(f"CV Mean MAE (val):   {-cv_reg['test_neg_mean_absolute_error'].mean():.3f}")

CV Mean R2 (train):  0.942 ± 0.002
CV Mean R2 (val):    0.392  ± 0.156
CV Mean RMSE (val):  2.453
CV Mean MAE (val):   1.755


In [None]:
pipe = Pipeline([("model", DecisionTreeRegressor(max_depth=2, random_state=rand))])

cv_reg = cross_validate(
    pipe, X_train, y_train["Real GDP (Percent Change)"], cv=tss, n_jobs=-1,
    scoring=("r2", "neg_mean_squared_error", "neg_mean_absolute_error"),
    return_train_score=True)


print(f"CV Mean R2 (train):  {cv_reg['train_r2'].mean():.3f} ± {cv_reg['train_r2'].std():.3f}")
print(f"CV Mean R2 (val):    {cv_reg['test_r2'].mean():.3f}  ± {cv_reg['test_r2'].std():.3f}")
print(f"CV Mean RMSE (val):  {np.mean(np.sqrt(-cv_reg['test_neg_mean_squared_error'])):.3f}")
print(f"CV Mean MAE (val):   {-cv_reg['test_neg_mean_absolute_error'].mean():.3f}")

CV Mean R2 (train):  0.544 ± 0.014
CV Mean R2 (val):    0.434  ± 0.130
CV Mean RMSE (val):  2.403
CV Mean MAE (val):   1.651
