In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('master_data.csv', index_col=0)
df.index = pd.to_datetime(df.index)
df = df.loc['1989-08-23':] #df = df.loc['1989-07-7':]
df['mom'] = -df['mom_d'] + df['mom_u']
df = df.drop(columns=['mom_u', 'mom_d'])
df['label_day'] = ~df['decision'].isna()
df = df.fillna(method='ffill')
df = df.iloc[1:]
df.to_csv('master_data_clean.csv')

### To do...
fix the decision column

In [4]:
# define the columns
monthly = ['pce','ue','cars','house','cli','exports','rgdp','gdpd','veloc', 'ffr','mich']
labels = ['decision', 'ffr', 'change']
daily = ['spx','usd', 'loan']
categorical = ['fed_party','potus_party','recess', 'mom']

In [5]:
# union of all columns
all_cols = monthly + daily + categorical + labels
shared = set(df.columns.intersection(all_cols).to_list())
df_cols = df.columns.to_list()

# find the unique values of all_cols
unique_ourlabels = {col if col not in shared else None for col in all_cols}
unique_dflabels = {col if col not in shared else None for col in df_cols}
unique_ourlabels.remove(None)
unique_dflabels.remove(None)
display(unique_ourlabels)
display(unique_dflabels)



set()

{'label_day'}

In [6]:
# forward fill all nan values
def collapse(df, key = 'label_day', start = 7, end = 60, cat_col = [], daily_col = [], other = [], labels = []):
    # Get our different dataframes
    categorical = df[cat_col]
    daily = df[daily_col]
    monthly = df[other]
    keep = df[key]
    labels = df[labels]

    # create the windows
    event_days = df[df[key]].index

    ################## Handle Categorical and Label Data ##################
    cat_collapsed = categorical[keep]
    y_labels = labels[keep]

    ################## Handle Monthly Data ##################
    # Create event windows
    windows = []
    for day in event_days:
        # Calculate start and end days of the window
        start_day = day - pd.Timedelta(days=start)
        end_day = day - pd.Timedelta(days=end)

        # Ensure start_day is not before the start of the dataset
        if end_day < df.index[0]:
            end_day = df.index[0]

        windows.append((start_day, end_day))

    # Initialize a list to store aggregated data
    aggregated_data = []

    for (start_day, end_day) in windows:
        # Select data within the window
        window_data = monthly.loc[end_day:start_day]

        # Apply your custom aggregation function
        final_vals = window_data.loc[start_day]
        change = final_vals - window_data.iloc[0]
        # relabel the change columns to include a d_ prefix
        change = change.rename(lambda x: 'd_' + x)

        # concatenate the data horizontally
        aggregated_window = pd.concat([final_vals, change], axis=0)
        aggregated_data.append(aggregated_window)

    # Combine aggregated data
    aggregated_monthly = pd.concat(aggregated_data, axis=1).T
    aggregated_monthly.index = event_days

    ################## Handle Daily Data ##################
    # define the model
    model = linear_model.LinearRegression()
    daily_agg = []

    # Get the names of the features
    daily_feat = daily.columns
    b0_name = ['b0_' + feat for feat in daily_feat]
    b1_name = ['b1_' + feat for feat in daily_feat]

    for (start_day, end_day) in windows:
        # Select data within the window
        window_data = daily.loc[end_day:start_day]
        time = np.arange(len(window_data)).reshape(-1, 1)

        # Initialize lists to store betas for each feature
        beta0 = []
        beta1 = []

        # Loop through each feature and fit the model
        for feat in daily_feat:
            model.fit(time, window_data[feat])
            beta0.append(model.intercept_)
            beta1.append(model.coef_[0])

        # Create DataFrames for betas
        beta0_df = pd.DataFrame([beta0], columns=b0_name)
        beta1_df = pd.DataFrame([beta1], columns=b1_name)

        # concatenate the data horizontally
        aggregated_window = pd.concat([beta0_df, beta1_df], axis=1)
        daily_agg.append(aggregated_window)

    # Combine aggregated data
    aggregated_daily = pd.concat(daily_agg, axis=0)
    aggregated_daily.index = event_days

    ################## Combine Data ##################
    # Combine all data
    combined = pd.concat([cat_collapsed, aggregated_monthly, aggregated_daily], axis=1)
    return combined, y_labels

In [7]:
date_ranges = [(7,60),(2,60),(7,120),(2,120),(7,42),(2,42)]

In [8]:
# loop through the date ranges and save the data
for dates in date_ranges:
    beta_data, label_data = collapse(df, start = dates[0], end = dates[1], cat_col = categorical, daily_col = daily, other = monthly, labels = labels)
    beta_data.to_csv('beta_dates/beta_data_' + str(dates[0]) + '_' + str(dates[1]) + '.csv')
label_data.to_csv('beta_dates/true_labels.csv')

In [12]:
# make a pipe of a random forest on beta_data_2_42.csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# import cross validation
from sklearn.model_selection import cross_val_score

# load the data
beta_data = pd.read_csv('beta_dates/beta_data_2_42.csv', index_col=0)
beta_data.index = pd.to_datetime(beta_data.index)
labels = pd.read_csv('beta_dates/true_labels.csv', index_col=0)['decision']

# build a random forest classifier
rf = RandomForestClassifier()
rf_pipe = Pipeline([('rf', rf)])

# define the parameters
params = {'rf__n_estimators': [10, 50, 100, 200],
          'rf__max_depth': [2, 5, 10, 20, 50, 100, None],
          'rf__max_features': ['auto', 'sqrt'],
          'rf__min_samples_split': [2, 5, 10],
          'rf__min_samples_leaf': [1, 2, 4]}
# define the grid search
grid = GridSearchCV(rf_pipe, params, cv=7, n_jobs=-1, verbose=1)

# fit the grid search
grid.fit(beta_data, labels)

# print the best parameters
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

Fitting 7 folds for each of 504 candidates, totalling 3528 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'rf__max_depth': 2, 'rf__max_features': 'auto', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 50}
0.6971611721611722
Pipeline(steps=[('rf',
                 RandomForestClassifier(max_depth=2, max_features='auto',
                                        min_samples_split=5,
                                        n_estimators=50))])


In [22]:
# print the best parameters
model = RandomForestClassifier(max_depth=3, max_features='auto', min_samples_leaf=2, min_samples_split=5, n_estimators=100)
model.fit(beta_data, labels)
importances = model.feature_importances_

print(model.score(beta_data, labels))

# Print feature importances
importance = {feature: importance for feature, importance in zip(beta_data.columns, importances)}
importance = pd.DataFrame(importance, index=[0])
importance = importance.T
importance.columns = ['importance']
importance = importance.sort_values(by='importance', ascending=False)
display(importance)


0.7992700729927007


Unnamed: 0,importance
house,0.115519
ue,0.099049
pce,0.093827
mom,0.092691
ffr,0.065531
veloc,0.057053
cli,0.053182
b1_loan,0.052243
gdpd,0.045253
exports,0.033703
