In [64]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore")

In [65]:
# Load the raw panel; the first CSV column holds the dates and becomes the index.
df = pd.read_csv('master_data.csv', index_col=0)
df.index = pd.to_datetime(df.index)

# Keep observations from 1989-08-23 onward (an earlier cut-off of '1989-07-7' was also tried).
df = df.loc['1989-08-23':]

# Fold the two momentum columns into one signed column, then drop the originals.
df['mom'] = df['mom_u'] - df['mom_d']
df = df.drop(columns=['mom_u', 'mom_d'])

# Flag the rows that carry a decision BEFORE forward-filling: afterwards every
# row has a (carried-forward) decision and the event days would be lost.
df['label_day'] = df['decision'].notna()

# DataFrame.ffill() is the supported spelling of the deprecated
# fillna(method='ffill'); the result is the same.
df = df.ffill()

# Drop the first row of the sliced frame (the notebook does not record why).
df = df.iloc[1:]
df.to_csv('master_data_clean.csv')

### To do
- Fix the `decision` column.

In [66]:
# Column groups consumed by the feature-building cells below.
# List order is significant: it fixes the column order of the derived frames.

# Columns passed to collapse() as `other` (level + change features).
monthly = [
    'pce', 'ue', 'cars', 'house', 'cli', 'exports',
    'rgdp', 'gdpd', 'veloc', 'ffr', 'mich',
]

# Target columns returned alongside the features.
labels = [
    'decision',
    'ffr',
    'change',
]

# Columns summarised by a per-window linear trend.
daily = [
    'spx',
    'usd',
    'loan',
]

# Columns read directly off the event-day row.
categorical = [
    'fed_party',
    'potus_party',
    'recess',
    'mom',
]

In [67]:
# Sanity check: compare the configured column groups with the columns in df.
all_cols = monthly + daily + categorical + labels
df_cols = df.columns.to_list()

# Names present on both sides.
shared = set(all_cols) & set(df_cols)

# Names that appear on one side only. A plain set difference is used so the
# cell cannot fail with KeyError when the two sides share no column at all
# (the previous build-then-remove(None) approach did).
unique_ourlabels = set(all_cols) - shared   # configured but missing from df
unique_dflabels = set(df_cols) - shared     # in df but not configured
display(unique_ourlabels)
display(unique_dflabels)



set()

{'label_day'}

In [68]:
# Collapse the date-indexed panel into one feature row per event day.
def collapse(df, key = 'label_day', start = 7, end = 60, cat_col = [], daily_col = [], other = [], labels = []):
    """Build one feature row (and one label row) per event day.

    For every row where ``df[key]`` is True, a look-back window covering
    ``[day - end days, day - start days]`` is cut from ``df`` (the far edge is
    clamped to the first index label) and summarised:

    * ``cat_col`` and ``labels`` columns are read directly off the event row;
    * each ``other`` column yields its last value in the window plus a
      ``d_<col>`` column holding last-minus-first;
    * each ``daily_col`` column is regressed on the row position within the
      window, yielding ``b0_<col>`` (intercept) and ``b1_<col>`` (slope).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports ``Timestamp`` label slicing; the slicing
        assumes the index is sorted in increasing order.
    key : str
        Name of the boolean column marking event days.
    start, end : int
        Offsets in days before the event bounding the window; ``start`` is
        the near edge and ``end`` the far edge.
    cat_col, daily_col, other, labels : list of str
        Column groups as described above. The list defaults are never mutated.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The feature frame and the label frame, both indexed by event day.

    Raises
    ------
    ValueError
        If a window contains no rows, e.g. an event closer than ``start``
        days to the beginning of the data.
    """
    event_mask = df[key]
    event_days = df[event_mask].index

    # Categorical features and targets come straight from the event rows.
    cat_collapsed = df[cat_col][event_mask]
    y_labels = df[labels][event_mask]

    level_frame = df[other]
    trend_frame = df[daily_col]

    trend_feats = trend_frame.columns
    b0_name = ['b0_' + feat for feat in trend_feats]
    b1_name = ['b1_' + feat for feat in trend_feats]

    # One estimator is reused; every fit() call overwrites the previous one.
    model = linear_model.LinearRegression()

    level_rows = []   # one Series per event: last values followed by d_ changes
    trend_rows = []   # one flat list per event: intercepts followed by slopes

    for day in event_days:
        # Near edge of the window (closest to the event).
        window_close = day - pd.Timedelta(days=start)
        # Far edge, never earlier than the first observation in df.
        window_open = max(day - pd.Timedelta(days=end), df.index[0])

        level_window = level_frame.loc[window_open:window_close]
        if len(level_window.index) == 0:
            raise ValueError(
                'No observations between %s and %s for event day %s'
                % (window_open, window_close, day)
            )

        # Take the last row actually present in the window. Looking up
        # window_close by label fails with KeyError whenever that exact date
        # is not in the index (e.g. a weekend on a business-day index).
        latest = level_window.iloc[-1]
        delta = (latest - level_window.iloc[0]).rename(lambda col: 'd_' + col)
        level_rows.append(pd.concat([latest, delta], axis=0))

        # Linear trend of each daily series against its position in the window.
        trend_window = trend_frame.loc[window_open:window_close]
        time = np.arange(len(trend_window)).reshape(-1, 1)
        intercepts = []
        slopes = []
        for feat in trend_feats:
            model.fit(time, trend_window[feat])
            intercepts.append(model.intercept_)
            slopes.append(model.coef_[0])
        trend_rows.append(intercepts + slopes)

    if level_rows:
        aggregated_monthly = pd.concat(level_rows, axis=1).T
        aggregated_monthly.index = event_days
    else:
        # No event days: return an empty frame that still has the right columns.
        level_cols = list(level_frame.columns)
        aggregated_monthly = pd.DataFrame(
            index=event_days,
            columns=level_cols + ['d_' + col for col in level_cols],
        )

    aggregated_daily = pd.DataFrame(trend_rows, columns=b0_name + b1_name, index=event_days)

    # Feature blocks side by side, all aligned on the event-day index.
    combined = pd.concat([cat_collapsed, aggregated_monthly, aggregated_daily], axis=1)
    return combined, y_labels

In [69]:
# (start, end) look-back offsets in days, passed to collapse() one pair at a time.
date_ranges = [
    (near_offset, far_offset)
    for far_offset in (60, 120, 42)
    for near_offset in (7, 2)
]

In [70]:
# Build one feature table per (start, end) look-back window and save it.
# Create the output folder first so a fresh checkout does not fail on to_csv.
os.makedirs('beta_dates', exist_ok=True)

label_data = None
for dates in date_ranges:
    start_offset, end_offset = dates
    beta_data, label_data = collapse(df, start = start_offset, end = end_offset, cat_col = categorical, daily_col = daily, other = monthly, labels = labels)
    beta_data.to_csv(f'beta_dates/beta_data_{start_offset}_{end_offset}.csv')

# The labels are read off the event rows and do not depend on the window, so
# they are written once. The guard avoids a NameError when date_ranges is empty.
if label_data is not None:
    label_data.to_csv('beta_dates/true_labels.csv')

In [71]:
# Rank the features of every look-back window by softmax coefficient size.
names = ["7_60", "2_60", "7_120", "2_120", "7_42", "2_42"]


def cross_val(data, labels, folds = 6):
    """Fit an L2-regularised multinomial logistic regression and report CV accuracy.

    The model is fitted on all of ``data`` to obtain coefficients, while
    ``cross_val_score`` separately scores it over ``folds`` folds and the mean
    accuracy (+/- two standard deviations) is printed.

    Returns the fitted model's ``coef_`` array.
    """
    # define the model with an intercept and l2 regularization
    model = linear_model.LogisticRegression(penalty='l2', fit_intercept=True, multi_class='multinomial', solver='lbfgs')

    # full-sample fit, used only for the returned coefficients
    model.fit(data, labels)

    # cross-validated accuracy
    scores = cross_val_score(model, data, labels, cv=folds)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    return model.coef_


def normalize(data):
    """Standardise ``data``: subtract its ``mean()`` and divide by its ``std()``."""
    return (data - data.mean()) / data.std()


# The label file is the same for every window, so it is read once.
# NOTE(review): this rebinds `labels`, which an earlier cell defines as the
# list of label column names; re-run that cell before calling collapse() again.
labels = pd.read_csv('beta_dates/true_labels.csv', index_col=0)
labels.index = pd.to_datetime(labels.index)

# Make sure the output folder exists before the first to_csv call.
os.makedirs('feature_importance', exist_ok=True)

for name in names:
    data = pd.read_csv('beta_dates/beta_data_'+name+'.csv', index_col=0)
    data.index = pd.to_datetime(data.index)

    # NOTE(review): the features are standardised on the full sample before
    # cross-validation, so every fold sees statistics from its held-out rows.
    normalized_data = normalize(data).values
    vals = cross_val(normalized_data, labels['decision'])

    # Norm taken down axis 0 of the coefficient array: one value per feature.
    coef_size = np.linalg.norm(vals, axis=0)

    # Label by feature name and sort, largest coefficient norm first.
    coef_size = pd.DataFrame(coef_size, index=data.columns)
    coef_size.columns = ['coef_size']
    coef_size = coef_size.sort_values(by='coef_size', ascending=False)

    # save coef_size as a csv
    coef_size.to_csv('feature_importance/coef_size'+name+'.csv')


Accuracy: 0.50 (+/- 0.32)
Accuracy: 0.48 (+/- 0.26)
Accuracy: 0.50 (+/- 0.30)
Accuracy: 0.49 (+/- 0.30)
Accuracy: 0.49 (+/- 0.29)
Accuracy: 0.49 (+/- 0.27)
