In [261]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import os
from dotenv import load_dotenv
import pyfredapi as pf
import re

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import warnings

import xgboost as xgb

from itertools import combinations, chain

# Suppress specific FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from prince import FAMD

## First, load your api key from the environment

In [3]:
load_dotenv()
# get my FRED_API_KEY from my local environment
api_key = os.environ['FRED_API_KEY']

## Choose the time frame analyzed

In [4]:
start = '1990-01-01'
# end with the current date
end = pd.Timestamp.today().strftime('%Y-%m-%d')


## Save data not found on FRED

Business confidence index (BCI) - https://data.oecd.org/leadind/business-confidence-index-bci.htm

Consumer confidence index (CCI) - https://data.oecd.org/leadind/consumer-confidence-index-cci.htm#indicator-chart

Composite leading indicator (CLI) - https://data.oecd.org/leadind/composite-leading-indicator-cli.htm#indicator-chart

In [147]:
index_csvs = ['INDEX_DATA/BCI.csv', 'INDEX_DATA/CCI.csv', 'INDEX_DATA/CLI.csv']

index_dfs = [pd.read_csv(index_csv) for index_csv in index_csvs]
cleaned_dfs = []
# stack the dataframes
for df in index_dfs:
    indicator = df['INDICATOR'].iloc[0]
    df = df.loc[df['LOCATION'] == 'USA', ['TIME', 'Value']].rename(columns={'Value': 'value_' + indicator, 'TIME': 'date'})
    # set the index to the time column as a datetime object
    df.set_index(pd.to_datetime(df['date']), inplace=True)
    # drop the old time column
    df.drop(columns=['date'], inplace=True)

    # drop all rows with index before string 'start'
    df = df.loc[df.index >= start]

    cleaned_dfs.append(df)

# merge the cleaned_dfs
index_df = pd.concat(cleaned_dfs, axis=1)

## These are the features we will use to predict

The string in the parentheses is used by the API

In [252]:
data_sources = """
(FEDFUNDS)
Consumer Price Index for All Urban Consumers: All Items in U.S. City Average (CPIAUCSL)
Sticky Price Consumer Price Index less Food and Energy (CORESTICKM159SFRBATL)
Sticky Price Consumer Price Index less Food, Energy, and Shelter (CRESTKCPIXSLTRM159SFRBATL)
Producer Price Index by Commodity: All Commodities (PPIACO)
Personal Consumption Expenditures (PCE) 
Total Nonfarm Private Payroll Employment (ADPWNUSNERSA)
Quarterly Financial Report: U.S. Corporations: All Information: Total Cash on Hand and in U.S. Banks  (QFRTCASHINFUSNO) - only to 2009 
Unemployment Rate (UNRATE)
Noncyclical Rate of Unemployment (NROU)
Unemployment Rate - Women (LNS14000002)
Job Openings: Total Nonfarm (JTSJOL) - begins 2000
Layoffs and Discharges: Total Nonfarm (JTSLDL) - since 2000 Gross Domestic Product (GDP)
Real Gross Domestic Product (GDPC1)
Real gross domestic product per capita (A939RX0Q048SBEA)
Gross Domestic Product: Implicit Price Deflator (A191RI1Q225SBEA)
National Accounts: National Accounts Deflators: Gross Domestic Product: GDP Deflator for United States (USAGDPDEFQISMEI)
Advance Retail Sales: Retail Trade and Food Services (RSAFS) - 1992 
University of Michigan: Consumer Sentiment (UMCSENT)
New Privately-Owned Housing Units Started: Total Units (HOUST)
New Privately-Owned Housing Units Started: Single-Family Units (HOUST1F)
Total Business Sales (TOTBUSSMSA) -1992
Nonfinancial Corporate Business; Inventories Including IVA, Market Value Levels (BOGZ1LM105020005Q)
Corporate Profits After Tax -without IVA and CCAdj- (CP)
 National income: Corporate profits before tax -without IVA and CCAdj- (A053RC1Q027SBEA)
Money Market Funds; Total Financial Assets, Level (MMMFFAQ027S)
Stock Market Capitalization to GDP for United States (DDDM01USA156NWDB)
Interest Rates: Long-Term Government Bond Yields: 10-Year: Main -Including Benchmark- for United States (IRLTLT01USM156N)
Nominal Broad U.S. Dollar Index (DTWEXBGS) - 2006
Leading Indicators OECD: Leading indicators: CLI: Amplitude adjusted for OECD - Total (OECDLOLITOAASTSAM)
Trade Balance: Goods and Services, Balance of Payments Basis (BOPGSTB) - 1992
M1 (WM1NS)
Velocity of M1 Money Stock (M1V)
M2 (WM2NS)
Personal consumption expenditures: Services: Gambling (DGAMRC1A027NBEA)
"""

In [254]:
dataframes = {}

# Regular expression pattern to match text within parentheses
pattern = r"\((.*?)\)"

# Find all matches and store them in a list
matches = re.findall(pattern, data_sources)

## This cell is where the api is called and stored in Pandas Dataframes

In [255]:
extra_parameters = {
    "observation_start": start,
    "observation_end": end,
}
for match in matches:
    series_id = match
    try:
        df = pf.get_series(series_id=series_id, **extra_parameters)[['date', 'value']]
        # add the match to the end of the value column name
        df.rename(columns={'value': f'value_{match}'}, inplace=True)
        # turn the date column into a datetime object
        df['date'] = pd.to_datetime(df['date'])

        # set the date column as the index
        df.set_index('date', inplace=True)
        dataframes[series_id] = df

    except:
        print(f"Failed to get {series_id}")

In [256]:
def align_dataframes(dataframes, start_date):

    """
    Impute missing values and align the start date of each dataframe.
    """

    aligned_dfs = []

    for df in dataframes:
        # Ensure the index is a DateTimeIndex
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index)

        # Resample to monthly frequency
        # Use 'mean' for downsampling and 'ffill' for upsampling
        resampled_df = df.resample('M').mean().ffill()

        # Align start date to start, filling missing values with NaN
        aligned_df = resampled_df.reindex(pd.date_range(start_date, resampled_df.index.max(), freq='M'), fill_value=np.nan)
        
        aligned_dfs.append(aligned_df)

    return aligned_dfs

## Now, combine the FRED and non-FRED dataframes

In [257]:
dataframes_list = list(dataframes.values()) + [index_df]

# Align all DataFrames to the same frequency and start date
aligned_dataframes = align_dataframes(dataframes_list, start)

# Concatenate all DataFrames along the column axis
combined_df = pd.concat(aligned_dataframes, axis=1)

## Now, we split the features and the label

In [274]:
features = combined_df.drop('value_FEDFUNDS', axis=1)
label = combined_df['value_FEDFUNDS']

# Filling missing values with the mean (change this based on your data's context)
features_filled = features.fillna(features.mean())


## split_date: used to segment the data for training and testing

In [114]:
def is_collinear_combo(combo, collinear_pairs):
    for pair in combinations(combo, 2):
        if frozenset(pair) in collinear_pairs:
            return True
    return False

In [50]:
# # Assuming 'features' is your DataFrame
# corr_matrix = features.corr().abs()

# # Threshold for considering correlation as 'high'
# threshold = 0.8

# # Grouping collinear features
# collinear_groups = []
# visited = set()

# for col in corr_matrix.columns:
#     if col not in visited:
#         collinear_set = set([col])
#         for index in corr_matrix.index[corr_matrix[col] > threshold]:
#             if col != index:
#                 collinear_set.add(index)
#                 visited.add(index)
#         collinear_groups.append(collinear_set)

# # Selecting one representative from each collinear group
# representatives = [list(group)[0] for group in collinear_groups]

# # Generate all combinations of the representatives
# combos = []
# for i in range(1, len(representatives) + 1):
#     combos += list(combinations(representatives, i))

# combos = list(set(combos))

# print(len(combos))
# label_binary = label.diff().dropna()
# split_date = int(len(label_binary) * 0.8)
# label_binary = np.where(label_binary > 0, 1, 0)


# best_combos = []
# best_combo = None
# best_acc = 0

# for combo in combos:
#     combo = list(set(combo))
#     features_adjusted = features[list(combo)].iloc[1:]

#     train_features = features_adjusted[:split_date]
#     test_features = features_adjusted[split_date:]
#     train_label = label_binary[:split_date]
#     test_label = label_binary[split_date:]

#     # Model creation and fitting
#     xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
#     xgb_classifier.fit(train_features, train_label)

#     # Predictions and evaluation
#     xgb_predictions = xgb_classifier.predict(test_features)
#     acc = accuracy_score(test_label, xgb_predictions)
    
#     if acc < best_acc:
#         best_acc = acc
#         best_combo = combo

#     if acc > 0.70:
#         print("Combo:", combo, "Accuracy:", acc)
#         best_combos.append(combo)

# print("Best combo:", best_combo)
# print("Best MSE:", best_acc)

# features_adjusted = features[list(best_combo)].iloc[1:]

# train_features = features_adjusted[:split_date]
# test_features = features_adjusted[split_date:]
# train_label = label_binary[:split_date]
# test_label = label_binary[split_date:]

# # Model creation and fitting
# xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# xgb_classifier.fit(train_features, train_label)

# # Predictions and evaluation
# xgb_predictions = xgb_classifier.predict(test_features)
# mse = mean_squared_error(test_label, xgb_predictions)

# print("Mean Squared Error:", mse)

# print(accuracy_score(test_label, xgb_predictions))
# print(classification_report(test_label, xgb_predictions))

In [385]:
import pandas as pd

def aggregate_before_decision(df, n_days):
    # Ensure the index is in datetime format
    df.index = pd.to_datetime(df.index)

    # Filter rows where decisions occur
    decision_rows = df[df['ffr'].notna()]

    # Columns to be aggregated
    columns_to_aggregate = df.columns.difference(['ffr', 'decision', 'change'])

    # List to store each aggregated period's data
    aggregated_periods = []

    for date in decision_rows.index:
        # Define the period for aggregation
        start_date = date - pd.Timedelta(days=n_days)
        end_date = date - pd.Timedelta(days=1)  # Exclude the decision day

        # Aggregate data for this period
        aggregated_period = df.loc[start_date:end_date, columns_to_aggregate].mean()

        # Store aggregated data with the decision day's 'ffr', 'decision', and 'change' starting from the beginning
        aggregated_period = pd.concat([df.loc[date, ['ffr', 'change', 'decision']], aggregated_period])
        # append to the front of the list
        aggregated_periods.append(aggregated_period)

    # Create a DataFrame from the aggregated periods
    aggregated_data = pd.DataFrame(aggregated_periods)

    return aggregated_data


In [388]:
aggregated_df = aggregate_before_decision(pd.read_csv('master_data.csv'), 30)
# Split data into features and target
X = aggregated_df.drop(['ffr', 'decision', 'change'], axis=1)
y = aggregated_df['decision']

# combos = get_combos(X)



# # # turn to -1,0,1 to 0, 1, 2
# y = y + 1

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# best_combos = []
# best_combo = None
# best_acc = 0

# # for i,combo in enumerate(combos):
# #     combo = list(set(combo))
# #     features_adjusted = df[list(combo)].iloc[1:]

# #     # Model creation and fitting
# #     xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# #     xgb_classifier.fit(X_train, y_train)

# #     # Predictions and evaluation
# #     xgb_predictions = xgb_classifier.predict(X_test)
# #     acc = accuracy_score(y_test, xgb_predictions)
    
# #     if acc > best_acc:
# #         best_acc = acc
# #         best_combo = combo
# #         print("New best combo:", best_combo, "Accuracy:", best_acc)

# #     if acc > 0.60:
# #         print("Combo:", combo, "Accuracy:", acc)
# #         best_combos.append(combo)

# print("Best combo:", best_combo)
# print("Best MSE:", best_acc)

# # Classifier - can be replaced with a more suitable model for time series
# classifier = RandomForestClassifier()
# classifier.fit(X_train, y_train)

# # Predictions
# predictions = classifier.predict(X_test)

# # Evaluation
# print(classification_report(y_test, predictions))

# xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
# xgb_classifier.fit(X_train, y_train)

# # Predictions and evaluation
# xgb_predictions = xgb_classifier.predict(X_test)

# print(classification_report(y_test, xgb_predictions))


In [265]:
# # Assuming 'features' is your DataFrame
# corr_matrix = features.corr().abs()

# # Threshold for considering correlation as 'high'
# threshold = 0.1

# # Grouping collinear features
# collinear_groups = []
# already_grouped = set()

# for col in corr_matrix.columns:
#     if col not in already_grouped:
#         collinear_set = {col}
#         for index in corr_matrix.index[corr_matrix[col] > threshold]:
#             if col != index and index not in already_grouped:
#                 collinear_set.add(index)
#                 already_grouped.add(index)
#         if len(collinear_set) > 1:
#             collinear_groups.append(collinear_set)

# # Select one representative from each collinear group
# representatives = [list(group)[0] for group in collinear_groups]

# # Add non-collinear features to the representatives
# non_collinear_features = [col for col in features.columns if col not in already_grouped]
# representatives.extend(non_collinear_features)

# # Create a set of all unique representatives and non-collinear features
# unique_features = set(representatives)

# # Generate combinations of varying sizes from this set
# combos = list(chain.from_iterable(combinations(unique_features, r) for r in range(1, len(unique_features)+1)))

# print(f"Number of combinations: {len(combos)}")

# # print(combos)


# for combo in combos:
#     print(combo)


#     features_adjusted = features[list(combo)].iloc[1:]

#     train_features = features_adjusted[:split_date]
#     test_features = features_adjusted[split_date:]
#     train_label = label_binary[:split_date]
#     test_label = label_binary[split_date:]
#     features_adjusted = features[list(best_combo)].iloc[1:]


#     # Model creation and fitting
#     xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
#     xgb_classifier.fit(train_features, train_label)

#     # Predictions and evaluation
#     xgb_predictions = xgb_classifier.predict(test_features)
#     mse = mean_squared_error(test_label, xgb_predictions)

#     print("Mean Squared Error:", mse)

#     print(accuracy_score(test_label, xgb_predictions))
#     print(classification_report(test_label, xgb_predictions))

#     # xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

#     # # Define the parameter grid
#     param_grid = {
#         'n_estimators': [50, 100, 150],
#         'learning_rate': [0.01, 0.1, 0.2],
#         'max_depth': [3, 5, 7],
#         'subsample': [0.8, 1.0],  # Fraction of samples to be used for fitting each tree
#         'colsample_bytree': [0.8, 1.0]  # Fraction of features to be used for each tree
#     }

#     # # Grid Search with Cross-Validation
#     # grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
#     # grid_search.fit(train_features, train_label)

#     # # Best parameters
#     # print("Best parameters found: ", grid_search.best_params_)

#     # # Evaluate on test set with the best model
#     # best_model = grid_search.best_estimator_
#     # xgb_predictions = best_model.predict(test_features)

#     # mse = mean_squared_error(test_label, xgb_predictions)
#     # accuracy = accuracy_score(test_label, xgb_predictions)

#     # print("Mean Squared Error:", mse)
#     # print("Accuracy:", accuracy)
#     # print(classification_report(test_label, xgb_predictions))

## Now that we have the best features, we want to do a gridsearch to find the best hyperparameters

In [237]:
# prep a df to be used for classifying
# n_days is how many days prior to the announcement you want to forecast
def forecast_n_days_prior(df, n_days = 7):
    df_temp = df.copy()
    df_temp.iloc[:,:3] = df_temp.iloc[:,:3].shift(-n_days)
    return df_temp.query('ffr == ffr')

In [248]:
def rf_train(X, y, label_type='classification'):
    
    if label_type == 'classification':
        # turn to -1,0,1 to 0, 1, 2
        y = y + 1

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees in the forest
        'max_depth': [10, 20, 30],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
        'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    }


    # perfor grid search
    grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    # Predictions
    predictions = best_model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, predictions))

    return best_model, grid_search.best_params_

In [242]:
def famd_df(df, n_components=4, cat_threshold=4):
    # Get indices of categorical columns
    indices = []
    for i, name in enumerate(df.columns):
        # Assuming 'df' is your DataFrame and 'column' is the column you want to check
        unique_count = df[name].nunique()
        if unique_count < cat_threshold:  # Adjust the threshold as needed
            indices.append(i)
    
    df_encoded = df.iloc[:, indices]
    df_numeric = df.drop(df.columns[indices], axis=1)
    
    # Then we scale the numeric data
    scaler = StandardScaler()
    df_numeric = scaler.fit_transform(df_numeric)
    df_numeric = pd.DataFrame(df_numeric, index=df.index)
    
    # The function FAMD imported from prince requires categorical data to
    # Be of type 'object' so we convert it since it is already encoded
    df_categorical = df_encoded.astype(str)
    
    df_final = pd.concat([df_numeric, df_categorical], axis=1)
    
    famd = FAMD(n_components=n_components)
    
    X_famd = famd.fit_transform(df_final)
    
    return X_famd

In [384]:
# df = famd_df(pd.read_csv('beta_dates/beta_data_7_60.csv', index_col=0))
# y = pd.read_csv('labels.csv', index_col=0).values[:,0]
# rf_train(df, y, label_type='classification')

### Learning Algorithms and In-Depth Analysis


In [None]:
"""
We use random forests and XGBoost for training our models, using the collapsed data and using FAMD to reducethe dimensionality of the data to only 4 principle components. We use grid search to find the best hyperparameters for the models.

In the end, we find that the best model is a random forest with the following hyperparameters:

 {'max_depth': 20,
  'max_features': 'log2',
  'min_samples_leaf': 4,
  'min_samples_split': 5,
  'n_estimators': 50})

  The accuracy matrix for this model is:
                precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         5
         1.0       0.59      0.90      0.71        30
         2.0       0.67      0.30      0.41        20

    accuracy                           0.60        55
   macro avg       0.42      0.40      0.37        55
weighted avg       0.56      0.60      0.54        55

"""

In [404]:
def xgboost_train(X, y, label_type='classification'):
    
    if label_type == 'classification':
        # turn to -1,0,1 to 0, 1, 2
        y = y + 1

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

    param_grid = {
        'n_estimators': [100, 110],  # Reduced number of trees
        'learning_rate': [0.01, 0.8],  # Expanded range with a lower bound
        'max_depth': [7, 8],  # Shallower trees considering fewer components
        'min_child_weight': [4, 5],  # Adjusted values for instance weight
        'gamma': [0],  # Slightly expanded range for loss reduction
        'subsample': [0.7],  # Adjusted subsample ratio
        'colsample_bytree': [0.8, 0.9],  # Adjusted subsample ratio of columns
        'reg_alpha': [0.1],  # Adjusted L1 regularization term
        'reg_lambda': [0.1]  # Adjusted L2 regularization term
    }
    

    # perfor grid search
    grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax', random_state=42), param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # train model with best parameters
    xgb_classifier = xgb.XGBClassifier(**grid_search.best_params_, objective='multi:softmax', random_state=42)
    xgb_classifier.fit(X_train, y_train)

    # Predictions
    predictions = xgb_classifier.predict(X_test)

    # Evaluation
    print(classification_report(y_test, predictions))

    return xgb_classifier, grid_search.best_params_

In [426]:
df = famd_df(pd.read_csv('beta_dates/beta_data_7_60.csv', index_col=0))
y = pd.read_csv('labels.csv', index_col=0).values[:,0]
xgboost_train(df, y, label_type='classification')

  X = self.scaler_.transform(X.to_numpy())
  X = self.scaler_.transform(X.to_numpy())


Fitting 3 folds for each of 32 candidates, totalling 96 fits


KeyboardInterrupt: 

In [474]:
X = famd_df(pd.read_csv('beta_dates/beta_data_7_60.csv', index_col=0), n_components=3, cat_threshold=4)
y = pd.read_csv('true_labels.csv', index_col=0).values[:,0]

y = y + 1
classification_list = []
for i in range(1000):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2)

    params =  {'colsample_bytree': 0.8,
    'gamma': 0,
    'learning_rate': 0.01,
    'max_depth': 7,
    'min_child_weight': 4,
    'n_estimators': 110,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'subsample': 0.7}

    # train model with best parameters
    xgb_classifier = xgb.XGBClassifier(**params, objective='multi:softmax', random_state=42)
    xgb_classifier.fit(X_train, y_train)

    # Predictions
    predictions = xgb_classifier.predict(X_test)

    # add to list
    classification_list.append(classification_report(y_test, predictions, output_dict=True))

# Evaluation
# print(classification_report(y_test, predictions))

# print('y_test:', y_test)
# print('predictions:', predictions)


# feature_importances = xgb_classifier.feature_importances_
# plt.barh(range(len(feature_importances)), feature_importances)
# plt.show()

  X = self.scaler_.transform(X.to_numpy())
  X = self.scaler_.transform(X.to_numpy())
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg

In [477]:
# get the average accuracy
avg_acc = 0
for report in classification_list:
    avg_acc += report['accuracy']
avg_acc /= len(classification_list)
print('Average accuracy:', avg_acc)

Average accuracy: 0.7456545454545469


In [372]:
# # properly read in data
# df = pd.read_csv('master_data.csv', index_col=0)

# days_prior = forecast_n_days_prior(df, 7)

# # We split the df into numeric and categorical data
# df_numeric = days_prior.drop(['decision', 'recess', 'fed_party', 'potus_party'], axis=1)
# df_encoded = days_prior[['decision', 'recess', 'fed_party', 'potus_party']]

# # Then we scale the numeric data
# scaler = StandardScaler()
# df_numeric = scaler.fit_transform(df_numeric)
# df_numeric = pd.DataFrame(df_numeric, index=days_prior.index)

# # The function FAMD imported from prince requires categorical data to
# # Be of type 'object' so we convert it since it is already encoded
# df_categorical = df_encoded.astype(str)
# days_prior_mixed = pd.concat([df_numeric, df_categorical], axis=1)

# X = days_prior_mixed.drop(['decision'], axis=1)

# famd = FAMD(n_components=3)

# X_famd = famd.fit_transform(X).values
# y = days_prior_mixed['decision'].values
# # make y ints
# y = y.astype(float).astype(int)
# xgboost_train(X_famd, y, label_type='classification')

# Key ideas: 
How to handle time series - No CV, regression, issues

b



In [None]:
"""
As a priliminary approach, we decided to pull around 35 feature columns using the Federal Reserve Economic Data (FRED),
the run a correlation matrix to find collinear features. We then grouped the collinear features and selected one representative.
We then generated all combinations of the representatives and ran a grid search to find the best combination of features, with the 
model being an XGBoost classifier. However, an issue with this approach was the reality of working with time series data. We coul


"""

In [370]:
# df = pd.read_csv('master_data.csv', index_col=0)
# aggregated_df = aggregate_before_decision(df, n_days=30)  # Aggregate 30 days before each decision

# # Split data into features and target
# X = aggregated_df.drop(['ffr', 'decision', 'change'], axis=1)
# y = aggregated_df['decision']

# combos = get_combos(X)

# # # turn to -1,0,1 to 0, 1, 2
# y = y + 1

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# param_grid = {
#     'n_estimators': [100, 200, 300],  # Number of boosting rounds
#     'learning_rate': [0.05, 0.1],  # Step size shrinkage
#     'max_depth': [5, 7],  # Maximum tree depth
#     'min_child_weight': [1, 3, 5],  # Minimum sum of instance weight needed in a child
#     'gamma': [0, 0.1, 0.2],  # Minimum loss reduction required for a split
#     'subsample': [0.7, 0.8, 0.9],  # Subsample ratio of the training instance
#     'colsample_bytree': [0.7, 0.8, 0.9],  # Subsample ratio of columns when constructing each tree
#     'reg_alpha': [0, 0.1, 0.5],  # L1 regularization term
#     'reg_lambda': [1, 1.5, 2]  # L2 regularization term
# }


# # Manual Grid Search
# best_params = None
# best_accuracy = 0
# best_params_list = []

# i = 0
# for n_estimators in param_grid['n_estimators']:
#     for learning_rate in param_grid['learning_rate']:
#         for max_depth in param_grid['max_depth']:
#             for min_child_weight in param_grid['min_child_weight']:
#                 for gamma in param_grid['gamma']:
#                     for subsample in param_grid['subsample']:
#                         for colsample_bytree in param_grid['colsample_bytree']:
#                             for reg_alpha in param_grid['reg_alpha']:
#                                 for reg_lambda in param_grid['reg_lambda']:
#                                     # Create and fit the model
#                                     xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=n_estimators,
#                                                                           learning_rate=learning_rate, max_depth=max_depth,
#                                                                           min_child_weight=min_child_weight, gamma=gamma,
#                                                                           subsample=subsample, colsample_bytree=colsample_bytree,
#                                                                           reg_alpha=reg_alpha, reg_lambda=reg_lambda, random_state=42)
#                                     xgb_classifier.fit(X_train, y_train)

#                                     # Evaluate on the validation set
#                                     val_predictions = xgb_classifier.predict(X_test)
#                                     accuracy = accuracy_score(y_test, val_predictions)

#                                     # Update best params
#                                     if accuracy > best_accuracy:
#                                         best_accuracy = accuracy
#                                         best_params = {'n_estimators': n_estimators, 'learning_rate': learning_rate,
#                                                          'max_depth': max_depth, 'min_child_weight': min_child_weight,
#                                                          'gamma': gamma, 'subsample': subsample,
#                                                          'colsample_bytree': colsample_bytree,
#                                                          'reg_alpha': reg_alpha, 'reg_lambda': reg_lambda}
                                        
#                                     if accuracy > 0.58:
#                                         best_params_list.append({'n_estimators': n_estimators, 'learning_rate': learning_rate,
#                                                          'max_depth': max_depth, 'min_child_weight': min_child_weight,
#                                                          'gamma': gamma, 'subsample': subsample,
#                                                          'colsample_bytree': colsample_bytree,
#                                                          'reg_alpha': reg_alpha, 'reg_lambda': reg_lambda})
#                                         print("Accuracy:", accuracy)
#                                         print(classification_report(y_test, val_predictions))
#                                     print(i)
#                                     i += 1
                                        
# print("Best parameters:", best_params)
# print("Best accuracy:", best_accuracy)

## Now, train the best classifier

In [292]:
# split_date = int(len(label_binary) * 0.8)
# features_adjusted = features[list(best_combo)].iloc[1:]
# train_features = features_adjusted[:split_date]
# test_features = features_adjusted[split_date:]
# train_label = label_binary[:split_date]
# test_label = label_binary[split_date:]
# features_adjusted = features[list(best_combo)].iloc[1:]
# # train using the best parameters
# print(best_params)
# xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', **best_params, random_state=42)
# xgb_classifier.fit(train_features, train_label)

# # Predictions and evaluation
# xgb_predictions = xgb_classifier.predict(test_features)
# mse = mean_squared_error(test_label, xgb_predictions)

# print("Mean Squared Error:", mse)

# print(accuracy_score(test_label, xgb_predictions))
# print(classification_report(test_label, xgb_predictions))


{'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1, 'gamma': 0, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.5, 'reg_lambda': 2}
Mean Squared Error: 0.18292682926829268
0.8170731707317073
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        34
           1       0.85      0.83      0.84        48

    accuracy                           0.82        82
   macro avg       0.81      0.81      0.81        82
weighted avg       0.82      0.82      0.82        82



In [200]:
# # Assuming 'features' is your DataFrame
# corr_matrix = features.corr().abs()

# # Threshold for considering correlation as 'high'
# threshold = 0.8

# # Grouping collinear features
# collinear_groups = []
# already_grouped = set()

# for col in corr_matrix.columns:
#     if col not in already_grouped:
#         collinear_set = {col}
#         for index in corr_matrix.index[corr_matrix[col] > threshold]:
#             if col != index and index not in already_grouped:
#                 collinear_set.add(index)
#                 already_grouped.add(index)
#         if len(collinear_set) > 1:
#             collinear_groups.append(collinear_set)

#             # Assuming 'label' is your target variable
# label_corr = features.corrwith(label).abs()

# # Filter out non-collinear features that are highly correlated with the label
# non_collinear_features = [col for col in features.columns if col not in already_grouped and label_corr[col] <= threshold]

# # Generate all possible combinations of non-collinear features
# non_collinear_combos = []
# for i in range(1, len(non_collinear_features) + 1):
#     non_collinear_combos += list(combinations(non_collinear_features, i))

# # Generate all combinations where each feature in a collinear group can be a representative
# all_possible_collinear_combos = list(product(*collinear_groups))

# # Combine collinear and non-collinear combinations
# final_combos = []
# for collinear_combo in all_possible_collinear_combos:
#     for non_collinear_combo in non_collinear_combos:
#         final_combos.append(collinear_combo + non_collinear_combo)

# # remove duplicates from final_combos
# final_combos = list(set(final_combos))
# print(len(final_combos))



# label_binary = label.diff().dropna()
# split_date = int(len(label_binary) * 0.8)
# label_binary = np.where(label_binary > 0, 1, 0)

# best_combo = None
# best_MSE = np.inf

# for i, combo in enumerate(final_combos):
#     # remove duplicate elements in combo
#     combo = list(set(combo))
#     print(i)
#     features_adjusted = features[list(combo)].iloc[1:]

#     train_features = features_adjusted[:split_date]
#     test_features = features_adjusted[split_date:]
#     train_label = label_binary[:split_date]
#     test_label = label_binary[split_date:]

#     # Model creation and fitting
#     xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
#     xgb_classifier.fit(train_features, train_label)

#     # Predictions and evaluation
#     xgb_predictions = xgb_classifier.predict(test_features)
#     mse = mean_squared_error(test_label, xgb_predictions)
    
#     if mse < best_MSE:
#         best_MSE = mse
#         best_combo = combo

# print(best_combo)


In [371]:
# # Create a binary label for increase (1) or decrease (0) in interest rates
# label_binary = label.diff().dropna()
# label_binary = np.where(label_binary > 0, 1, 0)

# # Shift the binary label forward by one period
# for i in range(1, 13):
#     label_shifted = np.roll(label_binary, -i)

#     features_aligned = features.iloc[1:-1]
#     label_aligned = label_shifted[:-1]


#     split_index = int(len(label_aligned) * 0.8)

#     train_features = features_aligned[:split_index]
#     test_features = features_aligned[split_index:]
#     train_label = label_aligned[:split_index]
#     test_label = label_aligned[split_index:]


#     xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.1, max_depth=5, random_state=42)
#     xgb_classifier.fit(train_features, train_label)

#     xgb_predictions = xgb_classifier.predict(test_features)
#     accuracy = accuracy_score(test_label, xgb_predictions)

#     print("Accuracy:", accuracy)
#     # print(classification_report(test_label, xgb_predictions))


In [None]:
# # get tags
# request_url = f'https://api.stlouisfed.org/fred/tags'

# params = {
#     'api_key': api_key,
#     'file_type': 'json',
#     'limit': 1000,
#     'order_by': 'popularity',
#     # 'offset': 0,
# }
# tags_response = requests.get(request_url, params=params)
# # response_json = tags_response.json()
# # tags_df = pd.DataFrame(response_json['tags'])
# # display(tags_df)
# # print(tags_df['name'].to_list())

In [106]:
# response_json = tags_response.json()
# tags_df = pd.DataFrame(response_json['tags'])
# display(tags_df)

In [107]:
# request_url = f'https://api.stlouisfed.org/fred/releases'

# params = {
#     'api_key': api_key,
#     'file_type': 'json',
#     'limit': 1000,
#     # 'offset': 10,
# }
# response = requests.get(request_url, params=params)
# # # print the 'name' for each Release
# # for release in response.json()['releases']:
# #     print(release['name'])