In [37]:
import pandas as pd
import numpy as np

# Read the two raw CSV exports that the rest of the notebook works from.
fomc_df = pd.read_csv(filepath_or_buffer="FOMC_dataset.csv")
spx_df = pd.read_csv(filepath_or_buffer="^SPX_data.csv")

# Preview: the cell's value is a (FOMC head, SPX head) tuple of the first five rows.
(fomc_df.head(n=5), spx_df.head(n=5))


(   Unnamed: 0        Date    CPI  Industry Production     PCE  Retail Sales  \
 0           0  1998-01-01  161.6               84.347  70.736      142955.0   
 1           1  1998-01-01  161.6               84.347  70.736      142955.0   
 2           2  1998-01-01  161.6               84.347  70.736      142955.0   
 3           3  1998-01-01  161.6               84.347  70.736      142955.0   
 4           4  1998-01-01  161.6               84.347  70.736      142955.0   
 
    Unemployment  Wage Increase  Fed Rate  Home Sales  ...  Retail Trade Lag_6  \
 0           4.6          319.0      5.84       872.0  ...            209666.0   
 1           4.6          319.0      6.06       872.0  ...            209666.0   
 2           4.6          319.0      5.51       872.0  ...            209666.0   
 3           4.6          319.0      5.35       872.0  ...            209666.0   
 4           4.6          319.0      5.29       872.0  ...            209666.0   
 
    Real GDP Lag_6  Indu

In [38]:
# Clean the FOMC and SPX frames loaded by the previous cell and export both
# as CSV. Expects `fomc_df` and `spx_df` to be the freshly loaded raw frames.

# --- FOMC: parse dates and add calendar helpers -----------------------------
# Only the FOMC 'Date' column is parsed; the SPX 'Date' column is left exactly
# as it was read from the CSV.
fomc_df['Date'] = pd.to_datetime(fomc_df['Date'])
fomc_df['Year'] = fomc_df['Date'].dt.year
fomc_df['Month'] = fomc_df['Date'].dt.month

# --- SPX: drop the two leading non-data rows and normalise the schema -------
# NOTE: this section is not idempotent. Re-running the cell without re-running
# the load cell drops two more rows and then fails on the 7-name rename,
# because 'Name' has already been removed.
spx_df = spx_df.iloc[2:].reset_index(drop=True)  # Remove metadata rows

# Rename columns for clarity
spx_df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Name']
spx_df = spx_df.drop(columns=['Name'])  # Remove unnecessary column

# Convert numeric columns to float type
spx_numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
spx_df[spx_numeric_cols] = spx_df[spx_numeric_cols].astype(float)

# --- FOMC: keep only records on/after the cutoff date -----------------------
cutoff_date = pd.to_datetime("2013-01-01")
fomc_df = fomc_df[fomc_df['Date'] >= cutoff_date]

# --- FOMC: collapse duplicate dates to one row per Date ---------------------
# Numeric columns are averaged. 'FOMC Meeting' is deliberately kept out of the
# mean (averaging a flag is meaningless) and aggregated separately below.
numeric_cols = fomc_df.select_dtypes(include=[np.number]).columns.drop('FOMC Meeting', errors='ignore')
rows_by_date = fomc_df.groupby('Date')
fomc_by_date = rows_by_date[numeric_cols].mean()

# Previously 'FOMC Meeting' was removed from the aggregation and never added
# back, so it vanished from the cleaned output even though key_columns asks
# for it. Carry it over as "any meeting on this date" and fill missing values
# with 0 (no meeting).
# Assumes the column, when present, is a numeric/boolean flag -- TODO confirm.
if 'FOMC Meeting' in fomc_df.columns:
    fomc_by_date['FOMC Meeting'] = rows_by_date['FOMC Meeting'].max().fillna(0)

fomc_df = fomc_by_date.reset_index()

# mean() turns the integer Year/Month helpers into floats (e.g. 2013.0);
# re-derive them from the grouped Date so they are exported as integers.
fomc_df['Year'] = fomc_df['Date'].dt.year
fomc_df['Month'] = fomc_df['Date'].dt.month

# Report remaining missing values per column
print("Missing Values:\n", fomc_df.isnull().sum())

# --- FOMC: treat outliers ----------------------------------------------------
# 28630.739 is the Real GDP value the original analysis singled out for
# removal. Match it with a tight absolute tolerance rather than exact float
# equality: the column has just been averaged, and the mean of identical
# floats is not guaranteed to reproduce the literal bit-for-bit.
REAL_GDP_OUTLIER = 28630.739
is_gdp_outlier = np.isclose(
    fomc_df['Real GDP'].to_numpy(dtype=float), REAL_GDP_OUTLIER, rtol=0.0, atol=1e-6
)
fomc_df['Real GDP'] = fomc_df['Real GDP'].mask(is_gdp_outlier).interpolate(method='linear')

# Cap Home Sales to the [400, 800] band
fomc_df['Home Sales'] = fomc_df['Home Sales'].clip(lower=400, upper=800)

# --- FOMC: select key columns for Tableau -----------------------------------
key_columns = [
    'Date', 'Year', 'Month', 'CPI', 'Unemployment', 'Fed Rate', 'Real GDP',
    'Inflation', 'Home Sales', 'Retail Sales', 'FOMC Meeting', 'Regime',
    'Industry Production Lag_1', 'Unemployment Lag_1', 'YoY CPI', 'YoY Unemployment',
    'Month_Sin', 'Month_Cos'
]

# Surface requested columns that are not available instead of dropping them
# silently (non-numeric columns do not survive the numeric-only aggregation).
missing_key_columns = [col for col in key_columns if col not in fomc_df.columns]
if missing_key_columns:
    print("Key columns not found (skipped):", missing_key_columns)

fomc_df = fomc_df[[col for col in key_columns if col in fomc_df.columns]]
print("Final Columns:", fomc_df.columns.tolist())

# --- Export ------------------------------------------------------------------
fomc_df.to_csv("FOMC_dataset_cleaned.csv", index=False)
spx_df.to_csv("SPX_dataset_cleaned.csv", index=False)

Missing Values:
 Date                    0
Unnamed: 0              0
CPI                     0
Industry Production     0
PCE                     0
                       ..
Unemployment Lag_12     0
Wage Increase Lag_12    0
Home Sales Lag_12       0
Retail Trade Lag_12     0
Real GDP Lag_12         0
Length: 102, dtype: int64
Final Columns: ['Date', 'Year', 'Month', 'CPI', 'Unemployment', 'Fed Rate', 'Real GDP', 'Inflation', 'Home Sales', 'Retail Sales', 'Industry Production Lag_1', 'Unemployment Lag_1', 'YoY Unemployment', 'Month_Sin', 'Month_Cos']
