In [3]:
import pandas as pd
import os

# Step 1: Load preprocessed CSV files from the Output folder and Dataframes folder
layoffs_df = pd.read_csv('Resources/Output/layoffs_bydate.csv')
mortgages_df = pd.read_csv('Resources/Output/mortgage_rates_bydate.csv')
fed_interest_df = pd.read_csv('Resources/Output/fed_interest_rate_bydate.csv')
unemployment_df = pd.read_csv('Resources/Output/unemployment_rate.csv')
stock_df = pd.read_csv('Resources/Output/stock_data.csv')

# Load the wage and CPI data from the Dataframes folder
cpi_df = pd.read_csv('Dataframes/combined_cpi_data.csv')
wage_df = pd.read_csv('Dataframes/combined_wage_data.csv')

In [4]:
# Preview each DataFrame before processing
print("\n--- Layoffs DataFrame ---")
print(layoffs_df.head())

print("\n--- Mortgages DataFrame ---")
print(mortgages_df.head())

print("\n--- Fed Interest Rate DataFrame ---")
print(fed_interest_df.head())

print("\n--- Unemployment DataFrame ---")
print(unemployment_df.head())

print("\n--- Stock DataFrame ---")
print(stock_df.head())

print("\n--- CPI DataFrame ---")
print(cpi_df.head())

print("\n--- Wage DataFrame ---")
print(wage_df.head())


--- Layoffs DataFrame ---
    period  laid_off_by_month  funds_raised_by_month
0  2020/03             7850.0                15530.2
1  2020/04            19821.0                43862.0
2  2020/05            14674.0                74191.0
3  2020/06             3926.0                11724.1
4  2020/07             1612.0                 4447.0

--- Mortgages DataFrame ---
  YearMonth  MORTGAGE30US
0   2018-01        4.0325
1   2018-02        4.3300
2   2018-03        4.4440
3   2018-04        4.4675
4   2018-05        4.5860

--- Fed Interest Rate DataFrame ---
      DATE  FEDFUNDS
0  2019/08      2.13
1  2019/09      2.04
2  2019/10      1.83
3  2019/11      1.55
4  2019/12      1.55

--- Unemployment DataFrame ---
        DATE  UNRATE
0   8/1/2019     3.6
1   9/1/2019     3.5
2  10/1/2019     3.6
3  11/1/2019     3.6
4  12/1/2019     3.6

--- Stock DataFrame ---
      Date  stock_open  stock_high  stock_low  stock_close  stock_adj_close  \
0  2024/08     5537.84     5651.62    5119.26

In [6]:
# Check the column names for each DataFrame
print("\n--- Layoffs DataFrame Columns ---")
print(layoffs_df.columns)

print("\n--- Mortgages DataFrame Columns ---")
print(mortgages_df.columns)

print("\n--- Fed Interest Rate DataFrame Columns ---")
print(fed_interest_df.columns)

print("\n--- Unemployment DataFrame Columns ---")
print(unemployment_df.columns)

print("\n--- Stock DataFrame Columns ---")
print(stock_df.columns)

print("\n--- CPI DataFrame Columns ---")
print(cpi_df.columns)

print("\n--- Wage DataFrame Columns ---")
print(wage_df.columns)



--- Layoffs DataFrame Columns ---
Index(['period', 'laid_off_by_month', 'funds_raised_by_month'], dtype='object')

--- Mortgages DataFrame Columns ---
Index(['YearMonth', 'MORTGAGE30US'], dtype='object')

--- Fed Interest Rate DataFrame Columns ---
Index(['DATE', 'FEDFUNDS'], dtype='object')

--- Unemployment DataFrame Columns ---
Index(['DATE', 'UNRATE'], dtype='object')

--- Stock DataFrame Columns ---
Index(['Date', 'stock_open', 'stock_high', 'stock_low', 'stock_close',
       'stock_adj_close', 'stock_volume'],
      dtype='object')

--- CPI DataFrame Columns ---
Index(['Date', 'Series ID', 'Value', 'Inflation Rate', 'Commodity'], dtype='object')

--- Wage DataFrame Columns ---
Index(['Date', 'Series ID', 'Value', 'Inflation Rate', 'Category'], dtype='object')


In [7]:
# Layoffs Data: Column is named 'period'
layoffs_df['Date'] = pd.to_datetime(layoffs_df['period']).dt.to_period('M')

# Mortgages Data: Column is 'YearMonth'
mortgages_df['Date'] = pd.to_datetime(mortgages_df['YearMonth']).dt.to_period('M')

# Fed Interest Rate Data: Column is 'DATE'
fed_interest_df['Date'] = pd.to_datetime(fed_interest_df['DATE'], format='%Y/%m').dt.to_period('M')

# Unemployment Data: Column is 'DATE'
unemployment_df['Date'] = pd.to_datetime(unemployment_df['DATE'], format='%m/%d/%Y').dt.to_period('M')

# Stock Data: Column is 'Date'
stock_df['Date'] = pd.to_datetime(stock_df['Date'], format='%Y/%m').dt.to_period('M')

# CPI and Wage Data: Already in correct format, but ensure they are consistently in 'YYYY-MM'
cpi_df['Date'] = pd.to_datetime(cpi_df['Date']).dt.to_period('M')
wage_df['Date'] = pd.to_datetime(wage_df['Date']).dt.to_period('M')

# Drop the old period/date columns if no longer needed
layoffs_df = layoffs_df.drop(columns=['period'])
mortgages_df = mortgages_df.drop(columns=['YearMonth'])
fed_interest_df = fed_interest_df.drop(columns=['DATE'])
unemployment_df = unemployment_df.drop(columns=['DATE'])

  layoffs_df['Date'] = pd.to_datetime(layoffs_df['period']).dt.to_period('M')


In [8]:
# Preview the updated DataFrames after the date conversions
print("\n--- Layoffs DataFrame (After Date Conversion) ---")
print(layoffs_df.head())

print("\n--- Mortgages DataFrame (After Date Conversion) ---")
print(mortgages_df.head())

print("\n--- Fed Interest Rate DataFrame (After Date Conversion) ---")
print(fed_interest_df.head())

print("\n--- Unemployment DataFrame (After Date Conversion) ---")
print(unemployment_df.head())

print("\n--- Stock DataFrame (After Date Conversion) ---")
print(stock_df.head())

# CPI and Wage DataFrames are already in YYYY-MM format
print("\n--- CPI DataFrame (Already in YYYY-MM Format) ---")
print(cpi_df.head())

print("\n--- Wage DataFrame (Already in YYYY-MM Format) ---")
print(wage_df.head())


--- Layoffs DataFrame (After Date Conversion) ---
   laid_off_by_month  funds_raised_by_month     Date
0             7850.0                15530.2  2020-03
1            19821.0                43862.0  2020-04
2            14674.0                74191.0  2020-05
3             3926.0                11724.1  2020-06
4             1612.0                 4447.0  2020-07

--- Mortgages DataFrame (After Date Conversion) ---
   MORTGAGE30US     Date
0        4.0325  2018-01
1        4.3300  2018-02
2        4.4440  2018-03
3        4.4675  2018-04
4        4.5860  2018-05

--- Fed Interest Rate DataFrame (After Date Conversion) ---
   FEDFUNDS     Date
0      2.13  2019-08
1      2.04  2019-09
2      1.83  2019-10
3      1.55  2019-11
4      1.55  2019-12

--- Unemployment DataFrame (After Date Conversion) ---
   UNRATE     Date
0     3.6  2019-08
1     3.5  2019-09
2     3.6  2019-10
3     3.6  2019-11
4     3.6  2019-12

--- Stock DataFrame (After Date Conversion) ---
      Date  stock_open

In [9]:
# Merge DataFrames on 'Date' using outer join to keep all data
merged_df = layoffs_df.merge(mortgages_df[['Date', 'MORTGAGE30US']], on='Date', how='outer')
merged_df = merged_df.merge(fed_interest_df[['Date', 'FEDFUNDS']], on='Date', how='outer')
merged_df = merged_df.merge(unemployment_df[['Date', 'UNRATE']], on='Date', how='outer')
merged_df = merged_df.merge(stock_df[['Date', 'stock_open', 'stock_high', 'stock_low', 'stock_close', 'stock_adj_close', 'stock_volume']], on='Date', how='outer')
merged_df = merged_df.merge(cpi_df[['Date', 'Value', 'Inflation Rate', 'Commodity']], on='Date', how='outer')
merged_df = merged_df.merge(wage_df[['Date', 'Value', 'Inflation Rate', 'Category']], on='Date', how='outer', suffixes=('_CPI', '_Wage'))

# Preview the merged DataFrame before handling missing values
print("\n--- Merged DataFrame (Before Handling Missing Values) ---")
print(merged_df.head())


--- Merged DataFrame (Before Handling Missing Values) ---
   laid_off_by_month  funds_raised_by_month     Date  MORTGAGE30US  FEDFUNDS  \
0             7850.0                15530.2  2020-03         3.450      0.65   
1             7850.0                15530.2  2020-03         3.450      0.65   
2             7850.0                15530.2  2020-03         3.450      0.65   
3            19821.0                43862.0  2020-04         3.306      0.05   
4            19821.0                43862.0  2020-04         3.306      0.05   

   UNRATE  stock_open  stock_high  stock_low  stock_close  stock_adj_close  \
0     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
1     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
2     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
3    14.8     2498.08     2954.86    2447.49      2912.43          2912.43   
4    14.8     2498.08     2954.86    2447.49      2912.43          291

In [10]:
# Handle missing values using forward fill to fill in gaps from different data sources
merged_df.fillna(method='ffill', inplace=True)

# Preview the merged DataFrame after handling missing values
print("\n--- Merged DataFrame (After Handling Missing Values) ---")
print(merged_df.head())


--- Merged DataFrame (After Handling Missing Values) ---
   laid_off_by_month  funds_raised_by_month     Date  MORTGAGE30US  FEDFUNDS  \
0             7850.0                15530.2  2020-03         3.450      0.65   
1             7850.0                15530.2  2020-03         3.450      0.65   
2             7850.0                15530.2  2020-03         3.450      0.65   
3            19821.0                43862.0  2020-04         3.306      0.05   
4            19821.0                43862.0  2020-04         3.306      0.05   

   UNRATE  stock_open  stock_high  stock_low  stock_close  stock_adj_close  \
0     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
1     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
2     4.4     2974.28     3136.72    2191.86      2584.59          2584.59   
3    14.8     2498.08     2954.86    2447.49      2912.43          2912.43   
4    14.8     2498.08     2954.86    2447.49      2912.43          2912

  merged_df.fillna(method='ffill', inplace=True)


In [11]:
# Save the merged DataFrame to a CSV file for further analysis
output_path = 'Resources/Output/collated_data_2.csv'
merged_df.to_csv(output_path, index=True)
print(f"\nCollated data saved to {output_path}")


Collated data saved to Resources/Output/collated_data_2.csv
