In [15]:
import pandas as pd

# Load the collated dataset
df = pd.read_csv('Resources/Output/collated_data_2.csv')

# Convert 'Date' column to datetime if it's not already
df['Date'] = pd.to_datetime(df['Date'])

# Set 'Date' as index for time series modeling
df.set_index('Date', inplace=True)

# Preview the dataset
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 416 entries, 2020-03-01 to 2024-09-01
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             416 non-null    int64  
 1   laid_off_by_month      416 non-null    float64
 2   funds_raised_by_month  416 non-null    float64
 3   MORTGAGE30US           416 non-null    float64
 4   FEDFUNDS               416 non-null    float64
 5   UNRATE                 416 non-null    float64
 6   stock_open             416 non-null    float64
 7   stock_high             416 non-null    float64
 8   stock_low              416 non-null    float64
 9   stock_close            416 non-null    float64
 10  stock_adj_close        416 non-null    float64
 11  stock_volume           416 non-null    float64
 12  Value_CPI              416 non-null    float64
 13  Inflation Rate_CPI     416 non-null    float64
 14  Commodity              416 non-null    

In [11]:
# Handle missing values by filling them with forward-fill method
df.fillna(method='ffill', inplace=True)

# Feature Engineering: Create lagged features (lags of 1, 2, 3 months)
lags = [1, 2, 3]
for col in ['MORTGAGE30US', 'FEDFUNDS', 'stock_open', 'stock_close', 'UNRATE', 'Value_CPI', 'Inflation Rate_CPI', 'Value_Wage', 'Inflation Rate_Wage']:
    for lag in lags:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)

# Drop rows with missing values after creating lagged features
df.dropna(inplace=True)

df


  df.fillna(method='ffill', inplace=True)


Unnamed: 0_level_0,Unnamed: 0,laid_off_by_month,funds_raised_by_month,MORTGAGE30US,FEDFUNDS,UNRATE,stock_open,stock_high,stock_low,stock_close,...,Inflation Rate_CPI_lag3,Value_Wage_lag1,Value_Wage_lag2,Value_Wage_lag3,Inflation Rate_Wage_lag1,Inflation Rate_Wage_lag2,Inflation Rate_Wage_lag3,stock_close_lag1,stock_close_lag2,stock_close_lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-01,12,14674.0,74191.0,3.2325,0.05,13.2,2869.09,3068.67,2766.64,3044.31,...,-0.007078,23.64,45.37,50.73,0.552956,-0.241865,-0.177096,2912.43,2912.43,2912.43
2020-05-01,13,14674.0,74191.0,3.2325,0.05,13.2,2869.09,3068.67,2766.64,3044.31,...,-0.007078,23.64,23.64,45.37,0.552956,0.552956,-0.241865,3044.31,2912.43,2912.43
2020-05-01,14,14674.0,74191.0,3.2325,0.05,13.2,2869.09,3068.67,2766.64,3044.31,...,-0.007078,23.64,23.64,23.64,0.552956,0.552956,0.552956,3044.31,3044.31,2912.43
2020-06-01,15,3926.0,11724.1,3.1625,0.08,11.0,3038.78,3233.13,2965.66,3100.29,...,-0.505862,23.64,23.64,23.64,0.552956,0.552956,0.552956,3044.31,3044.31,3044.31
2020-06-01,16,3926.0,11724.1,3.1625,0.08,11.0,3038.78,3233.13,2965.66,3100.29,...,-0.226183,23.64,23.64,23.64,0.552956,0.552956,0.552956,3100.29,3044.31,3044.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-01,411,19335.0,3488.0,2.8680,0.10,5.4,4300.73,4429.97,4233.13,4395.26,...,1.016879,47.17,51.67,25.16,4.220062,2.337096,1.697656,4395.26,4395.26,4395.26
2021-07-01,412,19335.0,3488.0,2.8680,0.10,5.4,4300.73,4429.97,4233.13,4395.26,...,0.675018,25.16,47.17,51.67,1.697656,4.220062,2.337096,4395.26,4395.26,4395.26
2021-07-01,413,19335.0,3488.0,2.8680,0.10,5.4,4300.73,4429.97,4233.13,4395.26,...,0.675018,51.67,25.16,47.17,2.337096,1.697656,4.220062,4395.26,4395.26,4395.26
2021-07-01,414,19335.0,3488.0,2.8680,0.10,5.4,4300.73,4429.97,4233.13,4395.26,...,0.675018,47.17,51.67,25.16,4.220062,2.337096,1.697656,4395.26,4395.26,4395.26


In [17]:
# Drop duplicate rows from the DataFrame
df_cleaned = df.drop_duplicates()

df_cleaned

Unnamed: 0_level_0,Unnamed: 0,laid_off_by_month,funds_raised_by_month,MORTGAGE30US,FEDFUNDS,UNRATE,stock_open,stock_high,stock_low,stock_close,stock_adj_close,stock_volume,Value_CPI,Inflation Rate_CPI,Commodity,Value_Wage,Inflation Rate_Wage,Category
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-03-01,0,7850.0,15530.2,3.450,0.65,4.4,2974.28,3136.72,2191.86,2584.59,2584.59,1.621854e+11,49.722,0.513463,Information Technology Commodities,,,
2020-03-01,1,7850.0,15530.2,3.450,0.65,4.4,2974.28,3136.72,2191.86,2584.59,2584.59,1.621854e+11,65.649,0.738092,Education and Communication Commodities,,,
2020-03-01,2,7850.0,15530.2,3.450,0.65,4.4,2974.28,3136.72,2191.86,2584.59,2584.59,1.621854e+11,113.031,-0.145765,Education and Communication Services,,,
2020-04-01,3,19821.0,43862.0,3.306,0.05,14.8,2498.08,2954.86,2447.49,2912.43,2912.43,1.236082e+11,49.816,0.189051,Information Technology Commodities,50.73,-0.177096,Total Compensation
2020-04-01,4,19821.0,43862.0,3.306,0.05,14.8,2498.08,2954.86,2447.49,2912.43,2912.43,1.236082e+11,49.816,0.189051,Information Technology Commodities,45.37,-0.241865,Wages and Salaries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-01,411,19335.0,3488.0,2.868,0.10,5.4,4300.73,4429.97,4233.13,4395.26,4395.26,8.425562e+10,63.088,0.675018,Education and Communication Commodities,25.16,1.697656,Benefits
2021-07-01,412,19335.0,3488.0,2.868,0.10,5.4,4300.73,4429.97,4233.13,4395.26,4395.26,8.425562e+10,115.882,0.081183,Education and Communication Services,51.67,2.337096,Total Compensation
2021-07-01,413,19335.0,3488.0,2.868,0.10,5.4,4300.73,4429.97,4233.13,4395.26,4395.26,8.425562e+10,115.882,0.081183,Education and Communication Services,47.17,4.220062,Wages and Salaries
2021-07-01,414,19335.0,3488.0,2.868,0.10,5.4,4300.73,4429.97,4233.13,4395.26,4395.26,8.425562e+10,115.882,0.081183,Education and Communication Services,25.16,1.697656,Benefits


In [5]:
# Split into X (features) and y (target)
X = df.drop(columns=['laid_off_by_month'])  # All features
y = df['laid_off_by_month']  # Target: layoffs

# Train-test split (80% training, 20% testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (325, 44), Test shape: (82, 44)


In [8]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# One-hot encoding for the categorical variables
X_train_encoded = pd.get_dummies(X_train, drop_first=True)  # drop_first avoids multicollinearity
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Align the columns in the test set with the train set (this fills missing columns with 0 in test set)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Check the shapes of the encoded datasets
print(f'Train shape after encoding: {X_train_encoded.shape}')
print(f'Test shape after encoding: {X_test_encoded.shape}')



Train shape after encoding: (325, 47)
Test shape after encoding: (82, 47)


In [9]:
# Initialize Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model with the encoded training data
rf_model.fit(X_train_encoded, y_train)

# Make predictions on the encoded test data
y_pred_rf = rf_model.predict(X_test_encoded)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest - Mean Squared Error: {mse_rf}')
print(f'Random Forest - R-squared: {r2_rf}')


Random Forest - Mean Squared Error: 4214953.522959756
Random Forest - R-squared: 0.9646585006506873
