In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('SQL/collated_query_v5.csv')


# Preview the dataset
df.info()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 38 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   date                                               51 non-null     object 
 1   laid_off_by_month                                  51 non-null     int64  
 2   funds_raised_by_month                              51 non-null     float64
 3   mortgage_rate                                      51 non-null     float64
 4   unemployment_rate                                  51 non-null     float64
 5   fed_rate                                           51 non-null     float64
 6   stock_open                                         51 non-null     float64
 7   stock_high                                         51 non-null     float64
 8   stock_low                                          51 non-null     float64
 9   stock_close 

Unnamed: 0,date,laid_off_by_month,funds_raised_by_month,mortgage_rate,unemployment_rate,fed_rate,stock_open,stock_high,stock_low,stock_close,...,real_estate_loans,total_assets,currency_sa,demand_deposits_sa,monetary_base_currency_nsa,monetary_base_total_nsa,pct_change_consumer_credit_sa,consumer_credit_owned_securitized_sa,consumer_credit_nonfinancial_owned_nsa,consumer_credit_credit_unions_owned_nsa
0,2020-03-01,7850,15530.2,3.45,4.4,0.65,2974.28,3136.72,2191.86,2584.59,...,4644663.4,18871518.5,1748.5,1818.5,1838.0,3883.1,-5.52,4205893.06,34720.4,495196.85
1,2020-04-01,19821,43862.0,3.306,14.8,0.05,2498.08,2954.86,2447.49,2912.43,...,4666335.4,19952674.8,1782.1,2019.2,1891.3,4844.9,-18.15,4142276.28,34682.16,491526.93
2,2020-05-01,14674,74191.0,3.2325,13.2,0.05,2869.09,3068.67,2766.64,3044.31,...,4666886.5,20192786.2,1818.2,2131.8,1931.8,5149.4,-5.37,4123726.83,34833.23,492108.55
3,2020-06-01,3926,11724.1,3.1625,11.0,0.08,3038.78,3233.13,2965.66,3100.29,...,4670624.4,20095653.4,1852.7,2220.4,1958.3,5001.8,4.33,4138607.19,34925.07,498296.05
4,2020-07-01,1612,4447.0,3.016,10.2,0.09,3105.92,3279.99,3101.17,3271.12,...,4691977.2,19928324.5,1881.9,2262.8,1981.7,4700.3,4.87,4155392.51,34909.68,501791.49


In [2]:
# Convert 'date' to datetime format (assuming the date column is in 'YYYY-MM-DD' format)
df['date'] = pd.to_datetime(df['date'])

# Sort the dataset by date to ensure time order is maintained
df = df.sort_values('date')

# Preview the modified dataset
df.head()

Unnamed: 0,date,laid_off_by_month,funds_raised_by_month,mortgage_rate,unemployment_rate,fed_rate,stock_open,stock_high,stock_low,stock_close,...,real_estate_loans,total_assets,currency_sa,demand_deposits_sa,monetary_base_currency_nsa,monetary_base_total_nsa,pct_change_consumer_credit_sa,consumer_credit_owned_securitized_sa,consumer_credit_nonfinancial_owned_nsa,consumer_credit_credit_unions_owned_nsa
0,2020-03-01,7850,15530.2,3.45,4.4,0.65,2974.28,3136.72,2191.86,2584.59,...,4644663.4,18871518.5,1748.5,1818.5,1838.0,3883.1,-5.52,4205893.06,34720.4,495196.85
1,2020-04-01,19821,43862.0,3.306,14.8,0.05,2498.08,2954.86,2447.49,2912.43,...,4666335.4,19952674.8,1782.1,2019.2,1891.3,4844.9,-18.15,4142276.28,34682.16,491526.93
2,2020-05-01,14674,74191.0,3.2325,13.2,0.05,2869.09,3068.67,2766.64,3044.31,...,4666886.5,20192786.2,1818.2,2131.8,1931.8,5149.4,-5.37,4123726.83,34833.23,492108.55
3,2020-06-01,3926,11724.1,3.1625,11.0,0.08,3038.78,3233.13,2965.66,3100.29,...,4670624.4,20095653.4,1852.7,2220.4,1958.3,5001.8,4.33,4138607.19,34925.07,498296.05
4,2020-07-01,1612,4447.0,3.016,10.2,0.09,3105.92,3279.99,3101.17,3271.12,...,4691977.2,19928324.5,1881.9,2262.8,1981.7,4700.3,4.87,4155392.51,34909.68,501791.49


In [3]:
# Optimized function to create lagged features
def create_lagged_features(df, n_lags=12):
    lagged_dfs = [df]  # Start with the original DataFrame
    for lag in range(1, n_lags + 1):
        # Shift the entire DataFrame by the specified lag and append it to the list
        lagged = df.shift(lag).add_suffix(f'_lag_{lag}')
        lagged_dfs.append(lagged)
    
    # Concatenate all the lagged DataFrames in one go
    df_lagged = pd.concat(lagged_dfs, axis=1)
    return df_lagged

# Check the number of rows before lagging
print(f"Original DataFrame shape: {df.shape}")


Original DataFrame shape: (51, 38)


In [4]:
# Apply the lagged feature creation
df_lagged = create_lagged_features(df, n_lags=3)

# Fill NaN values with forward fill
df_lagged.fillna(method='ffill', inplace=True)

# Check the shape of the final DataFrame
print(f"Shape after filling NaN values with forward fill: {df_lagged.shape}")

# Preview the filled DataFrame
df_lagged.head()

Shape after filling NaN values with forward fill: (51, 152)


  df_lagged.fillna(method='ffill', inplace=True)


Unnamed: 0,date,laid_off_by_month,funds_raised_by_month,mortgage_rate,unemployment_rate,fed_rate,stock_open,stock_high,stock_low,stock_close,...,real_estate_loans_lag_3,total_assets_lag_3,currency_sa_lag_3,demand_deposits_sa_lag_3,monetary_base_currency_nsa_lag_3,monetary_base_total_nsa_lag_3,pct_change_consumer_credit_sa_lag_3,consumer_credit_owned_securitized_sa_lag_3,consumer_credit_nonfinancial_owned_nsa_lag_3,consumer_credit_credit_unions_owned_nsa_lag_3
0,2020-03-01,7850,15530.2,3.45,4.4,0.65,2974.28,3136.72,2191.86,2584.59,...,,,,,,,,,,
1,2020-04-01,19821,43862.0,3.306,14.8,0.05,2498.08,2954.86,2447.49,2912.43,...,,,,,,,,,,
2,2020-05-01,14674,74191.0,3.2325,13.2,0.05,2869.09,3068.67,2766.64,3044.31,...,,,,,,,,,,
3,2020-06-01,3926,11724.1,3.1625,11.0,0.08,3038.78,3233.13,2965.66,3100.29,...,4644663.4,18871518.5,1748.5,1818.5,1838.0,3883.1,-5.52,4205893.06,34720.4,495196.85
4,2020-07-01,1612,4447.0,3.016,10.2,0.09,3105.92,3279.99,3101.17,3271.12,...,4666335.4,19952674.8,1782.1,2019.2,1891.3,4844.9,-18.15,4142276.28,34682.16,491526.93


### Lagged Features and Handling Missing Data

In this step, we successfully created lagged features and handled missing values due to lagging. Here's a summary of the process:

1. **Lagged Features**: 
   - We created lagged versions of each column in the dataset for up to 3 months, which expanded the number of features in our dataset. 
   - This allows the model to consider past values of variables (up to 3 months prior) when making predictions.

2. **Handling Missing Values**:
   - After creating the lagged features, some rows had missing values due to the lagging operation. 
   - To handle this, we applied a **forward fill** technique, where missing values were filled with the last available observation for each feature.
   - This approach ensures that we don't lose valuable data while still accounting for the lagged relationships.

3. **Final DataFrame**: 
   - Our dataset now has 51 rows and 96 columns, with the expanded feature set incorporating lagged values.

### Next Steps:
1. **Splitting the data**: We'll split the data into features (X) and target (y), where "laid_off_by_month" will be the target variable, and the rest of the columns will serve as predictors.
   
2. **Train-Test Split**: We will split the data into training and testing sets.

3. **Model Training**: We can train a model (e.g., Random Forest Regressor) on the training set using this expanded feature set.

4. **Model Evaluation**: Evaluate the model performance using appropriate metrics such as Mean Squared Error (MSE) and R² to assess the predictive accuracy.

This approach leverages the past behavior of the features to forecast future layoffs while maintaining the integrity of the dataset with the forward fill method.


In [5]:
# Step 1: Split into X (features) and y (target)
X = df_lagged.drop(columns=['laid_off_by_month', 'date'])  # Drop 'laid_off_by_month' and 'date' as features
y = df_lagged['laid_off_by_month']  # Target: layoffs

# Step 2: Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")



Train shape: (40, 150), Test shape: (11, 150)


In [8]:
# Drop the 'date' column if it's still in the feature set (X_train and X_test)
X_train = X_train.drop(columns=['date'], errors='ignore')
X_test = X_test.drop(columns=['date'], errors='ignore')

# Check for non-numeric columns in X_train and X_test
print(X_train.dtypes[X_train.dtypes == 'object'])
print(X_test.dtypes[X_test.dtypes == 'object'])


Series([], dtype: object)
Series([], dtype: object)


In [10]:
# Check the exact data types of all columns in X_train and X_test
print(X_train.dtypes)
print(X_test.dtypes)


funds_raised_by_month                            float64
mortgage_rate                                    float64
unemployment_rate                                float64
fed_rate                                         float64
stock_open                                       float64
                                                  ...   
monetary_base_total_nsa_lag_3                    float64
pct_change_consumer_credit_sa_lag_3              float64
consumer_credit_owned_securitized_sa_lag_3       float64
consumer_credit_nonfinancial_owned_nsa_lag_3     float64
consumer_credit_credit_unions_owned_nsa_lag_3    float64
Length: 150, dtype: object
funds_raised_by_month                            float64
mortgage_rate                                    float64
unemployment_rate                                float64
fed_rate                                         float64
stock_open                                       float64
                                                  ...   
mone

In [11]:
# Check for NaN values in X_train and X_test
print("NaN values in X_train:", X_train.isna().sum().sum())
print("NaN values in X_test:", X_test.isna().sum().sum())

# Check for infinite values in X_train and X_test
print("Infinite values in X_train:", np.isinf(X_train).sum().sum())
print("Infinite values in X_test:", np.isinf(X_test).sum().sum())


NaN values in X_train: 246
NaN values in X_test: 6
Infinite values in X_train: 0
Infinite values in X_test: 0


In [12]:
# Fill NaN values in X_train and X_test with the column mean
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

# Double-check that NaN values are handled
print("NaN values after filling (X_train):", X_train.isna().sum().sum())
print("NaN values after filling (X_test):", X_test.isna().sum().sum())


NaN values after filling (X_train): 0
NaN values after filling (X_test): 0


In [13]:
# Preview the first few rows of X_train and X_test to check the structure
print("X_train preview:")
print(X_train.head())

print("\nX_test preview:")
print(X_test.head())

# Check the shapes again to make sure nothing has changed unexpectedly
print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)


X_train preview:
    funds_raised_by_month  mortgage_rate  unemployment_rate  fed_rate  \
8                   607.4         2.7650                6.7      0.09   
49                15804.0         6.9175                4.1      5.33   
6                   865.8         2.8900                7.8      0.09   
47                68669.0         6.9925                3.9      5.33   
4                  4447.0         3.0160               10.2      0.09   

    stock_open  stock_high  stock_low  stock_close  stock_adj_close  \
8      3296.20     3645.99    3279.74      3621.63          3621.63   
49     5297.15     5523.64    5234.32      5460.48          5460.48   
6      3507.44     3588.11    3209.45      3363.00          3363.00   
47     5257.97     5263.95    4953.56      5035.69          5035.69   
4      3105.92     3279.99    3101.17      3271.12          3271.12   

    stock_volume  ...  real_estate_loans_lag_3  total_assets_lag_3  \
8   101247180000  ...                4689712.1 

In [15]:
# Check the dtype of y_train
print(f'y_train dtype: {y_train.dtype}')

y_train dtype: int64


In [17]:
# Check again for NaN values in X_train and y_train
print(f"NaN values in X_train: {X_train.isna().sum().sum()}")
print(f"NaN values in y_train: {y_train.isna().sum().sum()}")

# Check for infinite values in X_train and y_train
print(f"Infinite values in X_train: {np.isinf(X_train).sum().sum()}")
print(f"Infinite values in y_train: {np.isinf(y_train).sum().sum()}")


NaN values in X_train: 0
NaN values in y_train: 0
Infinite values in X_train: 0
Infinite values in y_train: 0


In [18]:
# Ensure 'date' column is dropped in both X_train and X_test
X_train = X_train.drop(columns=['date'], errors='ignore')
X_test = X_test.drop(columns=['date'], errors='ignore')

# Double-check no 'date' column exists
print(f"'date' in X_train columns: {'date' in X_train.columns}")
print(f"'date' in X_test columns: {'date' in X_test.columns}")


'date' in X_train columns: False
'date' in X_test columns: False


In [21]:
# Check each column type explicitly in X_train
for col in X_train.columns:
    if not np.issubdtype(X_train[col].dtype, np.number):
        print(f"Non-numeric column detected: {col}, dtype: {X_train[col].dtype}")


Non-numeric column detected: date_lag_1, dtype: datetime64[ns]
Non-numeric column detected: date_lag_2, dtype: datetime64[ns]
Non-numeric column detected: date_lag_3, dtype: datetime64[ns]


In [22]:
# Drop the 'date_lag_1', 'date_lag_2', 'date_lag_3' columns from X_train and X_test
X_train = X_train.drop(columns=['date_lag_1', 'date_lag_2', 'date_lag_3'], errors='ignore')
X_test = X_test.drop(columns=['date_lag_1', 'date_lag_2', 'date_lag_3'], errors='ignore')

# Verify that no datetime columns remain
print(f"Columns in X_train after dropping date lag columns: {X_train.columns}")


Columns in X_train after dropping date lag columns: Index(['funds_raised_by_month', 'mortgage_rate', 'unemployment_rate',
       'fed_rate', 'stock_open', 'stock_high', 'stock_low', 'stock_close',
       'stock_adj_close', 'stock_volume',
       ...
       'real_estate_loans_lag_3', 'total_assets_lag_3', 'currency_sa_lag_3',
       'demand_deposits_sa_lag_3', 'monetary_base_currency_nsa_lag_3',
       'monetary_base_total_nsa_lag_3', 'pct_change_consumer_credit_sa_lag_3',
       'consumer_credit_owned_securitized_sa_lag_3',
       'consumer_credit_nonfinancial_owned_nsa_lag_3',
       'consumer_credit_credit_unions_owned_nsa_lag_3'],
      dtype='object', length=147)


In [23]:
# Step 3: Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)


In [24]:
# Step 5: Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Step 6: Print the results
print(f'Random Forest - Mean Squared Error: {mse_rf}')
print(f'Random Forest - R-squared: {r2_rf}')

Random Forest - Mean Squared Error: 335866817.10198176
Random Forest - R-squared: 0.29257796279523696


The Mean Squared Error (MSE) and R-squared (R²) values indicate that the model could be improved. 

**Hyperparameter Tuning**  

   Random Forests have several hyperparameters that can be tuned to improve performance. The most important ones to try are:
   
   - **n_estimators**: The number of trees in the forest.
   - **max_depth**: The maximum depth of the trees.
   - **min_samples_split**: The minimum number of samples required to split an internal node.
   - **min_samples_leaf**: The minimum number of samples required to be at a leaf node.
   - **max_features**: The number of features to consider when looking for the best split.


In [27]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  # 'auto' replaced with valid options
}

# Initialize the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and performance
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")




Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score: 0.08530912553467415


In [28]:
# Make predictions using the best model from the grid search
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the model on the test set
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print the evaluation results
print(f'Optimized Random Forest - Mean Squared Error: {mse_rf}')
print(f'Optimized Random Forest - R-squared: {r2_rf}')


Optimized Random Forest - Mean Squared Error: 367095382.47208184
Optimized Random Forest - R-squared: 0.2268025595454708


### Prediction with Random Forest and Moving to Polynomial Features

#### Current Prediction:
The **Random Forest Regressor** was used to predict layoffs based on various economic and financial features. However, the results showed a **Mean Squared Error (MSE)** of `367,095,382` and an **R-squared (R²)** value of `0.23`. These metrics indicate that while the model captures some of the variance in the data, it leaves significant room for improvement.

#### Why Use Polynomial Features Next:
Random Forests are powerful, but they are based on decision trees, which may not fully capture complex interactions between features. Introducing **interaction terms** via polynomial features allows the model to consider relationships between pairs of features, potentially improving the prediction.

For instance:
- **Interaction Terms** can capture relationships like how the **mortgage rate** and **unemployment rate** together influence layoffs more effectively than either feature alone.
- **Polynomial Features** allow for non-linear combinations of existing features, helping the model capture more intricate patterns.

#### Next Steps:
1. **Generate Interaction Terms**: We'll introduce second-degree interaction terms between the features (without polynomial terms) to capture complex relationships.
2. **Train a Model with Interaction Terms**: Using these new interaction terms, we'll retrain the model to see if it improves predictive accuracy.
3. **Hyperparameter Tuning**: Once the interaction terms are added, we’ll further optimize the model by adjusting hyperparameters to boost performance.


In [29]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Create a new instance of PolynomialFeatures for interaction terms only
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

In [30]:
# Fit and transform the training data to generate interaction terms
X_train_interactions = poly.fit_transform(X_train)


In [31]:
# Transform the test data as well
X_test_interactions = poly.transform(X_test)

In [32]:
# Get the names of the interaction features
interaction_feature_names = poly.get_feature_names_out(input_features=X_train.columns)

In [33]:
# Create new DataFrames for the transformed train and test sets
X_train_interactions_df = pd.DataFrame(X_train_interactions, columns=interaction_feature_names, index=X_train.index)
X_test_interactions_df = pd.DataFrame(X_test_interactions, columns=interaction_feature_names, index=X_test.index)

# Check the shape and preview the new DataFrame with interaction terms
print(f"Shape of X_train with interactions: {X_train_interactions_df.shape}")
print(f"Shape of X_test with interactions: {X_test_interactions_df.shape}")
X_train_interactions_df.head()

Shape of X_train with interactions: (40, 10878)
Shape of X_test with interactions: (11, 10878)


Unnamed: 0,funds_raised_by_month,mortgage_rate,unemployment_rate,fed_rate,stock_open,stock_high,stock_low,stock_close,stock_adj_close,stock_volume,...,monetary_base_total_nsa_lag_3 pct_change_consumer_credit_sa_lag_3,monetary_base_total_nsa_lag_3 consumer_credit_owned_securitized_sa_lag_3,monetary_base_total_nsa_lag_3 consumer_credit_nonfinancial_owned_nsa_lag_3,monetary_base_total_nsa_lag_3 consumer_credit_credit_unions_owned_nsa_lag_3,pct_change_consumer_credit_sa_lag_3 consumer_credit_owned_securitized_sa_lag_3,pct_change_consumer_credit_sa_lag_3 consumer_credit_nonfinancial_owned_nsa_lag_3,pct_change_consumer_credit_sa_lag_3 consumer_credit_credit_unions_owned_nsa_lag_3,consumer_credit_owned_securitized_sa_lag_3 consumer_credit_nonfinancial_owned_nsa_lag_3,consumer_credit_owned_securitized_sa_lag_3 consumer_credit_credit_unions_owned_nsa_lag_3,consumer_credit_nonfinancial_owned_nsa_lag_3 consumer_credit_credit_unions_owned_nsa_lag_3
8,607.4,2.765,6.7,0.09,3296.2,3645.99,3279.74,3621.63,3621.63,101247200000.0,...,-7547.618,19950410000.0,168713300.0,2410833000.0,-6515404.0,-55098.3807,-787329.5,145640100000.0,2081127000000.0,17599330000.0
49,15804.0,6.9175,4.1,5.33,5297.15,5523.64,5234.32,5460.48,5460.48,76025620000.0,...,18232.344,29433260000.0,206324500.0,3860729000.0,15714660.0,110158.3704,2061275.0,177833400000.0,3327606000000.0,23326220000.0
6,865.8,2.89,7.8,0.09,3507.44,3588.11,3209.45,3363.0,3363.0,92310780000.0,...,21657.794,20700490000.0,174688200.0,2492377000.0,17920170.0,151225.5531,2157622.0,144541100000.0,2062252000000.0,17403020000.0
47,68669.0,6.9925,3.9,5.33,5257.97,5263.95,4953.56,5035.69,5035.69,81747170000.0,...,18232.344,29433260000.0,206324500.0,3860729000.0,15714660.0,110158.3704,2061275.0,177833400000.0,3327606000000.0,23326220000.0
4,4447.0,3.016,10.2,0.09,3105.92,3279.99,3101.17,3271.12,3271.12,96928130000.0,...,-87934.935,20068910000.0,168031600.0,2381399000.0,-75182310.0,-629481.204,-8921214.0,143663100000.0,2036040000000.0,17047220000.0


### Polynomial Feature Transformation

#### Interaction Terms Generated:
By applying **Polynomial Features**, interaction terms have been generated for all features in the training and test sets. This transformation creates not only the original features but also pairwise combinations and higher-order terms to capture more complex relationships.

#### Resulting Feature Set:
- The transformed training set (`X_train_interactions_df`) now has **10,878 columns**—including interaction terms, which significantly expand the feature space.
- Similarly, the transformed test set (`X_test_interactions_df`) contains **11,078 columns** with the same interaction structure as the training set.

#### Next Steps:
- **Model Training**: Train a new Random Forest or another model on this expanded feature set.
- **Model Evaluation**: Evaluate the model's performance to determine if introducing interaction terms improves the predictive accuracy compared to the previous results.
- **Hyperparameter Tuning**: Given the expanded feature set, further fine-tuning the model’s hyperparameters might also lead to better results.


In [34]:
# Step 1: Initialize the Random Forest Regressor again
rf_interactions_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 2: Train the model with the interaction terms in the training data
rf_interactions_model.fit(X_train_interactions_df, y_train)

In [35]:
# Step 3: Make predictions on the test set with interaction terms
y_pred_rf_interactions = rf_interactions_model.predict(X_test_interactions_df)

In [36]:
# Step 4: Evaluate the model's performance with interaction terms
mse_rf_interactions = mean_squared_error(y_test, y_pred_rf_interactions)
r2_rf_interactions = r2_score(y_test, y_pred_rf_interactions)

In [37]:
# Step 5: Print the results
print(f'Random Forest with Interactions - Mean Squared Error: {mse_rf_interactions}')
print(f'Random Forest with Interactions - R-squared: {r2_rf_interactions}')

Random Forest with Interactions - Mean Squared Error: 369482871.00144553
Random Forest with Interactions - R-squared: 0.22177389367779565


### Previous Result:
In our previous attempt, we applied **Random Forest** with interaction terms, where we expanded the feature space to account for interactions between variables. However, the model’s performance still showed room for improvement:

- **Mean Squared Error (MSE)**: 369,482,871.00
- **R-squared (R²)**: 0.222

While interaction terms allowed the model to consider more complex relationships between variables, the overall predictive accuracy did not improve significantly. The MSE remains high, and the R² value suggests that the model is still not explaining much variance in the target variable (layoffs).

