In [67]:
import os
import pickle
import numpy as np
import pandas as pd
from fredapi import Fred
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
# FRED client. The API key is taken from the FRED_API_KEY environment variable so that
# a real key never has to be written into the notebook; the original placeholder is
# kept as the fallback so behaviour is unchanged when the variable is not set.
fred = Fred(api_key=os.environ.get('FRED_API_KEY', 'YOUR_API_KEY_HERE'))
from sklearn.linear_model import ElasticNetCV, ElasticNet

In [68]:
# Project root. Set the IJF_PROJECT_DIR environment variable to run the notebook on
# another machine; without it the original author's path is used unchanged.
PROJECT_DIR = os.environ.get(
    "IJF_PROJECT_DIR",
    "C:/Users/gabeyie/OneDrive - University of Tennessee/Documents/IJF_Paper",
)
os.chdir(PROJECT_DIR)

# All forecast pickles written below go into this folder (relative to the project root)
os.makedirs('Forecasts', exist_ok=True)
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\gabeyie\OneDrive - University of Tennessee\Documents\IJF_Paper


In [69]:
# Daily WTI spot price. The window must run to the END of December 2020: with
# observation_end='2020-12-01' the December monthly mean was computed from a single
# daily quote, unlike every other month in the sample.
oil_price = fred.get_series('DCOILWTICO', observation_start='1986-01-01', observation_end='2020-12-31')

# Monthly average price (month-end stamped); months without any quote inherit the previous value
monthly_oil = oil_price.resample('ME').mean().ffill()

# Target: monthly log return; the first month is lost to differencing
monthly_oil_log = np.log(monthly_oil)
monthly_oil_log_diff = monthly_oil_log.diff().dropna()
Target = monthly_oil_log_diff.values.ravel()

# One-month lag of the target. The first lag does not exist in the sample, so it is
# back-filled, which makes Lag_Target[0] equal to Target[0].
Lag_Target = monthly_oil_log_diff.shift(1).bfill().values.ravel()

In [70]:
# Read the FRED-MD vintage; its first data row holds the per-column transformation codes
df = pd.read_csv('Data/current.csv', header=0)
transformation_codes = df.iloc[0]

# Discard the codes row, then every column that still contains a missing value
df = df.iloc[1:].dropna(axis=1)

# Keep only the codes of the columns that survived the NaN filter
transformation_codes = transformation_codes.loc[df.columns]

# Define transformation functions again
def transform_series(series, code):
    """Apply a FRED-MD transformation code to one series.

    Codes follow the FRED-MD convention:
        1: level, x_t
        2: first difference, x_t - x_{t-1}
        3: second difference, (x_t - x_{t-1}) - (x_{t-1} - x_{t-2})
        4: log(x_t)
        5: first difference of log(x_t)
        6: second difference of log(x_t)
        7: first difference of the growth rate x_t / x_{t-1} - 1

    Parameters
    ----------
    series : pandas.Series
        Numeric series in levels.
    code : int
        Transformation code (1-7).

    Returns
    -------
    pandas.Series
        Transformed series on the same index; leading observations lost to
        differencing are NaN. An unrecognised code returns the input unchanged.
    """
    if code == 1:
        # No transformation
        return series
    elif code == 2:
        # First differences
        return series.diff(1)
    elif code == 3:
        # Second differences: difference the differences (diff(2) would be a two-period change)
        return series.diff(1).diff(1)
    elif code == 4:
        # Log transformation
        return np.log(series)
    elif code == 5:
        # Log differences
        return np.log(series).diff(1)
    elif code == 6:
        # Log second differences
        return np.log(series).diff(1).diff(1)
    elif code == 7:
        # Change in the one-period growth rate
        growth = series / series.shift(1) - 1.0
        return growth.diff(1)
    else:
        # Unknown code: leave the series as it is
        return series

# Transform every series according to its code; the first column ('sasdate') is skipped
for column in df.columns[1:]:
    code = int(transformation_codes[column])  # codes are read from the CSV, so cast to int
    df[column] = transform_series(df[column].astype(float), code)

# Work on a copy of the transformed panel (no lags are actually added here, despite the name)
df_lagged = df.copy()

# Parse the date column so the panel can be restricted to the sample window
df_lagged['sasdate'] = pd.to_datetime(df_lagged['sasdate'], format='%m/%d/%Y')

# Sample window, both ends inclusive
start_date = '1986-01-01'
end_date = '2020-12-31'

# Restrict to the window and index the panel by date
FredMD = (
    df_lagged
    .loc[df_lagged['sasdate'].between(start_date, end_date)]
    .set_index('sasdate')
)

In [71]:
# Fraction of the sample used as the initial training window
train_ratio = 0.625
n_samples = len(monthly_oil_log_diff)
n_train = int(n_samples * train_ratio)

# Chronological split of the oil log-return series
train_data, test_data = (
    monthly_oil_log_diff.iloc[:n_train],
    monthly_oil_log_diff.iloc[n_train:],
)

# Same proportional split applied to the FRED-MD panel
split_index = int(len(FredMD) * train_ratio)
FredMD_train, FredMD_test = FredMD.iloc[:split_index], FredMD.iloc[split_index:]

# Target values as flat arrays
y_train, y_test = train_data.values.ravel(), test_data.values.ravel()

# Dates covered by each part of the split
train_dates, test_dates = train_data.index, test_data.index

# NBER recession dummy, re-stamped to month end (last value of each month, gaps forward-filled)
recessions = (
    fred.get_series('USREC', observation_start='1986-01-01', observation_end='2020-12-01')
    .resample('ME')
    .last()
    .ffill()
)

In [72]:
# Real-time Sahm Rule recession indicator, starting one month after the oil series
# (presumably to line up with the differenced target - confirm)
sahn_index = fred.get_series('SAHMREALTIME', observation_start='1986-02-01', observation_end='2020-12-01')

# Regime flag: True while the indicator is at or below 0.5, False once it exceeds 0.5.
# NOTE(review): the Sahm rule is usually stated as "0.50 or more"; confirm that the
# strict inequality is intended.
recession_expansion = pd.DataFrame(True, index=sahn_index.index, columns=['indicator'])
recession_expansion.loc[sahn_index > 0.5, 'indicator'] = False

# Update flag: True in the first month and in every month where the regime flag
# differs from the previous month's value
regime = recession_expansion['indicator']
update = pd.DataFrame({'indicator': regime.ne(regime.shift())})
update.loc[update.index[0], 'indicator'] = True
updates = update.values.ravel()

In [73]:
# Split the Sahm indicator at the same position as the target and keep flat arrays
sahm_index_train = sahn_index.iloc[:n_train].values.ravel()
sahm_index_test = sahn_index.iloc[n_train:].values.ravel()

In [74]:
# Pre-computed text features (verb-noun / noun-verb collocations), already split into train and test
Article_Verb_Noun_Noun_Verb_train_combined, Article_Verb_Noun_Noun_Verb_test_combined = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Article_Verb_Noun_Noun_Verb_train_combined.csv",
        "Data/Article_Verb_Noun_Noun_Verb_test_combined.csv",
    )
)

In [75]:
# Training sets: drop the first row (presumably to match the differenced target, which
# loses its first month - confirm) and convert to arrays in one step
FredMD_train = np.array(FredMD_train.iloc[1:, :])
Article_Verb_Noun_Noun_Verb_train_combined = np.array(Article_Verb_Noun_Noun_Verb_train_combined.iloc[1:, :])

# Testing sets: converted to arrays as they are
FredMD_test = np.array(FredMD_test)
Article_Verb_Noun_Noun_Verb_test_combined = np.array(Article_Verb_Noun_Noun_Verb_test_combined)

# Full text-feature matrix: training rows followed by testing rows
Text = np.concatenate([Article_Verb_Noun_Noun_Verb_train_combined, Article_Verb_Noun_Noun_Verb_test_combined])

# Datasets looped over by the forecasting cells.
# NOTE(review): the label refers to the text collocations while the arrays stored under
# it are the FRED-MD panel; the label ends up as a key in the saved pickles.
datasets1 = {r'Verb-Noun/Noun-Verb Colls($D_{3,t}$)': (FredMD_train, FredMD_test)}

In [76]:
def num_factors(data, kmax):
    """Choose the number of factors by the largest ratio of adjacent eigenvalues.

    The eigenvalues of the scaled second-moment matrix of `data` are sorted in
    descending order and the ratio d[k] / d[k+1] is computed for the leading
    candidates; the position of the largest ratio (1-based) is returned.

    Parameters
    ----------
    data : array-like of shape (T, N)
        Observations in rows, variables in columns. The data are used as given
        (no centring or scaling is applied here).
    kmax : int
        Upper bound on the number of candidates considered.

    Returns
    -------
    int
        Selected number of factors, at least 1. When only one candidate is
        available (a single column, a single row, or kmax == 1) the result is 1.

    Raises
    ------
    ValueError
        If `data` is not two-dimensional, is empty, or kmax < 1.
    """
    data = np.asarray(data, dtype=float)
    T, N = data.shape

    # The moment matrix has min(T, N) eigenvalues, so never look beyond that many
    K = min(kmax, N, T)
    if K < 1:
        raise ValueError("num_factors needs a non-empty (T, N) array and kmax >= 1")
    if K == 1:
        # No adjacent pair to compare: a single factor is the only possible answer
        return 1

    # Use the smaller of the two equivalent moment matrices
    xx = (data.T @ data) / (T*N) if N < T else (data @ data.T) / (T*N)

    # xx is symmetric, so eigvalsh returns real eigenvalues (ascending); flip to descending
    d = np.linalg.eigvalsh(xx)[::-1]

    # Ratios of adjacent eigenvalues; undefined ratios (division by zero) count as 0
    with np.errstate(divide='ignore', invalid='ignore'):
        ER = d[:K-1] / d[1:K]
    ER[~np.isfinite(ER)] = 0

    # argmax is 0-based and returns the first maximum, hence +1
    return int(np.argmax(ER)) + 1

In [77]:
# Initialize your objects
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
scaler = StandardScaler()
horizons = [1, 3, 6, 9]

# Experiment: FRED-MD predictors, feature selection refreshed only on Sahm-rule updates.
# Per forecast: ElasticNet selection -> PCA factors -> append lagged target ->
# keep regressors with OLS p-value < 0.05 -> linear regression -> store the forecast.

# Placeholder for storing all values for each horizon and each model
predictions_dict_pca = {}
y_true_dict_pca = {}

# Loop over datasets
for model_name, (train, test) in datasets1.items():
    predictions_dict_pca[model_name] = {h: [] for h in horizons}
    y_true_dict_pca[model_name] = {h: [] for h in horizons}

    # Concatenate train and test
    data = np.concatenate([train, test])

    # Cross-validated selector. It is never rebound below: the fallback lives in its own
    # variable, so one failed selection no longer switches every later refit (and every
    # later horizon) to the fixed-penalty ElasticNet.
    model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)

    # Loop over horizons
    for h in horizons:

        # Last observed value of the update flag, and the fitted estimator whose
        # coefficients currently define the selected features
        last_indicator = None
        selector = None
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # Loop over time points in the test set
        for i in range(len(train) + h - 1, len(data)):

            # Training rows end h periods before the target period i; the remaining
            # h rows (up to and including i) form the evaluation slice
            # NOTE(review): the forecast for period i is produced from data[i] and
            # Lag_Target[i], i.e. rows dated after the last training row when h > 1.
            # Confirm this matches the intended information set.
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            # Get the corresponding targets
            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            # Get the corresponding lag targets
            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Refit the selector on the first pass and whenever the update flag differs
            # from its previous value.
            # NOTE(review): updates[i] is itself a change flag, so this fires in the
            # month of a regime switch and again in the following month. Confirm.
            current_indicator = updates[i]
            if current_indicator != last_indicator:
                last_indicator = current_indicator

                # Train the cross-validated model
                model.fit(train_temp, np.ravel(y_train_temp))
                selector = model

                # If at most one feature was selected, fall back to a fixed, weaker penalty.
                # The fit is deterministic, so a single attempt is sufficient.
                if np.count_nonzero(selector.coef_) <= 1:
                    refit_attempts = 1
                    fallback_model = ElasticNet(l1_ratio = 0.05, alpha = 0.05)
                    fallback_model.fit(train_temp, np.ravel(y_train_temp))
                    selector = fallback_model

                    # If the fallback also fails, print a warning
                    if np.count_nonzero(selector.coef_) <= 1:
                        print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # If any features were selected, apply PCA
            if selector.coef_.any():
                # Get indices of non-zero coefficients
                selected_features = np.nonzero(selector.coef_)[0]

                # Select the features that were not discarded by the ElasticNet
                selected_train_temp = train_temp[:, selected_features]
                selected_test_temp = test_temp[:, selected_features]

                # Number of factors from the eigenvalue-ratio rule, then fit PCA on the training rows
                n_components = num_factors(selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                pca = PCA(n_components= n_components)
                best_pca = pca.fit(selected_train_temp)

                # Transform data using the best PCA
                selected_train_temp_pca = best_pca.transform(selected_train_temp)
                selected_test_temp_pca = best_pca.transform(selected_test_temp)

                # Add the lagged target as an additional column to the PCA-transformed data
                selected_train_temp_pca = np.column_stack((selected_train_temp_pca, y_lag_train_temp))
                selected_test_temp_pca = np.column_stack((selected_test_temp_pca, y_lag_test_temp))

                lr = LinearRegression()

                # OLS with a constant, used only to obtain p-values for the regressors
                mod = sm.OLS(np.ravel(y_train_temp), sm.add_constant(selected_train_temp_pca))
                fii = mod.fit()
                coef_table = fii.summary2().tables[1]
                p_values = coef_table['P>|t|']

                # Regressors with p-value < 0.05, ignoring the constant term
                significant_features = [name for name in p_values[p_values < 0.05].index if name != 'const']

                # Map row labels of the coefficient table to column positions
                column_to_index = {col: idx-1 for idx, col in enumerate(coef_table.index)}  # idx-1 corrects for the added constant

                # Convert column names to indices
                significant_indices = [column_to_index[col] for col in significant_features if column_to_index[col] != -1]  # We make sure not to include the constant

                # If there are significant features, retrain the model on these
                if significant_indices:
                    selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                    selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
                else:
                    print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

                # Fit the model on the selected (or all) PCA-transformed features
                lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

                # The last row of the evaluation slice is the target period
                y_pred_pca_temp = lr.predict(selected_test_temp_pca)
                y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1]) # Remember python indexing starts from 0

                # Add true values to a list
                y_true_per_pca_horizon.append(y_test_temp[h-1]) # Remember python indexing starts from 0
            else:
                # Make the gap visible: a skipped period shortens the stored forecast path
                print(f'Warning: no features selected at index {i} for horizon {h}; forecast skipped.')

        predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/Sahm_Rule_FredMD_predictions_dict.pkl', 'wb') as f:
    pickle.dump(predictions_dict_pca, f)

In [78]:
# Experiment: FRED-MD predictors plus text features, feature selection refreshed only
# on Sahm-rule updates. Each block gets its own ElasticNet selection and PCA; the
# factors are then combined with the lagged target in one regression.

# Placeholder for storing all values for each horizon and each model
predictions_dict_pca = {}
y_true_dict_pca = {}

# Loop over datasets
for model_name, (train, test) in datasets1.items():
    predictions_dict_pca[model_name] = {h: [] for h in horizons}
    y_true_dict_pca[model_name] = {h: [] for h in horizons}

    # Concatenate train and test
    data = np.concatenate([train, test])

    # Cross-validated selectors. They are never rebound below: each fallback lives in its
    # own variable, so one failed selection no longer switches every later refit (and
    # every later horizon) to the fixed-penalty ElasticNet.
    model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)
    text_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)

    # Loop over horizons
    for h in horizons:

        # Last observed value of the update flag, and the fitted estimators whose
        # coefficients currently define the selected features of each block
        last_indicator = None
        selector = None
        text_selector = None
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # Loop over time points in the test set
        for i in range(len(train) + h - 1, len(data)):

            # Training rows end h periods before the target period i; the remaining
            # h rows (up to and including i) form the evaluation slice
            # NOTE(review): the forecast for period i is produced from rows dated i
            # (data[i], Text[i], Lag_Target[i]), i.e. after the last training row when
            # h > 1. Confirm this matches the intended information set.
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            text_train_temp = Text[:i - h + 1]
            text_test_temp = Text[i - h + 1:i + 1]

            # Get the corresponding targets
            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            # Get the corresponding lag targets
            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Standardize the text features using training-window moments only
            scaler.fit(text_train_temp)
            text_train_temp_standardized = scaler.transform(text_train_temp)
            text_test_temp_standardized = scaler.transform(text_test_temp)

            # Refit the selectors on the first pass and whenever the update flag differs
            # from its previous value.
            # NOTE(review): updates[i] is itself a change flag, so this fires in the
            # month of a regime switch and again in the following month. Confirm.
            current_indicator = updates[i]
            if current_indicator != last_indicator:
                last_indicator = current_indicator

                # Train the cross-validated FRED-MD model
                model.fit(train_temp, np.ravel(y_train_temp))
                selector = model

                # Train the cross-validated text model on standardized text
                text_model.fit(text_train_temp_standardized, np.ravel(y_train_temp))
                text_selector = text_model

                # FRED-MD block: if at most one feature was selected, fall back to a fixed,
                # weaker penalty. The fit is deterministic, so a single attempt is sufficient.
                if np.count_nonzero(selector.coef_) <= 1:
                    refit_attempts = 1
                    fallback_model = ElasticNet(l1_ratio = 0.05, alpha = 0.05)
                    fallback_model.fit(train_temp, np.ravel(y_train_temp))
                    selector = fallback_model

                    # If the fallback also fails, print a warning
                    if np.count_nonzero(selector.coef_) <= 1:
                        print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

                # Text block: same fallback logic with its own fixed penalty
                if np.count_nonzero(text_selector.coef_) <= 1:
                    refit_attempts = 1
                    text_fallback_model = ElasticNet(l1_ratio = 0.1, alpha = 0.1)
                    text_fallback_model.fit(text_train_temp_standardized, np.ravel(y_train_temp))
                    text_selector = text_fallback_model

                    # If the fallback also fails, print a warning
                    if np.count_nonzero(text_selector.coef_) <= 1:
                        print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # If any features were selected, apply PCA
            # NOTE(review): with `or`, one block may reach PCA with no selected columns;
            # confirm whether both blocks are required here.
            if selector.coef_.any() or text_selector.coef_.any():
                # Get indices of non-zero coefficients
                selected_features = np.nonzero(selector.coef_)[0]
                text_selected_features = np.nonzero(text_selector.coef_)[0]

                # Select the features that were not discarded by the ElasticNet
                selected_train_temp = train_temp[:, selected_features]
                selected_test_temp = test_temp[:, selected_features]

                text_selected_train_temp = text_train_temp_standardized[:, text_selected_features]
                text_selected_test_temp = text_test_temp_standardized[:, text_selected_features]

                # FRED-MD factors: number from the eigenvalue-ratio rule, PCA fitted on training rows
                n_components = num_factors(selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                pca = PCA(n_components= n_components)
                best_pca = pca.fit(selected_train_temp)

                # Transform data using the best PCA
                selected_train_temp_pca = best_pca.transform(selected_train_temp)
                selected_test_temp_pca = best_pca.transform(selected_test_temp)

                # Text factors: same procedure on the selected text features
                n_components = num_factors(text_selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                text_pca = PCA(n_components= n_components)
                text_best_pca = text_pca.fit(text_selected_train_temp)

                # Transform data using the best PCA
                text_selected_train_temp_pca = text_best_pca.transform(text_selected_train_temp)
                text_selected_test_temp_pca = text_best_pca.transform(text_selected_test_temp)

                # Regressor matrix: FRED-MD factors, lagged target, text factors (in that order)
                selected_train_temp_pca = np.column_stack((selected_train_temp_pca, y_lag_train_temp, text_selected_train_temp_pca))
                selected_test_temp_pca = np.column_stack((selected_test_temp_pca, y_lag_test_temp, text_selected_test_temp_pca))

                lr = LinearRegression()

                # OLS with a constant, used only to obtain p-values for the regressors
                mod = sm.OLS(np.ravel(y_train_temp), sm.add_constant(selected_train_temp_pca))
                fii = mod.fit()
                coef_table = fii.summary2().tables[1]
                p_values = coef_table['P>|t|']

                # Regressors with p-value < 0.05, ignoring the constant term
                significant_features = [name for name in p_values[p_values < 0.05].index if name != 'const']

                # Map row labels of the coefficient table to column positions
                column_to_index = {col: idx-1 for idx, col in enumerate(coef_table.index)}  # idx-1 corrects for the added constant

                # Convert column names to indices
                significant_indices = [column_to_index[col] for col in significant_features if column_to_index[col] != -1]  # We make sure not to include the constant

                # If there are significant features, retrain the model on these
                if significant_indices:
                    selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                    selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
                else:
                    print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

                # Fit the model on the selected (or all) PCA-transformed features
                lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

                # The last row of the evaluation slice is the target period
                y_pred_pca_temp = lr.predict(selected_test_temp_pca)
                y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1]) # Remember python indexing starts from 0

                # Add true values to a list
                y_true_per_pca_horizon.append(y_test_temp[h-1]) # Remember python indexing starts from 0
            else:
                # Make the gap visible: a skipped period shortens the stored forecast path
                print(f'Warning: no features selected at index {i} for horizon {h}; forecast skipped.')

        predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/Sahm_Rule_FREDMD_plus_Text_predictions_dict.pkl', 'wb') as f:
    pickle.dump(predictions_dict_pca, f)

In [79]:
# Experiment: FRED-MD predictors, feature selection refreshed at every forecast date.
# Per forecast: ElasticNet selection -> PCA factors -> append lagged target ->
# keep regressors with OLS p-value < 0.05 -> linear regression -> store the forecast.

# Placeholder for storing all values for each horizon and each model
predictions_dict_pca = {}
y_true_dict_pca = {}

# Loop over datasets
for model_name, (train, test) in datasets1.items():
    predictions_dict_pca[model_name] = {h: [] for h in horizons}
    y_true_dict_pca[model_name] = {h: [] for h in horizons}

    # Concatenate train and test
    data = np.concatenate([train, test])

    # Cross-validated selector. It is never rebound below: the fallback lives in its own
    # variable, so one failed selection no longer switches every later forecast date
    # (and every later horizon) to the fixed-penalty ElasticNet.
    model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)

    # Loop over horizons
    for h in horizons:

        # Forecasts and realised values collected for this horizon
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # Loop over time points in the test set
        for i in range(len(train) + h - 1, len(data)):

            # Training rows end h periods before the target period i; the remaining
            # h rows (up to and including i) form the evaluation slice
            # NOTE(review): the forecast for period i is produced from data[i] and
            # Lag_Target[i], i.e. rows dated after the last training row when h > 1.
            # Confirm this matches the intended information set.
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            # Get the corresponding targets
            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            # Get the corresponding lag targets
            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Train the cross-validated model
            model.fit(train_temp, np.ravel(y_train_temp))
            selector = model

            # If at most one feature was selected, fall back to a fixed, weaker penalty.
            # The fit is deterministic, so a single attempt is sufficient.
            if np.count_nonzero(selector.coef_) <= 1:
                refit_attempts = 1
                fallback_model = ElasticNet(l1_ratio = 0.05, alpha = 0.05)
                fallback_model.fit(train_temp, np.ravel(y_train_temp))
                selector = fallback_model

                # If the fallback also fails, print a warning
                if np.count_nonzero(selector.coef_) <= 1:
                    print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # If any features were selected, apply PCA
            if selector.coef_.any():
                # Get indices of non-zero coefficients
                selected_features = np.nonzero(selector.coef_)[0]

                # Select the features that were not discarded by the ElasticNet
                selected_train_temp = train_temp[:, selected_features]
                selected_test_temp = test_temp[:, selected_features]

                # Number of factors from the eigenvalue-ratio rule, then fit PCA on the training rows
                n_components = num_factors(selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                pca = PCA(n_components= n_components)
                best_pca = pca.fit(selected_train_temp)

                # Transform data using the best PCA
                selected_train_temp_pca = best_pca.transform(selected_train_temp)
                selected_test_temp_pca = best_pca.transform(selected_test_temp)

                # Add the lagged target as an additional column to the PCA-transformed data
                selected_train_temp_pca = np.column_stack((selected_train_temp_pca, y_lag_train_temp))
                selected_test_temp_pca = np.column_stack((selected_test_temp_pca, y_lag_test_temp))

                lr = LinearRegression()

                # OLS with a constant, used only to obtain p-values for the regressors
                mod = sm.OLS(np.ravel(y_train_temp), sm.add_constant(selected_train_temp_pca))
                fii = mod.fit()
                coef_table = fii.summary2().tables[1]
                p_values = coef_table['P>|t|']

                # Regressors with p-value < 0.05, ignoring the constant term
                significant_features = [name for name in p_values[p_values < 0.05].index if name != 'const']

                # Map row labels of the coefficient table to column positions
                column_to_index = {col: idx-1 for idx, col in enumerate(coef_table.index)}  # idx-1 corrects for the added constant

                # Convert column names to indices
                significant_indices = [column_to_index[col] for col in significant_features if column_to_index[col] != -1]  # We make sure not to include the constant

                # If there are significant features, retrain the model on these
                if significant_indices:
                    selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                    selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
                else:
                    print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

                # Fit the model on the selected (or all) PCA-transformed features
                lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

                # The last row of the evaluation slice is the target period
                y_pred_pca_temp = lr.predict(selected_test_temp_pca)
                y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1]) # Remember python indexing starts from 0

                # Add true values to a list
                y_true_per_pca_horizon.append(y_test_temp[h-1]) # Remember python indexing starts from 0
            else:
                # Make the gap visible: a skipped period shortens the stored forecast path
                print(f'Warning: no features selected at index {i} for horizon {h}; forecast skipped.')

        predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/Continuous_FREDMD_predictions_dict.pkl', 'wb') as f:
    pickle.dump(predictions_dict_pca, f)

In [80]:
# Experiment: FRED-MD predictors plus text features, feature selection refreshed at
# every forecast date. Each block gets its own ElasticNet selection and PCA; the
# factors are then combined with the lagged target in one regression.

# Placeholder for storing all values for each horizon and each model
predictions_dict_pca = {}
y_true_dict_pca = {}

# Loop over datasets
for model_name, (train, test) in datasets1.items():
    predictions_dict_pca[model_name] = {h: [] for h in horizons}
    y_true_dict_pca[model_name] = {h: [] for h in horizons}

    # Concatenate train and test
    data = np.concatenate([train, test])

    # Cross-validated selectors. They are never rebound below: each fallback lives in its
    # own variable, so one failed selection no longer switches every later forecast date
    # (and every later horizon) to the fixed-penalty ElasticNet.
    model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)
    text_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=1000000, tol=0.0001)

    # Loop over horizons
    for h in horizons:

        # Forecasts and realised values collected for this horizon
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # Loop over time points in the test set
        for i in range(len(train) + h - 1, len(data)):

            # Training rows end h periods before the target period i; the remaining
            # h rows (up to and including i) form the evaluation slice
            # NOTE(review): the forecast for period i is produced from rows dated i
            # (data[i], Text[i], Lag_Target[i]), i.e. after the last training row when
            # h > 1. Confirm this matches the intended information set.
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            text_train_temp = Text[:i - h + 1]
            text_test_temp = Text[i - h + 1:i + 1]

            # Get the corresponding targets
            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            # Get the corresponding lag targets
            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Standardize the text features using training-window moments only
            scaler.fit(text_train_temp)
            text_train_temp_standardized = scaler.transform(text_train_temp)
            text_test_temp_standardized = scaler.transform(text_test_temp)

            # Train the cross-validated FRED-MD model
            model.fit(train_temp, np.ravel(y_train_temp))
            selector = model

            # Train the cross-validated text model on standardized text
            text_model.fit(text_train_temp_standardized, np.ravel(y_train_temp))
            text_selector = text_model

            # FRED-MD block: if at most one feature was selected, fall back to a fixed,
            # weaker penalty. The fit is deterministic, so a single attempt is sufficient.
            if np.count_nonzero(selector.coef_) <= 1:
                refit_attempts = 1
                fallback_model = ElasticNet(l1_ratio = 0.05, alpha = 0.05)
                fallback_model.fit(train_temp, np.ravel(y_train_temp))
                selector = fallback_model

                # If the fallback also fails, print a warning
                if np.count_nonzero(selector.coef_) <= 1:
                    print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # Text block: same fallback logic with its own fixed penalty
            if np.count_nonzero(text_selector.coef_) <= 1:
                refit_attempts = 1
                text_fallback_model = ElasticNet(l1_ratio = 0.1, alpha = 0.1)
                text_fallback_model.fit(text_train_temp_standardized, np.ravel(y_train_temp))
                text_selector = text_fallback_model

                # If the fallback also fails, print a warning
                if np.count_nonzero(text_selector.coef_) <= 1:
                    print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # If any features were selected, apply PCA
            # NOTE(review): with `or`, one block may reach PCA with no selected columns;
            # confirm whether both blocks are required here.
            if selector.coef_.any() or text_selector.coef_.any():
                # Get indices of non-zero coefficients
                selected_features = np.nonzero(selector.coef_)[0]
                text_selected_features = np.nonzero(text_selector.coef_)[0]

                # Select the features that were not discarded by the ElasticNet
                selected_train_temp = train_temp[:, selected_features]
                selected_test_temp = test_temp[:, selected_features]

                text_selected_train_temp = text_train_temp_standardized[:, text_selected_features]
                text_selected_test_temp = text_test_temp_standardized[:, text_selected_features]

                # FRED-MD factors: number from the eigenvalue-ratio rule, PCA fitted on training rows
                n_components = num_factors(selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                pca = PCA(n_components= n_components)
                best_pca = pca.fit(selected_train_temp)

                # Transform data using the best PCA
                selected_train_temp_pca = best_pca.transform(selected_train_temp)
                selected_test_temp_pca = best_pca.transform(selected_test_temp)

                # Text factors: same procedure on the selected text features
                n_components = num_factors(text_selected_train_temp, kmax=8)  # Choose a suitable value for kmax
                text_pca = PCA(n_components= n_components)
                text_best_pca = text_pca.fit(text_selected_train_temp)

                # Transform data using the best PCA
                text_selected_train_temp_pca = text_best_pca.transform(text_selected_train_temp)
                text_selected_test_temp_pca = text_best_pca.transform(text_selected_test_temp)

                # Regressor matrix: FRED-MD factors, lagged target, text factors (in that order)
                selected_train_temp_pca = np.column_stack((selected_train_temp_pca, y_lag_train_temp, text_selected_train_temp_pca))
                selected_test_temp_pca = np.column_stack((selected_test_temp_pca, y_lag_test_temp, text_selected_test_temp_pca))

                lr = LinearRegression()

                # OLS with a constant, used only to obtain p-values for the regressors
                mod = sm.OLS(np.ravel(y_train_temp), sm.add_constant(selected_train_temp_pca))
                fii = mod.fit()
                coef_table = fii.summary2().tables[1]
                p_values = coef_table['P>|t|']

                # Regressors with p-value < 0.05, ignoring the constant term
                significant_features = [name for name in p_values[p_values < 0.05].index if name != 'const']

                # Map row labels of the coefficient table to column positions
                column_to_index = {col: idx-1 for idx, col in enumerate(coef_table.index)}  # idx-1 corrects for the added constant

                # Convert column names to indices
                significant_indices = [column_to_index[col] for col in significant_features if column_to_index[col] != -1]  # We make sure not to include the constant

                # If there are significant features, retrain the model on these
                if significant_indices:
                    selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                    selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
                else:
                    print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

                # Fit the model on the selected (or all) PCA-transformed features
                lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

                # The last row of the evaluation slice is the target period
                y_pred_pca_temp = lr.predict(selected_test_temp_pca)
                y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1]) # Remember python indexing starts from 0

                # Add true values to a list
                y_true_per_pca_horizon.append(y_test_temp[h-1]) # Remember python indexing starts from 0
            else:
                # Make the gap visible: a skipped period shortens the stored forecast path
                print(f'Warning: no features selected at index {i} for horizon {h}; forecast skipped.')

        predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/continuous_FREDMD_plus_Text_predictions_dict.pkl', 'wb') as f:
    pickle.dump(predictions_dict_pca, f)