In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from fredapi import Fred
import statsmodels.api as sm
from tabulate import tabulate
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
# FRED client. The key is taken from the FRED_API_KEY environment variable when it is set,
# so a real credential never has to be written into the notebook. The placeholder is only
# a fallback and must be replaced (or the variable exported) before the FRED calls below run.
fred = Fred(api_key=os.environ.get('FRED_API_KEY', 'YOUR_API_KEY_HERE'))
from sklearn.linear_model import ElasticNetCV, ElasticNet

In [2]:
# Project root holding the Data/ inputs and receiving the Forecasts/ outputs.
# Set IJF_PROJECT_DIR to run the notebook on another machine; the original location
# remains the default so existing runs are unaffected.
project_dir = os.environ.get(
    "IJF_PROJECT_DIR",
    "C:/Users/gabeyie/OneDrive - University of Tennessee/Documents/IJF_Paper",
)
os.chdir(project_dir)
os.makedirs('Forecasts', exist_ok=True)
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\gabeyie\OneDrive - University of Tennessee\Documents\IJF_Paper


In [3]:
# WTI spot price (FRED series DCOILWTICO), averaged within each month-end bin;
# empty months are filled with the previous monthly value.
oil_price = fred.get_series('DCOILWTICO', observation_start='1986-01-01', observation_end='2020-12-01')
monthly_oil = oil_price.resample('ME').mean().ffill()

# Forecast target: first difference of the log monthly price (the leading NaN is dropped)
monthly_oil_log = np.log(monthly_oil)
monthly_oil_log_diff = monthly_oil_log.diff().dropna()
Target = monthly_oil_log_diff.to_numpy().ravel()

# One-period lag of the target, aligned with Target.
# The first entry has no predecessor and is back-filled with the following value.
Lag_Target = monthly_oil_log_diff.shift(1).bfill().to_numpy().ravel()

In [4]:
# Fraction of the sample assigned to the initial training window
train_ratio = 0.625
n_samples = len(monthly_oil_log_diff)
n_train = int(train_ratio * n_samples)

# Chronological split: first n_train observations for training, the rest for testing
train_data, test_data = monthly_oil_log_diff.iloc[:n_train], monthly_oil_log_diff.iloc[n_train:]

# Target values as flat arrays
y_train, y_test = train_data.to_numpy().ravel(), test_data.to_numpy().ravel()

# Date indices of the two windows
train_dates, test_dates = train_data.index, test_data.index

# FRED series USREC over the same period, taking the last value in each month-end bin
recessions = (
    fred.get_series('USREC', observation_start='1986-01-01', observation_end='2020-12-01')
    .resample('ME')
    .last()
    .ffill()
)

In [5]:
# 10th and 90th percentiles of the monthly log price change, computed over the whole series
lower_bound, upper_bound = (monthly_oil_log_diff.quantile(q) for q in (0.10, 0.90))

# A month counts as a significant move when its change is at or beyond either percentile
is_extreme = (monthly_oil_log_diff <= lower_bound) | (monthly_oil_log_diff >= upper_bound)

# Regime indicator: True by default, False in months with a significant move
recession_expansion = pd.DataFrame(True, index=monthly_oil_log_diff.index, columns=['indicator'])
recession_expansion.loc[is_extreme, 'indicator'] = False

# Update flag: True wherever the regime indicator differs from the previous month
regime = recession_expansion['indicator']
update = regime.ne(regime.shift()).to_frame('indicator')
update.iloc[0, 0] = True  # the first observation is always marked for an update

# Flat array version of the update flags for use inside the forecasting loops
updates = update.to_numpy().ravel()

In [6]:
# Each pair below is read in (train, test) order from the Data/ folder.

# Full TF-IDF datasets
Article_train, Article_test = map(
    pd.read_csv, ("Data/Article_train.csv", "Data/Article_test.csv"))
Headline_train, Headline_test = map(
    pd.read_csv, ("Data/Headline_train.csv", "Data/Headline_test.csv"))

# Combined Noun-Noun / Adjective-Noun datasets
Article_Noun_Noun_Adjective_Noun_train_combined, Article_Noun_Noun_Adjective_Noun_test_combined = map(
    pd.read_csv,
    ("Data/Article_Noun_Noun_Adjective_Noun_train_combined.csv",
     "Data/Article_Noun_Noun_Adjective_Noun_test_combined.csv"))
Headline_Noun_Noun_Adjective_Noun_train_combined, Headline_Noun_Noun_Adjective_Noun_test_combined = map(
    pd.read_csv,
    ("Data/Headline_Noun_Noun_Adjective_Noun_train_combined.csv",
     "Data/Headline_Noun_Noun_Adjective_Noun_test_combined.csv"))

# Combined Noun-Adjective / Verb-Adjective datasets
Article_Noun_Adjective_Verb_Adjective_train_combined, Article_Noun_Adjective_Verb_Adjective_test_combined = map(
    pd.read_csv,
    ("Data/Article_Noun_Adjective_Verb_Adjective_train_combined.csv",
     "Data/Article_Noun_Adjective_Verb_Adjective_test_combined.csv"))
Headline_Noun_Adjective_Verb_Adjective_train_combined, Headline_Noun_Adjective_Verb_Adjective_test_combined = map(
    pd.read_csv,
    ("Data/Headline_Noun_Adjective_Verb_Adjective_train_combined.csv",
     "Data/Headline_Noun_Adjective_Verb_Adjective_test_combined.csv"))

# Combined Verb-Noun / Noun-Verb datasets
Article_Verb_Noun_Noun_Verb_train_combined, Article_Verb_Noun_Noun_Verb_test_combined = map(
    pd.read_csv,
    ("Data/Article_Verb_Noun_Noun_Verb_train_combined.csv",
     "Data/Article_Verb_Noun_Noun_Verb_test_combined.csv"))
Headline_Verb_Noun_Noun_Verb_train_combined, Headline_Verb_Noun_Noun_Verb_test_combined = map(
    pd.read_csv,
    ("Data/Headline_Verb_Noun_Noun_Verb_train_combined.csv",
     "Data/Headline_Verb_Noun_Noun_Verb_test_combined.csv"))

In [7]:
# Drop the first row of every training frame
(
    Article_train,
    Article_Noun_Noun_Adjective_Noun_train_combined,
    Article_Verb_Noun_Noun_Verb_train_combined,
    Article_Noun_Adjective_Verb_Adjective_train_combined,
    Headline_train,
    Headline_Noun_Noun_Adjective_Noun_train_combined,
    Headline_Verb_Noun_Noun_Verb_train_combined,
    Headline_Noun_Adjective_Verb_Adjective_train_combined,
) = (
    frame.iloc[1:]
    for frame in (
        Article_train,
        Article_Noun_Noun_Adjective_Noun_train_combined,
        Article_Verb_Noun_Noun_Verb_train_combined,
        Article_Noun_Adjective_Verb_Adjective_train_combined,
        Headline_train,
        Headline_Noun_Noun_Adjective_Noun_train_combined,
        Headline_Verb_Noun_Noun_Verb_train_combined,
        Headline_Noun_Adjective_Verb_Adjective_train_combined,
    )
)

# Display labels of the four feature sets, in the order used by every dictionary below
collocation_labels = (
    r'TF-IDF Colls($D_{1,t}$)',
    r'Noun-Noun/Adj-Noun Colls($D_{2,t}$)',
    r'Verb-Noun/Noun-Verb Colls($D_{3,t}$)',
    r'Noun-Adj/Verb-Adj Colls($D_{4,t}$)',
)

# Column names per feature set, captured while the training sets are still DataFrames
Article_column_names_dict = {
    label: frame.columns.tolist()
    for label, frame in zip(collocation_labels, (
        Article_train,
        Article_Noun_Noun_Adjective_Noun_train_combined,
        Article_Verb_Noun_Noun_Verb_train_combined,
        Article_Noun_Adjective_Verb_Adjective_train_combined,
    ))
}

Headline_column_names_dict = {
    label: frame.columns.tolist()
    for label, frame in zip(collocation_labels, (
        Headline_train,
        Headline_Noun_Noun_Adjective_Noun_train_combined,
        Headline_Verb_Noun_Noun_Verb_train_combined,
        Headline_Noun_Adjective_Verb_Adjective_train_combined,
    ))
}

# Training sets as NumPy arrays
(
    Article_train,
    Article_Noun_Noun_Adjective_Noun_train_combined,
    Article_Noun_Adjective_Verb_Adjective_train_combined,
    Article_Verb_Noun_Noun_Verb_train_combined,
    Headline_train,
    Headline_Noun_Noun_Adjective_Noun_train_combined,
    Headline_Noun_Adjective_Verb_Adjective_train_combined,
    Headline_Verb_Noun_Noun_Verb_train_combined,
) = (
    np.array(frame)
    for frame in (
        Article_train,
        Article_Noun_Noun_Adjective_Noun_train_combined,
        Article_Noun_Adjective_Verb_Adjective_train_combined,
        Article_Verb_Noun_Noun_Verb_train_combined,
        Headline_train,
        Headline_Noun_Noun_Adjective_Noun_train_combined,
        Headline_Noun_Adjective_Verb_Adjective_train_combined,
        Headline_Verb_Noun_Noun_Verb_train_combined,
    )
)

# Testing sets as NumPy arrays
(
    Article_test,
    Article_Noun_Noun_Adjective_Noun_test_combined,
    Article_Noun_Adjective_Verb_Adjective_test_combined,
    Article_Verb_Noun_Noun_Verb_test_combined,
    Headline_test,
    Headline_Noun_Noun_Adjective_Noun_test_combined,
    Headline_Noun_Adjective_Verb_Adjective_test_combined,
    Headline_Verb_Noun_Noun_Verb_test_combined,
) = (
    np.array(frame)
    for frame in (
        Article_test,
        Article_Noun_Noun_Adjective_Noun_test_combined,
        Article_Noun_Adjective_Verb_Adjective_test_combined,
        Article_Verb_Noun_Noun_Verb_test_combined,
        Headline_test,
        Headline_Noun_Noun_Adjective_Noun_test_combined,
        Headline_Noun_Adjective_Verb_Adjective_test_combined,
        Headline_Verb_Noun_Noun_Verb_test_combined,
    )
)

# (train, test) array pairs per feature set
Article_datasets = dict(zip(collocation_labels, (
    (Article_train, Article_test),
    (Article_Noun_Noun_Adjective_Noun_train_combined, Article_Noun_Noun_Adjective_Noun_test_combined),
    (Article_Verb_Noun_Noun_Verb_train_combined, Article_Verb_Noun_Noun_Verb_test_combined),
    (Article_Noun_Adjective_Verb_Adjective_train_combined, Article_Noun_Adjective_Verb_Adjective_test_combined),
)))

Headline_datasets = dict(zip(collocation_labels, (
    (Headline_train, Headline_test),
    (Headline_Noun_Noun_Adjective_Noun_train_combined, Headline_Noun_Noun_Adjective_Noun_test_combined),
    (Headline_Verb_Noun_Noun_Verb_train_combined, Headline_Verb_Noun_Noun_Verb_test_combined),
    (Headline_Noun_Adjective_Verb_Adjective_train_combined, Headline_Noun_Adjective_Verb_Adjective_test_combined),
)))

In [8]:
def num_factors(data, kmax):
    """Pick the number of factors by the largest ratio of adjacent eigenvalues.

    The eigenvalues of the scaled second-moment matrix of ``data`` are sorted
    in descending order and the ratios ``d[k] / d[k+1]`` are formed for the
    leading eigenvalues. The (1-based) position of the largest ratio is returned.

    Parameters
    ----------
    data : array-like of shape (T, N)
        Observations in rows, variables in columns.
    kmax : int
        Upper bound on the number of eigenvalues considered.

    Returns
    -------
    int
        Selected number of factors, at least 1. When fewer than two
        eigenvalues are available (for example a single column), no ratio can
        be formed and 1 is returned.
    """
    data = np.asarray(data, dtype=float)
    T, N = data.shape

    # Use whichever moment matrix is smaller (N x N or T x T).
    xx = (data.T @ data) / (T * N) if N < T else (data @ data.T) / (T * N)

    # xx is symmetric, so eigvalsh applies. Unlike eigvals it always returns real
    # values, which keeps the ordering well defined. It sorts ascending, hence the flip.
    eigenvalues = np.linalg.eigvalsh(xx)[::-1]

    # Never look past the eigenvalues that actually exist (min(N, T) of them).
    K = min(kmax, N, eigenvalues.size)
    if K < 2:
        return 1

    # Ratios of consecutive eigenvalues; undefined ratios (x/0, 0/0) count as 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = eigenvalues[:K - 1] / eigenvalues[1:K]
    ratios = np.where(np.isfinite(ratios), ratios, 0.0)

    # argmax returns the first maximum; +1 converts the 0-based position to a count.
    return int(np.argmax(ratios)) + 1

In [9]:
# Shared settings for the rolling forecasting exercise
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
scaler = StandardScaler()
horizons = [1, 3, 6, 9]

# Result containers, keyed by dataset name and then by horizon
Article_predictions_dict_pca = {}
Article_y_true_dict_pca = {}
Article_elasticnet_feature_counts = {}
Article_elasticnet_selected_indices = {model_name: {h: [] for h in horizons} for model_name in Article_datasets}
Article_pca_components_counts = {model_name: {h: [] for h in horizons} for model_name in Article_datasets}

for model_name, (train, test) in Article_datasets.items():
    Article_predictions_dict_pca[model_name] = {h: [] for h in horizons}
    Article_y_true_dict_pca[model_name] = {h: [] for h in horizons}
    Article_elasticnet_feature_counts[model_name] = {h: [] for h in horizons}

    # Stack train and test so a single index i runs over the whole sample.
    # NOTE(review): rows of `data` are assumed to line up one-to-one with Target,
    # Lag_Target and updates - confirm the lengths match.
    data = np.concatenate([train, test])

    # Feature selectors. They are kept as two separate objects so that using the
    # fixed-penalty fallback once does not replace the cross-validated selector
    # for all later refits.
    cv_selector = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=100000, tol=0.0001)
    fallback_selector = ElasticNet(l1_ratio=0.1, alpha=0.1)

    for h in horizons:
        # Selector currently in use; None forces a fit at the first forecast of each horizon
        model = None
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # i is the index of the observation being forecast
        for i in range(len(train) + h - 1, len(data)):
            # Estimation sample ends h rows before i; the evaluation block is the h rows up to i
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Re-select features only on update dates. `updates` already holds the
            # change flags (True at the first observation and wherever the regime
            # indicator changes), so the flag is used directly. Comparing it with its
            # previous value would difference the indicator twice and refit on the
            # wrong dates.
            if model is None or updates[i]:
                # The full-width standardisation is only needed for feature selection
                train_temp_standardized = scaler.fit_transform(train_temp)
                model = cv_selector.fit(train_temp_standardized, np.ravel(y_train_temp))

                # With at most one selected feature, fall back to a fixed, milder penalty.
                # The fit is deterministic, so a single attempt is enough.
                if np.count_nonzero(model.coef_) <= 1:
                    refit_attempts = 1
                    model = fallback_selector.fit(train_temp_standardized, np.ravel(y_train_temp))
                    if np.count_nonzero(model.coef_) <= 1:
                        print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # Indices of the features retained by the selector currently in use
            selected_features = np.flatnonzero(model.coef_)
            if selected_features.size == 0:
                # No forecast can be formed; the result lists get no entry for this i
                print(f'Warning: no features selected for {model_name}, h={h}, i={i}; forecast skipped.')
                continue

            Article_elasticnet_feature_counts[model_name][h].append(len(selected_features))
            Article_elasticnet_selected_indices[model_name][h].append(selected_features)

            selected_train_temp = train_temp[:, selected_features]
            selected_test_temp = test_temp[:, selected_features]

            # Standardise the retained features with statistics from the estimation sample only
            scaler_pca = StandardScaler()
            selected_train_temp_standardized = scaler_pca.fit_transform(selected_train_temp)
            selected_test_temp_standardized = scaler_pca.transform(selected_test_temp)

            # Number of principal components; a single feature admits exactly one
            if selected_train_temp_standardized.shape[1] > 1:
                n_components = num_factors(selected_train_temp_standardized, kmax=8)
            else:
                n_components = 1
            best_pca = PCA(n_components=n_components).fit(selected_train_temp_standardized)
            Article_pca_components_counts[model_name][h].append(n_components)

            # Principal components plus the lagged target as the last regressor
            selected_train_temp_pca = np.column_stack(
                (best_pca.transform(selected_train_temp_standardized), y_lag_train_temp))
            selected_test_temp_pca = np.column_stack(
                (best_pca.transform(selected_test_temp_standardized), y_lag_test_temp))

            # OLS with an intercept, used only to get p-values. has_constant='add' makes
            # sure the intercept is always column 0, so pvalues[1:] lines up with the
            # regressor columns.
            fii = sm.OLS(np.ravel(y_train_temp),
                         sm.add_constant(selected_train_temp_pca, has_constant='add')).fit()
            significant_indices = np.flatnonzero(np.asarray(fii.pvalues)[1:] < 0.05)

            # Keep only regressors significant at the 5% level, if there are any
            if significant_indices.size:
                selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
            else:
                print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

            # Final regression on the retained regressors
            lr = LinearRegression()
            lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

            # The last row of the evaluation block (position h-1) is observation i
            y_pred_pca_temp = lr.predict(selected_test_temp_pca)
            y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1])
            y_true_per_pca_horizon.append(y_test_temp[h-1])

        Article_predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        Article_y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/Alt_Updating_Scheme_Article_predictions_dict.pkl', 'wb') as f:
    pickle.dump(Article_predictions_dict_pca, f)

In [10]:
# Result containers, keyed by dataset name and then by horizon
Headline_predictions_dict_pca = {}
Headline_y_true_dict_pca = {}
Headline_elasticnet_feature_counts = {}
Headline_elasticnet_selected_indices = {model_name: {h: [] for h in horizons} for model_name in Headline_datasets}
Headline_pca_components_counts = {model_name: {h: [] for h in horizons} for model_name in Headline_datasets}

for model_name, (train, test) in Headline_datasets.items():
    Headline_predictions_dict_pca[model_name] = {h: [] for h in horizons}
    Headline_y_true_dict_pca[model_name] = {h: [] for h in horizons}
    Headline_elasticnet_feature_counts[model_name] = {h: [] for h in horizons}

    # Stack train and test so a single index i runs over the whole sample.
    # NOTE(review): rows of `data` are assumed to line up one-to-one with Target,
    # Lag_Target and updates - confirm the lengths match.
    data = np.concatenate([train, test])

    # Feature selectors. They are kept as two separate objects so that using the
    # fixed-penalty fallback once does not replace the cross-validated selector
    # for all later refits.
    cv_selector = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=tscv, max_iter=100000, tol=0.0001)
    fallback_selector = ElasticNet(l1_ratio=0.1, alpha=0.1)

    for h in horizons:
        # Selector currently in use; None forces a fit at the first forecast of each horizon
        model = None
        y_true_per_pca_horizon = []
        y_pred_per_pca_horizon = []

        # i is the index of the observation being forecast
        for i in range(len(train) + h - 1, len(data)):
            # Estimation sample ends h rows before i; the evaluation block is the h rows up to i
            train_temp = data[:i - h + 1]
            test_temp = data[i - h + 1:i + 1]

            y_train_temp = Target[:i - h + 1]
            y_test_temp = Target[i - h + 1:i + 1]

            y_lag_train_temp = Lag_Target[:i - h + 1]
            y_lag_test_temp = Lag_Target[i - h + 1:i + 1]

            # Re-select features only on update dates. `updates` already holds the
            # change flags (True at the first observation and wherever the regime
            # indicator changes), so the flag is used directly. Comparing it with its
            # previous value would difference the indicator twice and refit on the
            # wrong dates.
            if model is None or updates[i]:
                # The full-width standardisation is only needed for feature selection
                train_temp_standardized = scaler.fit_transform(train_temp)
                model = cv_selector.fit(train_temp_standardized, np.ravel(y_train_temp))

                # With at most one selected feature, fall back to a fixed, milder penalty.
                # The fit is deterministic, so a single attempt is enough.
                if np.count_nonzero(model.coef_) <= 1:
                    refit_attempts = 1
                    model = fallback_selector.fit(train_temp_standardized, np.ravel(y_train_temp))
                    if np.count_nonzero(model.coef_) <= 1:
                        print(f'Warning: Model failed to select more than one feature after {refit_attempts} attempts.')

            # Indices of the features retained by the selector currently in use
            selected_features = np.flatnonzero(model.coef_)
            if selected_features.size == 0:
                # No forecast can be formed; the result lists get no entry for this i
                print(f'Warning: no features selected for {model_name}, h={h}, i={i}; forecast skipped.')
                continue

            Headline_elasticnet_feature_counts[model_name][h].append(len(selected_features))
            Headline_elasticnet_selected_indices[model_name][h].append(selected_features)

            selected_train_temp = train_temp[:, selected_features]
            selected_test_temp = test_temp[:, selected_features]

            # Standardise the retained features with statistics from the estimation sample only
            scaler_pca = StandardScaler()
            selected_train_temp_standardized = scaler_pca.fit_transform(selected_train_temp)
            selected_test_temp_standardized = scaler_pca.transform(selected_test_temp)

            # Number of principal components; a single feature admits exactly one
            if selected_train_temp_standardized.shape[1] > 1:
                n_components = num_factors(selected_train_temp_standardized, kmax=8)
            else:
                n_components = 1
            best_pca = PCA(n_components=n_components).fit(selected_train_temp_standardized)
            Headline_pca_components_counts[model_name][h].append(n_components)

            # Principal components plus the lagged target as the last regressor
            selected_train_temp_pca = np.column_stack(
                (best_pca.transform(selected_train_temp_standardized), y_lag_train_temp))
            selected_test_temp_pca = np.column_stack(
                (best_pca.transform(selected_test_temp_standardized), y_lag_test_temp))

            # OLS with an intercept, used only to get p-values. has_constant='add' makes
            # sure the intercept is always column 0, so pvalues[1:] lines up with the
            # regressor columns.
            fii = sm.OLS(np.ravel(y_train_temp),
                         sm.add_constant(selected_train_temp_pca, has_constant='add')).fit()
            significant_indices = np.flatnonzero(np.asarray(fii.pvalues)[1:] < 0.05)

            # Keep only regressors significant at the 5% level, if there are any
            if significant_indices.size:
                selected_train_temp_pca = selected_train_temp_pca[:, significant_indices]
                selected_test_temp_pca = selected_test_temp_pca[:, significant_indices]
            else:
                print("No features with p-value < 0.05 was found. Retaining all PCA-transformed features.")

            # Final regression on the retained regressors
            lr = LinearRegression()
            lr.fit(selected_train_temp_pca, np.ravel(y_train_temp))

            # The last row of the evaluation block (position h-1) is observation i
            y_pred_pca_temp = lr.predict(selected_test_temp_pca)
            y_pred_per_pca_horizon.append(y_pred_pca_temp[h-1])
            y_true_per_pca_horizon.append(y_test_temp[h-1])

        Headline_predictions_dict_pca[model_name][h] = y_pred_per_pca_horizon
        Headline_y_true_dict_pca[model_name][h] = y_true_per_pca_horizon

# Save dictionaries to files for future use
with open('Forecasts/Alt_Updating_Scheme_Headline_predictions_dict.pkl', 'wb') as f:
    pickle.dump(Headline_predictions_dict_pca, f)

In [11]:
def _selection_frequencies(dataset_names, selected_indices):
    """Count how often each feature index was selected, per dataset, across all horizons."""
    frequencies = {name: defaultdict(int) for name in dataset_names}
    for name, by_horizon in selected_indices.items():
        for index_arrays in by_horizon.values():
            for index_array in index_arrays:
                for idx in index_array:
                    frequencies[name][idx] += 1
    return frequencies


def _most_selected(frequencies, column_names, top_n):
    """Return, per dataset, the top_n (feature name, count) pairs by descending count."""
    ranking = {}
    for name, counts in frequencies.items():
        names_for_dataset = column_names[name]
        leaders = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:top_n]
        ranking[name] = [(names_for_dataset[idx], freq) for idx, freq in leaders]
    return ranking


# Selection counts of every feature index, per dataset
Article_feature_selection_frequencies = _selection_frequencies(
    Article_datasets.keys(), Article_elasticnet_selected_indices)
Headline_feature_selection_frequencies = _selection_frequencies(
    Headline_datasets.keys(), Headline_elasticnet_selected_indices)

N = 20  # Number of top features to identify

# Most frequently selected features, with indices mapped back to column names
Article_top_features_per_model = _most_selected(
    Article_feature_selection_frequencies, Article_column_names_dict, N)
Headline_top_features_per_model = _most_selected(
    Headline_feature_selection_frequencies, Headline_column_names_dict, N)

In [12]:
# Selection diagnostics gathered during the forecasting loops, stored under prefixed keys
alt_scheme_diagnostics = {
    "Alt_Updating_Scheme_Article_elasticnet_feature_counts": Article_elasticnet_feature_counts,
    "Alt_Updating_Scheme_Headline_elasticnet_feature_counts": Headline_elasticnet_feature_counts,
    "Alt_Updating_Scheme_Article_pca_components_counts": Article_pca_components_counts,
    "Alt_Updating_Scheme_Headline_pca_components_counts": Headline_pca_components_counts,
    "Alt_Updating_Scheme_Article_column_names_dict": Article_column_names_dict,
    "Alt_Updating_Scheme_Headline_column_names_dict": Headline_column_names_dict,
    "Alt_Updating_Scheme_Article_top_features_per_model": Article_top_features_per_model,
    "Alt_Updating_Scheme_Headline_top_features_per_model": Headline_top_features_per_model,
}

# Write the diagnostics to disk as a single pickle
with open("Forecasts/Alt_Updating_Scheme_Variables.pkl", "wb") as f:
    pickle.dump(alt_scheme_diagnostics, f)

In [13]:
# Test dates, horizons, update flags and the recession series, bundled into one pickle
alt_scheme_context = {
    "test_dates": test_dates,
    "horizons": horizons,
    "update_indicator": update['indicator'],
    "recessions": recessions,
}

with open("Forecasts/Alt_Updating_Scheme_other_variables.pkl", "wb") as f:
    pickle.dump(alt_scheme_context, f)