In [1]:
import copy
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

from tqdm.notebook import tqdm
from sklearn.model_selection import KFold 


### Load the preprocessed Natural Stories corpus (includes surprisal and incremental information value estimates)

In [2]:
# Read the preprocessed, normalised Natural Stories table and replace hyphens
# in its column names with underscores (presumably so the names can be used
# in formula strings, where "-" is an operator).
ns_norm = pd.read_csv(
    "preprocessed_corpora/naturalstories_preprocessed_normalised.csv"
).rename(columns=lambda name: name.replace("-", "_"))


In [3]:
# For interaction terms

def all_pairs(list):
    """
    Build every unordered pair of elements taken from two distinct positions.

    Each pair is a two-element list ``[list[i], list[j]]`` with ``i < j``;
    pairs are ordered by ``i`` first and ``j`` second. A sequence with fewer
    than two elements yields an empty result.

    Note: the parameter shadows the built-in ``list``; the name is kept so
    existing callers are unaffected.
    """
    size = len(list)
    return [
        [list[first], list[second]]
        for first in range(size)
        for second in range(first + 1, size)
    ]

In [4]:
# Constants

# Distance metric(s) whose IAS columns are analysed. Other candidate values
# noted by the author: "euclidean", "cosine", "euclidean_std".
DISTANCE_METRICS = ["cosine_std"]
MODEL_NAMES = ["gpt2_small", "gpt2_medium", "gpt2_large", "gpt2_xl"]
HORIZONS = range(1, 11)
# Layer indices analysed for each model: 13 evenly spaced indices per model.
LAYERS = {
    "gpt2_small": list(range(13)),
    "gpt2_medium": list(range(0, 25, 2)),
    "gpt2_large": list(range(0, 37, 3)),
    "gpt2_xl": list(range(0, 49, 4)),
}

# Response variables to be modelled.
PREDICTED_VARIABLES = ["meanItemRT", "sdItemRT"]

# Control predictors, followed by the information-based predictors that are
# picked out of the loaded table by substring of the column name.
BASELINE_PREDICTORS = ["Subtlex_log10", "zone", "length"]
SURPRISAL_PREDICTORS = [name for name in ns_norm.columns if "_surprisal" in name]
IAS_PREDICTORS = [name for name in ns_norm.columns if "_ias_" in name]
ALL_INFORMATION_PREDICTORS = [*SURPRISAL_PREDICTORS, *IAS_PREDICTORS]


### Individual IAS predictors (horizon-layer combinations) over surprisal baseline

In [5]:
# Individual IAS predictors (one per model / layer / horizon) added on top of
# a baseline made of the control predictors plus the same model's surprisal.
#
# Every comparison is run on all rows (fold == "full") and then on the
# training portion of each split of a seeded 10-fold KFold (fold == 0..9).
# The fold fits are K-fold subsamples (each leaves out one tenth of the rows),
# not bootstrap resamples.
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# control main effects plus all of their pairwise interactions
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    # complete cases only, so baseline and IAS models are fit on identical rows
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Training-row positions for each fold. The split is seeded and depends
    # only on the number of rows, so it is computed once and shared by all models.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    for model in MODEL_NAMES:
        baseline_formula = f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal'

        for fold in ["full", *range(len(fold_train_indices))]:
            df_fit = df_tmp if fold == "full" else df_tmp.iloc[fold_train_indices[fold]]

            # baseline model: controls + surprisal
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # one IAS model per layer / horizon / distance-metric combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for horizon in HORIZONS:
                    for dist_metric in DISTANCE_METRICS:
                        # The trailing "_S" keeps e.g. "cosine" from also
                        # matching the "cosine_std" columns.
                        predictors = [
                            p for p in IAS_PREDICTORS
                            if p.startswith(model)
                            and f"_H{horizon}_" in p
                            and f"_L{layer}_" in p
                            and f"_D{dist_metric}_S" in p
                            and p.endswith("Smean")
                        ]
                        assert len(predictors) == 1, (model, layer, horizon, dist_metric, predictors)

                        OLS_model = smf.ols(
                            formula=f'{baseline_formula} + {predictors[0]}',
                            data=df_fit
                        ).fit()
                        model_loglik = OLS_model.llf / OLS_model.nobs

                        # nested-model F-test; row 1 of the table describes the larger model
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_row = anova_results.iloc[1]

                        results_horizon_layer_combinations.append({
                            "y": predicted_var,
                            "metric": "Information value",
                            "model": model,
                            "layer": layer,
                            "layer_idx": layer_idx,
                            "horizon": horizon,
                            "aggregation": "mean",
                            "dist_metric": dist_metric,
                            "fold": fold,
                            # log-likelihood is reported per observation
                            "loglik": model_loglik,
                            "rsquared": OLS_model.rsquared,
                            "rsquared_adj": OLS_model.rsquared_adj,
                            "aic": OLS_model.aic,
                            "bic": OLS_model.bic,
                            "delta_loglik": model_loglik - baseline_loglik,
                            "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                            "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                            "delta_aic": OLS_model.aic - OLS_baseline.aic,
                            "delta_bic": OLS_model.bic - OLS_baseline.bic,
                            "anova_rss": anova_row["ssr"],
                            "anova_delta_ss": anova_row["ss_diff"],
                            "anova_p": anova_row["Pr(>F)"],
                            "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                            "ias_coef": OLS_model.params[predictors[0]],
                        })

results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Write the "over surprisal" horizon/layer results to CSV without the row index.
results_horizon_layer_combinations_df.to_csv("results_final/ols_ns_ias_cosine_std_horizon_layer_over_surprisal.csv", index=False)

### Individual IAS predictors (horizon-layer combinations) against control baseline

In [7]:
# Individual IAS predictors (one per model / layer / horizon) added on top of
# a baseline that contains only the control predictors (no surprisal).
#
# Every comparison is run on all rows (fold == "full") and then on the
# training portion of each split of a seeded 10-fold KFold (fold == 0..9).
# The fold fits are K-fold subsamples (each leaves out one tenth of the rows),
# not bootstrap resamples.
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# control main effects plus all of their pairwise interactions
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    # complete cases only, so baseline and IAS models are fit on identical rows
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Training-row positions for each fold. The split is seeded and depends
    # only on the number of rows, so it is computed once and shared by all models.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    baseline_formula = f'{predicted_var} ~ {baseline_predictors_str}'

    for model in MODEL_NAMES:

        for fold in ["full", *range(len(fold_train_indices))]:
            df_fit = df_tmp if fold == "full" else df_tmp.iloc[fold_train_indices[fold]]

            # baseline model: control predictors only
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # one IAS model per layer / horizon / distance-metric combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for horizon in HORIZONS:
                    for dist_metric in DISTANCE_METRICS:
                        # The trailing "_S" keeps e.g. "cosine" from also
                        # matching the "cosine_std" columns.
                        predictors = [
                            p for p in IAS_PREDICTORS
                            if p.startswith(model)
                            and f"_H{horizon}_" in p
                            and f"_L{layer}_" in p
                            and f"_D{dist_metric}_S" in p
                            and p.endswith("Smean")
                        ]
                        assert len(predictors) == 1, (model, layer, horizon, dist_metric, predictors)

                        OLS_model = smf.ols(
                            formula=f'{baseline_formula} + {predictors[0]}',
                            data=df_fit
                        ).fit()
                        model_loglik = OLS_model.llf / OLS_model.nobs

                        # nested-model F-test; row 1 of the table describes the larger model
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_row = anova_results.iloc[1]

                        results_horizon_layer_combinations.append({
                            "y": predicted_var,
                            "metric": "Information value",
                            "model": model,
                            "layer": layer,
                            "layer_idx": layer_idx,
                            "horizon": horizon,
                            "aggregation": "mean",
                            "dist_metric": dist_metric,
                            "fold": fold,
                            # log-likelihood is reported per observation
                            "loglik": model_loglik,
                            "rsquared": OLS_model.rsquared,
                            "rsquared_adj": OLS_model.rsquared_adj,
                            "aic": OLS_model.aic,
                            "bic": OLS_model.bic,
                            "delta_loglik": model_loglik - baseline_loglik,
                            "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                            "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                            "delta_aic": OLS_model.aic - OLS_baseline.aic,
                            "delta_bic": OLS_model.bic - OLS_baseline.bic,
                            "anova_rss": anova_row["ssr"],
                            "anova_delta_ss": anova_row["ss_diff"],
                            "anova_p": anova_row["Pr(>F)"],
                            "ias_coef": OLS_model.params[predictors[0]],
                        })

results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)



  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Write the "against control" horizon/layer results to CSV without the row index.
results_horizon_layer_combinations_df.to_csv("results_final/ols_ns_ias_cosine_std_horizon_layer_against_control.csv", index=False)

### Individual IAS predictors (horizon-layer combinations) against surprisal baseline, head-to-head

In [9]:
# Head-to-head comparison: controls + one IAS predictor versus controls + the
# same model's surprisal. The two models have the same number of parameters
# and neither is nested in the other.
#
# Every comparison is run on all rows (fold == "full") and then on the
# training portion of each split of a seeded 10-fold KFold (fold == 0..9).
# The fold fits are K-fold subsamples (each leaves out one tenth of the rows),
# not bootstrap resamples.
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# control main effects plus all of their pairwise interactions
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    # complete cases only, so both competing models are fit on identical rows
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Training-row positions for each fold. The split is seeded and depends
    # only on the number of rows, so it is computed once and shared by all models.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    control_formula = f'{predicted_var} ~ {baseline_predictors_str}'

    for model in MODEL_NAMES:

        for fold in ["full", *range(len(fold_train_indices))]:
            df_fit = df_tmp if fold == "full" else df_tmp.iloc[fold_train_indices[fold]]

            # reference model: controls + surprisal
            OLS_baseline = smf.ols(
                formula=f'{control_formula} + {model}_surprisal',
                data=df_fit
            ).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # one competing IAS model per layer / horizon / distance-metric combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for horizon in HORIZONS:
                    for dist_metric in DISTANCE_METRICS:
                        # The trailing "_S" keeps e.g. "cosine" from also
                        # matching the "cosine_std" columns.
                        predictors = [
                            p for p in IAS_PREDICTORS
                            if p.startswith(model)
                            and f"_H{horizon}_" in p
                            and f"_L{layer}_" in p
                            and f"_D{dist_metric}_S" in p
                            and p.endswith("Smean")
                        ]
                        assert len(predictors) == 1, (model, layer, horizon, dist_metric, predictors)

                        # competing model: controls + IAS (surprisal is replaced, not added to)
                        OLS_model = smf.ols(
                            formula=f'{control_formula} + {predictors[0]}',
                            data=df_fit
                        ).fit()
                        model_loglik = OLS_model.llf / OLS_model.nobs

                        results_horizon_layer_combinations.append({
                            "y": predicted_var,
                            "metric": "Information value",
                            "model": model,
                            "layer": layer,
                            "layer_idx": layer_idx,
                            "horizon": horizon,
                            "aggregation": "mean",
                            "dist_metric": dist_metric,
                            "fold": fold,
                            # log-likelihood is reported per observation
                            "loglik": model_loglik,
                            "rsquared": OLS_model.rsquared,
                            "rsquared_adj": OLS_model.rsquared_adj,
                            "aic": OLS_model.aic,
                            "bic": OLS_model.bic,
                            "delta_loglik": model_loglik - baseline_loglik,
                            "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                            "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                            "delta_aic": OLS_model.aic - OLS_baseline.aic,
                            "delta_bic": OLS_model.bic - OLS_baseline.bic,
                            # Residual sums of squares are still comparable, but an
                            # F-test between these non-nested, equal-size models has
                            # zero numerator degrees of freedom, so no valid p-value
                            # exists. It is recorded as missing rather than obtained
                            # from anova_lm via a division by zero. Compare the models
                            # with the delta_* columns instead.
                            "anova_rss": OLS_model.ssr,
                            "anova_delta_ss": OLS_baseline.ssr - OLS_model.ssr,
                            "anova_p": np.nan,
                            # surprisal coefficient comes from the reference model,
                            # the IAS coefficient from the competing model
                            "surprisal_coef": OLS_baseline.params[f"{model}_surprisal"],
                            "ias_coef": OLS_model.params[predictors[0]],
                        })

results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Write the head-to-head ("replace surprisal") horizon/layer results to CSV
# without the row index.
results_horizon_layer_combinations_df.to_csv("results_final/ols_ns_ias_cosine_std_horizon_layer_replace_surprisal.csv", index=False)

### Layer-level predictors (against surprisal baseline)

In [8]:
# Layer-level models: all horizon-specific IAS predictors of one layer are
# added jointly on top of a baseline of control predictors plus the same
# model's surprisal.
#
# Every comparison is run on all rows (fold == "full") and then on the
# training portion of each split of a seeded 10-fold KFold (fold == 0..9).
# The fold fits are K-fold subsamples (each leaves out one tenth of the rows),
# not bootstrap resamples. The nested-model F-test is only computed for the
# full-data fit.
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# control main effects plus all of their pairwise interactions
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# records are collected in a plain list and converted to a DataFrame at the end
results_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    # complete cases only, so baseline and IAS models are fit on identical rows
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Training-row positions for each fold. The split is seeded and depends
    # only on the number of rows, so it is computed once and shared by all models.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    for model in MODEL_NAMES:
        baseline_formula = f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal'

        for fold in ["full", *range(len(fold_train_indices))]:
            df_fit = df_tmp if fold == "full" else df_tmp.iloc[fold_train_indices[fold]]

            # baseline model: controls + surprisal
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # one IAS model per layer / distance-metric combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for dist_metric in DISTANCE_METRICS:
                    # The trailing "_S" keeps e.g. "cosine" from also
                    # matching the "cosine_std" columns.
                    predictors = [
                        p for p in IAS_PREDICTORS
                        if p.startswith(model)
                        and f"_L{layer}_" in p
                        and f"_D{dist_metric}_S" in p
                        and p.endswith("Smean")
                    ]
                    # expect exactly one predictor per horizon
                    assert len(predictors) == len(HORIZONS), (model, layer, dist_metric, predictors)

                    ias_terms = "+".join(predictors)
                    OLS_model = smf.ols(
                        formula=f'{baseline_formula} + {ias_terms}',
                        data=df_fit
                    ).fit()
                    model_loglik = OLS_model.llf / OLS_model.nobs

                    if fold == "full":
                        # nested-model F-test; row 1 of the table describes the larger model
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_row = anova_results.iloc[1]
                        anova_rss = anova_row["ssr"]
                        anova_delta_ss = anova_row["ss_diff"]
                        anova_p = anova_row["Pr(>F)"]
                    else:
                        # No F-test on the folds. NaN (rather than "") keeps these
                        # columns numeric and is still written as an empty CSV field.
                        anova_rss = anova_delta_ss = anova_p = np.nan

                    results_layer_combinations.append({
                        "y": predicted_var,
                        "metric": "Information value",
                        "model": model,
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": "All",
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold,
                        # log-likelihood is reported per observation
                        "loglik": model_loglik,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": model_loglik - baseline_loglik,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_rss,
                        "anova_delta_ss": anova_delta_ss,
                        "anova_p": anova_p,
                    })

results_layer_combinations_df = pd.DataFrame(results_layer_combinations)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Write the layer-level results (surprisal baseline) to CSV without the row index.
# NOTE(review): this is the only export in the visible cells that targets
# "results/" instead of "results_final/" - confirm the directory is intended.
results_layer_combinations_df.to_csv("results/ols_ns_ias_cosine_std_layer_against_surprisal.csv", index=False)

In [9]:
# Layer-level predictors (against control baseline): all horizon-specific IAS
# predictors of one layer are added jointly on top of a baseline that contains
# only the control predictors (no surprisal).
#
# Every comparison is run on all rows (fold == "full") and then on the
# training portion of each split of a seeded 10-fold KFold (fold == 0..9).
# The fold fits are K-fold subsamples (each leaves out one tenth of the rows),
# not bootstrap resamples. The nested-model F-test is only computed for the
# full-data fit.
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# control main effects plus all of their pairwise interactions
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# records are collected in a plain list and converted to a DataFrame at the end
results_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    # complete cases only, so baseline and IAS models are fit on identical rows
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Training-row positions for each fold. The split is seeded and depends
    # only on the number of rows, so it is computed once and shared by all models.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    baseline_formula = f'{predicted_var} ~ {baseline_predictors_str}'

    for model in MODEL_NAMES:

        for fold in ["full", *range(len(fold_train_indices))]:
            df_fit = df_tmp if fold == "full" else df_tmp.iloc[fold_train_indices[fold]]

            # baseline model: control predictors only
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # one IAS model per layer / distance-metric combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for dist_metric in DISTANCE_METRICS:
                    # The trailing "_S" keeps e.g. "cosine" from also
                    # matching the "cosine_std" columns.
                    predictors = [
                        p for p in IAS_PREDICTORS
                        if p.startswith(model)
                        and f"_L{layer}_" in p
                        and f"_D{dist_metric}_S" in p
                        and p.endswith("Smean")
                    ]
                    # expect exactly one predictor per horizon
                    assert len(predictors) == len(HORIZONS), (model, layer, dist_metric, predictors)

                    ias_terms = "+".join(predictors)
                    OLS_model = smf.ols(
                        formula=f'{baseline_formula} + {ias_terms}',
                        data=df_fit
                    ).fit()
                    model_loglik = OLS_model.llf / OLS_model.nobs

                    if fold == "full":
                        # nested-model F-test; row 1 of the table describes the larger model
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_row = anova_results.iloc[1]
                        anova_rss = anova_row["ssr"]
                        anova_delta_ss = anova_row["ss_diff"]
                        anova_p = anova_row["Pr(>F)"]
                    else:
                        # No F-test on the folds. NaN (rather than "") keeps these
                        # columns numeric and is still written as an empty CSV field.
                        anova_rss = anova_delta_ss = anova_p = np.nan

                    results_layer_combinations.append({
                        "y": predicted_var,
                        "metric": "Information value",
                        "model": model,
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": "All",
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold,
                        # log-likelihood is reported per observation
                        "loglik": model_loglik,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": model_loglik - baseline_loglik,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_rss,
                        "anova_delta_ss": anova_delta_ss,
                        "anova_p": anova_p,
                    })

results_layer_combinations_df = pd.DataFrame(results_layer_combinations)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Persist the layer-level results (control baseline) without the row index.
results_layer_combinations_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_layer_against_control.csv", index=False)

### Horizon-level predictors (against surprisal baseline)

In [14]:
# Horizon-level analysis against the surprisal baseline.
#
# For every response variable and language model, compare
#   baseline: response ~ controls (+ pairwise interactions) + <model>_surprisal
#   IAS:      baseline + all mean-aggregated IAS predictors of ONE horizon
# once on all rows (fold == "full", with a nested-model F-test) and once on each
# of the ten training partitions of a seeded KFold split (fold == 0..9).
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# Control main effects followed by all of their pairwise interactions.
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# Records are collected in a plain list and converted to a DataFrame once at the end.
horizon_records = []

for predicted_var in tqdm(predicted_variables):

    # Drop every row with a missing value in the response, a control or any
    # information predictor, so all fits for this response use the same rows.
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Seeded, shuffled 10-fold split; only the training side (~90% of the rows) of
    # each fold is used and the held-out rows are discarded. This is subsampling,
    # not bootstrapping, and all reported metrics are in-sample. The integer seed
    # makes the partitions identical for every model, so they are computed once.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    for model in MODEL_NAMES:

        baseline_formula = f"{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal"
        # For a fixed horizon and distance metric, one predictor per layer is expected.
        n_expected_predictors = len(LAYERS[model])

        # "full" is fit on all rows; the integer labels index the fold training partitions.
        for fold in ["full", *range(len(fold_train_indices))]:
            is_full_data = fold == "full"
            df_fit = df_tmp if is_full_data else df_tmp.iloc[fold_train_indices[fold]]

            # Baseline (controls + surprisal) refit on exactly the rows the IAS model sees.
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # Fit one IAS model per horizon (all layers of that horizon enter together).
            for horizon in HORIZONS:
                for dist_metric in DISTANCE_METRICS:
                    predictors = [
                        p for p in IAS_PREDICTORS
                        if p.startswith(model)
                        and f"_H{horizon}_" in p
                        and f"_D{dist_metric}_S" in p
                        and p.endswith("Smean")
                    ]
                    assert len(predictors) == n_expected_predictors, (
                        f"expected {n_expected_predictors} IAS predictors for {model}, "
                        f"horizon {horizon}, {dist_metric}; found {len(predictors)}"
                    )

                    OLS_model = smf.ols(
                        formula=f'{baseline_formula} + {"+".join(predictors)}',
                        data=df_fit
                    ).fit()
                    model_loglik = OLS_model.llf / OLS_model.nobs

                    if is_full_data:
                        # Row 1 of the ANOVA table describes the IAS model relative to the baseline.
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_rss = anova_results.loc[1, "ssr"]
                        anova_delta_ss = anova_results.loc[1, "ss_diff"]
                        anova_p = anova_results.loc[1, "Pr(>F)"]
                    else:
                        # No F-test is run on the fold subsamples.
                        anova_rss = anova_delta_ss = anova_p = ""

                    horizon_records.append({
                        "y": predicted_var,
                        "metric": "Information value",
                        "model": model,
                        "layer": "All",
                        "layer_idx": "All",
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold,
                        # Log-likelihoods are normalised by the number of observations.
                        "loglik": model_loglik,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": model_loglik - baseline_loglik,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_rss,
                        "anova_delta_ss": anova_delta_ss,
                        "anova_p": anova_p,
                    })


results_horizon_combinations_df = pd.DataFrame(horizon_records)


  0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Persist the horizon-level results (surprisal baseline) without the row index.
results_horizon_combinations_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_horizon_against_surprisal.csv", index=False)

In [11]:
# Horizon-level analysis against the control baseline.
#
# For every response variable and language model, compare
#   baseline: response ~ controls (+ pairwise interactions)      [no surprisal term]
#   IAS:      baseline + all mean-aggregated IAS predictors of ONE horizon
# once on all rows (fold == "full", with a nested-model F-test) and once on each
# of the ten training partitions of a seeded KFold split (fold == 0..9).
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# Control main effects followed by all of their pairwise interactions.
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# Records are collected in a plain list and converted to a DataFrame once at the end.
horizon_records = []

for predicted_var in tqdm(predicted_variables):

    # Drop every row with a missing value in the response, a control or any
    # information predictor, so all fits for this response use the same rows.
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Seeded, shuffled 10-fold split; only the training side (~90% of the rows) of
    # each fold is used and the held-out rows are discarded. This is subsampling,
    # not bootstrapping, and all reported metrics are in-sample. The integer seed
    # makes the partitions identical for every model, so they are computed once.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    # The controls-only baseline does not depend on the language model.
    baseline_formula = f"{predicted_var} ~ {baseline_predictors_str}"

    for model in MODEL_NAMES:

        # For a fixed horizon and distance metric, one predictor per layer is expected.
        n_expected_predictors = len(LAYERS[model])

        # "full" is fit on all rows; the integer labels index the fold training partitions.
        for fold in ["full", *range(len(fold_train_indices))]:
            is_full_data = fold == "full"
            df_fit = df_tmp if is_full_data else df_tmp.iloc[fold_train_indices[fold]]

            # Baseline (controls only) refit on exactly the rows the IAS model sees.
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            # Fit one IAS model per horizon (all layers of that horizon enter together).
            for horizon in HORIZONS:
                for dist_metric in DISTANCE_METRICS:
                    predictors = [
                        p for p in IAS_PREDICTORS
                        if p.startswith(model)
                        and f"_H{horizon}_" in p
                        and f"_D{dist_metric}_S" in p
                        and p.endswith("Smean")
                    ]
                    assert len(predictors) == n_expected_predictors, (
                        f"expected {n_expected_predictors} IAS predictors for {model}, "
                        f"horizon {horizon}, {dist_metric}; found {len(predictors)}"
                    )

                    OLS_model = smf.ols(
                        formula=f'{baseline_formula} + {"+".join(predictors)}',
                        data=df_fit
                    ).fit()
                    model_loglik = OLS_model.llf / OLS_model.nobs

                    if is_full_data:
                        # Row 1 of the ANOVA table describes the IAS model relative to the baseline.
                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                        anova_rss = anova_results.loc[1, "ssr"]
                        anova_delta_ss = anova_results.loc[1, "ss_diff"]
                        anova_p = anova_results.loc[1, "Pr(>F)"]
                    else:
                        # No F-test is run on the fold subsamples.
                        anova_rss = anova_delta_ss = anova_p = ""

                    horizon_records.append({
                        "y": predicted_var,
                        "metric": "Information value",
                        "model": model,
                        "layer": "All",
                        "layer_idx": "All",
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold,
                        # Log-likelihoods are normalised by the number of observations.
                        "loglik": model_loglik,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": model_loglik - baseline_loglik,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_rss,
                        "anova_delta_ss": anova_delta_ss,
                        "anova_p": anova_p,
                    })


results_horizon_combinations_df = pd.DataFrame(horizon_records)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Persist the horizon-level results (control baseline) without the row index.
results_horizon_combinations_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_horizon_against_control.csv", index=False)

: 

### Full model 

#### Against surprisal baseline

In [7]:
# Full-model analysis against the surprisal baseline.
#
# For every response variable and language model, compare
#   baseline: response ~ controls (+ pairwise interactions) + <model>_surprisal
#   IAS:      baseline + ALL mean-aggregated IAS predictors of that model
#             (every layer x horizon combination for the distance metric)
# once on all rows (fold == "full", with a nested-model F-test) and once on each
# of the ten training partitions of a seeded KFold split (fold == 0..9).
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# Control main effects followed by all of their pairwise interactions.
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# Records are collected in a plain list and converted to a DataFrame once at the end.
full_model_records = []

for predicted_var in tqdm(predicted_variables):

    # Drop every row with a missing value in the response, a control or any
    # information predictor, so all fits for this response use the same rows.
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Seeded, shuffled 10-fold split; only the training side (~90% of the rows) of
    # each fold is used and the held-out rows are discarded. This is subsampling,
    # not bootstrapping, and all reported metrics are in-sample. The integer seed
    # makes the partitions identical for every model, so they are computed once.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    for model in MODEL_NAMES:

        baseline_formula = f"{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal"
        # One predictor per (layer, horizon) pair is expected for each distance metric.
        n_expected_predictors = len(LAYERS[model]) * len(HORIZONS)

        # "full" is fit on all rows; the integer labels index the fold training partitions.
        for fold in ["full", *range(len(fold_train_indices))]:
            is_full_data = fold == "full"
            df_fit = df_tmp if is_full_data else df_tmp.iloc[fold_train_indices[fold]]

            # Baseline (controls + surprisal) refit on exactly the rows the IAS model sees.
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            for dist_metric in DISTANCE_METRICS:
                predictors = [
                    p for p in IAS_PREDICTORS
                    if p.startswith(model)
                    and f"_D{dist_metric}_S" in p
                    and p.endswith("Smean")
                ]
                assert len(predictors) == n_expected_predictors, (
                    f"expected {n_expected_predictors} IAS predictors for {model}, "
                    f"{dist_metric}; found {len(predictors)}"
                )

                OLS_model = smf.ols(
                    formula=f'{baseline_formula} + {"+".join(predictors)}',
                    data=df_fit
                ).fit()
                model_loglik = OLS_model.llf / OLS_model.nobs

                if is_full_data:
                    # Row 1 of the ANOVA table describes the IAS model relative to the baseline.
                    anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                    anova_rss = anova_results.loc[1, "ssr"]
                    anova_delta_ss = anova_results.loc[1, "ss_diff"]
                    anova_p = anova_results.loc[1, "Pr(>F)"]
                else:
                    # No F-test is run on the fold subsamples.
                    anova_rss = anova_delta_ss = anova_p = ""

                full_model_records.append({
                    "y": predicted_var,
                    "metric": "Information value",
                    "model": model,
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": fold,
                    # Log-likelihoods are normalised by the number of observations.
                    "loglik": model_loglik,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": model_loglik - baseline_loglik,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_rss,
                    "anova_delta_ss": anova_delta_ss,
                    "anova_p": anova_p,
                })


results_full_model_df = pd.DataFrame(full_model_records)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Persist the full-model results (surprisal baseline) without the row index.
results_full_model_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_full_model_against_surprisal.csv", index=False)

#### Against control baseline

In [9]:
# Full-model analysis against the control baseline.
#
# For every response variable and language model, compare
#   baseline: response ~ controls (+ pairwise interactions)      [no surprisal term]
#   IAS:      baseline + ALL mean-aggregated IAS predictors of that model
#             (every layer x horizon combination for the distance metric)
# once on all rows (fold == "full", with a nested-model F-test) and once on each
# of the ten training partitions of a seeded KFold split (fold == 0..9).
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
# Control main effects followed by all of their pairwise interactions.
baseline_predictors_str = " + ".join(
    baseline_predictors + [f"{a}:{b}" for a, b in all_pairs(baseline_predictors)]
)

# Records are collected in a plain list and converted to a DataFrame once at the end.
full_model_records = []

for predicted_var in tqdm(predicted_variables):

    # Drop every row with a missing value in the response, a control or any
    # information predictor, so all fits for this response use the same rows.
    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # Seeded, shuffled 10-fold split; only the training side (~90% of the rows) of
    # each fold is used and the held-out rows are discarded. This is subsampling,
    # not bootstrapping, and all reported metrics are in-sample. The integer seed
    # makes the partitions identical for every model, so they are computed once.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    fold_train_indices = [train_idx for train_idx, _ in kf.split(df_tmp)]

    # The controls-only baseline does not depend on the language model.
    baseline_formula = f"{predicted_var} ~ {baseline_predictors_str}"

    for model in MODEL_NAMES:

        # One predictor per (layer, horizon) pair is expected for each distance metric.
        n_expected_predictors = len(LAYERS[model]) * len(HORIZONS)

        # "full" is fit on all rows; the integer labels index the fold training partitions.
        for fold in ["full", *range(len(fold_train_indices))]:
            is_full_data = fold == "full"
            df_fit = df_tmp if is_full_data else df_tmp.iloc[fold_train_indices[fold]]

            # Baseline (controls only) refit on exactly the rows the IAS model sees.
            OLS_baseline = smf.ols(formula=baseline_formula, data=df_fit).fit()
            baseline_loglik = OLS_baseline.llf / OLS_baseline.nobs

            for dist_metric in DISTANCE_METRICS:
                predictors = [
                    p for p in IAS_PREDICTORS
                    if p.startswith(model)
                    and f"_D{dist_metric}_S" in p
                    and p.endswith("Smean")
                ]
                assert len(predictors) == n_expected_predictors, (
                    f"expected {n_expected_predictors} IAS predictors for {model}, "
                    f"{dist_metric}; found {len(predictors)}"
                )

                OLS_model = smf.ols(
                    formula=f'{baseline_formula} + {"+".join(predictors)}',
                    data=df_fit
                ).fit()
                model_loglik = OLS_model.llf / OLS_model.nobs

                if is_full_data:
                    # Row 1 of the ANOVA table describes the IAS model relative to the baseline.
                    anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)
                    anova_rss = anova_results.loc[1, "ssr"]
                    anova_delta_ss = anova_results.loc[1, "ss_diff"]
                    anova_p = anova_results.loc[1, "Pr(>F)"]
                else:
                    # No F-test is run on the fold subsamples.
                    anova_rss = anova_delta_ss = anova_p = ""

                full_model_records.append({
                    "y": predicted_var,
                    "metric": "Information value",
                    "model": model,
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": fold,
                    # Log-likelihoods are normalised by the number of observations.
                    "loglik": model_loglik,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": model_loglik - baseline_loglik,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared,
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj,
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_rss,
                    "anova_delta_ss": anova_delta_ss,
                    "anova_p": anova_p,
                })


results_full_model_df = pd.DataFrame(full_model_records)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Persist the full-model results (control baseline) without the row index.
results_full_model_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_full_model_against_control.csv", index=False)

### Surprisal together with incremental information value

In [5]:
predicted_variables = PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_comparison_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()
    
    # baseline model
    OLS_baseline = smf.ols(
        formula=f'{predicted_var} ~ {baseline_predictors_str}', 
        data=df_tmp
    ).fit()

    results_comparison_df.append({
        "y": predicted_var, 
        "metric": "Baseline", 
        "model": "", 
        "aggregation": "",
        "dist_metric": "",
        "fold": "full", 
        "loglik": OLS_baseline.llf / OLS_baseline.nobs,
        "delta_loglik": "",
        "rsquared": OLS_baseline.rsquared,
        "delta_rsquared": "",
        "rsquared_adj": OLS_baseline.rsquared_adj,
        "delta_rsquared_adj": "",
        "aic": OLS_baseline.aic,
        "bic": OLS_baseline.bic,
        "anova_p_vs_baseline": "",
        "anova_p_ias_vs_surprisal": "",
        "anova_p_surprisal_vs_ias": "",
    })
    
    for model in MODEL_NAMES:

        OLS_surprisal = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()

        anova_results_surprisal_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_surprisal)

        results_comparison_df.append({
            "y": predicted_var, 
            "metric": "Surprisal", 
            "model": model, 
            "aggregation": "",
            "dist_metric": "",
            "fold": "full", 
            "loglik": OLS_surprisal.llf / OLS_surprisal.nobs,
            "delta_loglik": OLS_surprisal.llf / OLS_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
            "rsquared": OLS_surprisal.rsquared,
            "delta_rsquared": OLS_surprisal.rsquared - OLS_baseline.rsquared,
            "rsquared_adj": OLS_surprisal.rsquared_adj,
            "delta_rsquared_adj": OLS_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
            "aic": OLS_surprisal.aic,
            "bic": OLS_surprisal.bic,
            "anova_p_vs_baseline": anova_results_surprisal_vs_baseline['Pr(>F)'][1],
            "anova_p_ias_vs_surprisal": "",
            "anova_p_surprisal_vs_ias": "",
            "anova_p_both_vs_ias": "",
            "anova_p_both_vs_surprisal": "",
        })
            
        # fit IAS models 
        for dist_metric in DISTANCE_METRICS:
            for aggregation in ["Smean", "Smin"]:
                predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith(aggregation)]

                assert(len(predictors) == 130)

                OLS_ias = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results_ias_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_ias)
                anova_results_ias_vs_surprisal = sm.stats.anova_lm(OLS_surprisal, OLS_ias)
                anova_results_surprisal_vs_ias = sm.stats.anova_lm(OLS_ias, OLS_surprisal)

                results_comparison_df.append({
                    "y": predicted_var, 
                    "metric": f"IAS ({aggregation[1:]})",
                    "model": model, 
                    "aggregation": aggregation,
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_ias.llf / OLS_ias.nobs,
                    "delta_loglik": OLS_ias.llf / OLS_ias.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "rsquared": OLS_ias.rsquared,
                    "delta_rsquared": OLS_ias.rsquared - OLS_baseline.rsquared,
                    "rsquared_adj": OLS_ias.rsquared_adj,
                    "delta_rsquared_adj": OLS_ias.rsquared_adj - OLS_baseline.rsquared_adj,
                    "aic": OLS_ias.aic,
                    "bic": OLS_ias.bic,
                    "anova_p_vs_baseline": anova_results_ias_vs_baseline['Pr(>F)'][1],
                    "anova_p_ias_vs_surprisal": anova_results_ias_vs_surprisal['Pr(>F)'][1],
                    "anova_p_surprisal_vs_ias": anova_results_surprisal_vs_ias['Pr(>F)'][1],
                    "anova_p_both_vs_ias": "",
                    "anova_p_both_vs_surprisal": "",
                })

                OLS_ias_surprisal = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)} + {model}_surprisal',
                    data=df_tmp
                ).fit()

                anova_results_both_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_ias_surprisal)
                anova_results_both_vs_ias = sm.stats.anova_lm(OLS_ias, OLS_ias_surprisal)
                anova_results_both_vs_surprisal = sm.stats.anova_lm(OLS_surprisal, OLS_ias_surprisal)

                results_comparison_df.append({
                    "y": predicted_var, 
                    "metric": f"Surprisal + IAS ({aggregation[1:]})",
                    "model": model, 
                    "aggregation": aggregation,
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs,
                    "delta_loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "rsquared": OLS_ias_surprisal.rsquared,
                    "delta_rsquared": OLS_ias_surprisal.rsquared - OLS_baseline.rsquared,
                    "rsquared_adj": OLS_ias_surprisal.rsquared_adj,
                    "delta_rsquared_adj": OLS_ias_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
                    "aic": OLS_ias_surprisal.aic,
                    "bic": OLS_ias_surprisal.bic,
                    "anova_p_vs_baseline": anova_results_both_vs_baseline['Pr(>F)'][1],
                    "anova_p_ias_vs_surprisal": "",
                    "anova_p_surprisal_vs_ias": "",
                    "anova_p_both_vs_ias": anova_results_both_vs_ias['Pr(>F)'][1],
                    "anova_p_both_vs_surprisal": anova_results_both_vs_surprisal['Pr(>F)'][1],
                })
            
    
    
def _fold_result_row(fit, baseline_fit, y, metric, model, aggregation, dist_metric, fold):
    """Build one record for `results_comparison_df` from a fitted OLS model.

    Parameters
    ----------
    fit
        Fitted regression results; `llf`, `nobs`, `rsquared`, `rsquared_adj`,
        `aic` and `bic` are read from it.
    baseline_fit
        Fitted baseline results used for the `delta_*` columns, or `None`
        when `fit` is the baseline itself (the deltas are then left as "").
    y, metric, model, aggregation, dist_metric, fold
        Identifying values copied verbatim into the record.

    Returns
    -------
    dict
        A record with the same keys for every kind of model, so no column
        ends up as NaN for one model type and "" for the others.
    """
    loglik = fit.llf / fit.nobs  # per-observation log-likelihood

    if baseline_fit is None:
        delta_loglik = ""
        delta_rsquared = ""
        delta_rsquared_adj = ""
    else:
        delta_loglik = loglik - baseline_fit.llf / baseline_fit.nobs
        delta_rsquared = fit.rsquared - baseline_fit.rsquared
        delta_rsquared_adj = fit.rsquared_adj - baseline_fit.rsquared_adj

    return {
        "y": y,
        "metric": metric,
        "model": model,
        "aggregation": aggregation,
        "dist_metric": dist_metric,
        "fold": fold,
        "loglik": loglik,
        "delta_loglik": delta_loglik,
        "rsquared": fit.rsquared,
        "delta_rsquared": delta_rsquared,
        "rsquared_adj": fit.rsquared_adj,
        "delta_rsquared_adj": delta_rsquared_adj,
        "aic": fit.aic,
        "bic": fit.bic,
        # No ANOVA comparisons are run on the per-fold fits; the columns are
        # kept (empty) so that all records share one schema.
        "anova_p_vs_baseline": "",
        "anova_p_ias_vs_surprisal": "",
        "anova_p_surprisal_vs_ias": "",
        "anova_p_both_vs_ias": "",
        "anova_p_both_vs_surprisal": "",
    }


for predicted_var in tqdm(predicted_variables):

    df_tmp = ns_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # ---------------------------------------------------
    # 10-fold resampling: every model is refit on the training portion of
    # each shuffled KFold split. The held-out indices are not used here.
    # ---------------------------------------------------
    kf = KFold(n_splits=10, random_state=42, shuffle=True)

    for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
        df_tmp_fold = df_tmp.iloc[split_indices]

        # Baseline model: the formula contains only `baseline_predictors_str`
        # (no surprisal or IAS term is added at this point).
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}',
            data=df_tmp_fold
        ).fit()

        results_comparison_df.append(
            _fold_result_row(OLS_baseline, None, predicted_var, "Baseline", "", "", "", fold)
        )

        for model in MODEL_NAMES:

            # Baseline + this language model's surprisal.
            OLS_surprisal = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal',
                data=df_tmp_fold
            ).fit()

            results_comparison_df.append(
                _fold_result_row(OLS_surprisal, OLS_baseline, predicted_var, "Surprisal", model, "", "", fold)
            )

            # fit IAS models
            for dist_metric in DISTANCE_METRICS:
                for aggregation in ["Smean", "Smin"]:
                    predictors = [
                        p for p in IAS_PREDICTORS
                        if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith(aggregation)
                    ]

                    # Sanity check on the column selection.
                    # NOTE(review): 130 presumably equals horizons x layers per model — confirm.
                    assert len(predictors) == 130, (
                        f"expected 130 IAS predictors for {model}/{dist_metric}/{aggregation}, "
                        f"found {len(predictors)}"
                    )

                    # Baseline + all selected IAS predictors.
                    OLS_ias = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()

                    results_comparison_df.append(
                        _fold_result_row(
                            OLS_ias, OLS_baseline, predicted_var,
                            f"IAS ({aggregation[1:]})", model, aggregation, dist_metric, fold
                        )
                    )

                    # Baseline + all selected IAS predictors + surprisal.
                    OLS_ias_surprisal = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)} + {model}_surprisal',
                        data=df_tmp_fold
                    ).fit()

                    results_comparison_df.append(
                        _fold_result_row(
                            OLS_ias_surprisal, OLS_baseline, predicted_var,
                            f"Surprisal + IAS ({aggregation[1:]})", model, aggregation, dist_metric, fold
                        )
                    )


# Turn the accumulated list of records into a single DataFrame.
results_comparison_df = pd.DataFrame(results_comparison_df)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Write the comparison results to disk as CSV, without the row index.
results_comparison_df.to_csv(path_or_buf="results_final/ols_ns_ias_cosine_std_comparison_all.csv", index=False)
