In [10]:
import copy
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy import stats

from tqdm.notebook import tqdm
from sklearn.model_selection import KFold 


### Load the preprocessed Aligned corpus (includes surprisal and incremental information value estimates)

In [11]:
aligned_norm = pd.read_csv("preprocessed_corpora/aligned_preprocessed_normalised.csv")

# aligned.columns = aligned.columns.str.replace("-", "_")
aligned_norm.columns = aligned_norm.columns.str.replace("-", "_")


In [12]:
# For interaction terms

def all_pairs(list):
    """
    Returns all possible pairs of elements in a list
    """
    pairs = []
    for i in range(len(list)):
        for j in range(i+1, len(list)):
            pairs.append([list[i], list[j]])
    return pairs

In [13]:
# Constants

DISTANCE_METRICS = ["cosine_std"]  #["euclidean", "cosine", "euclidean_std", "cosine_std"]
MODEL_NAMES = ['gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl']
HORIZONS = range(1, 11)
LAYERS = {
    "gpt2_small": list(range(0, 13)),
    "gpt2_medium": list(range(0, 25, 2)),
    "gpt2_large": list(range(0, 37, 3)),
    "gpt2_xl": list(range(0, 49, 4))
}



RATINGS = ['rating_mean','rating_sd', 'cloze_p_smoothed', 'cloze_s', 'entropy']
ERP = ['ELAN', 'LAN', 'N400', 'EPNP', 'P600', 'PNP']
RT = ['RTfirstfix', 'RTfirstpass', 'RTrightbound', 'RTgopast', 'self_paced_reading_time']
ALL_PREDICTED_VARIABLES = RATINGS + RT + ERP

BASELINE_PREDICTORS = ['Subtlex_log10', 'context_length', 'length']
SURPRISAL_PREDICTORS = [col for col in aligned_norm if '_surprisal' in col]
IAS_PREDICTORS = [col for col in aligned_norm if '_ias_' in col]
ALL_INFORMATION_PREDICTORS = SURPRISAL_PREDICTORS + IAS_PREDICTORS 


In [14]:
# # take the log of all IAS predictors
# for col in IAS_PREDICTORS:
#     aligned_norm[col] = np.log(aligned_norm[col] + 1e-10)

### Horizon-layer combinations

#### Surprisal baseline

In [None]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model including surprisal
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models by horizon-layer combination
        for layer_idx, layer in enumerate(LAYERS[model]):
            for horizon in range(1, 11):
                for dist_metric in DISTANCE_METRICS:
                    predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    
                    assert(len(predictors) == 1)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {predictors[0]}',
                        data=df_tmp
                    ).fit()

                    anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                    results_horizon_layer_combinations.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": "full", 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_results.ssr[1],
                        "anova_delta_ss": anova_results.ss_diff[1],
                        "anova_p": anova_results['Pr(>F)'][1],
                        "ftest_p": "",
                        "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                        "ias_coef": OLS_model.params[predictors[0]],
                    })
        

        # # ---------------------------------------------------
        # # 10-fold bootstrapping for IAS models
        # # ---------------------------------------------------
        # kf = KFold(n_splits=10, random_state=42, shuffle=True)
        # kf.get_n_splits(df_tmp) 

        # for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
        #     df_tmp_fold = df_tmp.iloc[split_indices]

        #     # first fit baseline model including surprisal
        #     OLS_baseline = smf.ols(
        #         formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
        #         data=df_tmp_fold
        #     ).fit()     

        #     # fit IAS models by horizon-layer combination
        #     for layer_idx, layer in enumerate(LAYERS[model]):
        #         for horizon in range(1, 11):
        #             for dist_metric in DISTANCE_METRICS:
        #                 predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]

        #                 assert(len(predictors) == 1)

        #                 OLS_model = smf.ols(
        #                     formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {predictors[0]}',
        #                     data=df_tmp_fold
        #                 ).fit()

        #                 anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

        #                 results_horizon_layer_combinations.append({
        #                     "y": predicted_var, 
        #                     "metric": "Information value", 
        #                     "model": model, 
        #                     "layer": layer,
        #                     "layer_idx": layer_idx,
        #                     "horizon": horizon,
        #                     "aggregation": "mean",
        #                     "dist_metric": dist_metric,
        #                     "fold": fold, 
        #                     "loglik": OLS_model.llf,
        #                     "rsquared": OLS_model.rsquared,
        #                     "rsquared_adj": OLS_model.rsquared_adj,
        #                     "aic": OLS_model.aic,
        #                     "bic": OLS_model.bic,
        #                     "delta_loglik": OLS_model.llf - OLS_baseline.llf,
        #                     "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
        #                     "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
        #                     "delta_aic": OLS_model.aic - OLS_baseline.aic,
        #                     "delta_bic": OLS_model.bic - OLS_baseline.bic,
        #                     "anova_rss": anova_results.ssr[1],
        #                     "anova_delta_ss": anova_results.ss_diff[1],
        #                     "anova_p": anova_results['Pr(>F)'][1],
        #                     "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
        #                     "ias_coef": OLS_model.params[predictors[0]],
        #                 })

       
results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)


  0%|          | 0/16 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
OLS_model.nobs

1466.0

In [7]:
results_horizon_layer_combinations_df.to_csv(
    "results/ols_aligned_ias_cosine_std_horizon_layer_over_surprisal.csv",
    index=False
)

#### Against control baseline

In [16]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model including control predictors
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models by horizon-layer combination
        for layer_idx, layer in enumerate(LAYERS[model]):
            for horizon in range(1, 11):
                for dist_metric in DISTANCE_METRICS:
                    predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]

                    assert(len(predictors) == 1)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {predictors[0]}',
                        data=df_tmp
                    ).fit()

                    anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                    results_horizon_layer_combinations.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": "full", 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_results.ssr[1],
                        "anova_delta_ss": anova_results.ss_diff[1],
                        "anova_p": anova_results['Pr(>F)'][1],
                        "ias_coef": OLS_model.params[predictors[0]],
                    })

        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp)

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including control predictors
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str}', 
                data=df_tmp_fold
            ).fit()     

            # fit IAS models by horizon-layer combination
            for layer_idx, layer in enumerate(LAYERS[model]):
                for horizon in range(1, 11):
                    for dist_metric in DISTANCE_METRICS:
                        predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]

                        assert(len(predictors) == 1)

                        OLS_model = smf.ols(
                            formula=f'{predicted_var} ~ {baseline_predictors_str} + {predictors[0]}',
                            data=df_tmp_fold
                        ).fit()

                        anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                        results_horizon_layer_combinations.append({
                            "y": predicted_var, 
                            "metric": "Information value", 
                            "model": model, 
                            "layer": layer,
                            "layer_idx": layer_idx,
                            "horizon": horizon,
                            "aggregation": "mean",
                            "dist_metric": dist_metric,
                            "fold": fold, 
                            "loglik": OLS_model.llf / OLS_model.nobs,
                            "rsquared": OLS_model.rsquared,
                            "rsquared_adj": OLS_model.rsquared_adj,
                            "aic": OLS_model.aic,
                            "bic": OLS_model.bic,
                            "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                            "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                            "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                            "delta_aic": OLS_model.aic - OLS_baseline.aic,
                            "delta_bic": OLS_model.bic - OLS_baseline.bic,
                            "anova_rss": anova_results.ssr[1],
                            "anova_delta_ss": anova_results.ss_diff[1],
                            "anova_p": anova_results['Pr(>F)'][1],
                            "ias_coef": OLS_model.params[predictors[0]],
                        })
       
results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)


  0%|          | 0/16 [00:00<?, ?it/s]

In [17]:
results_horizon_layer_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_horizon_layer_against_control.csv",
    index=False
)

#### Head-to-head comparison with surprisal

In [29]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_horizon_layer_combinations = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model including surprisal
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models by horizon-layer combination
        for layer_idx, layer in enumerate(LAYERS[model]):
            for horizon in range(1, 11):
                for dist_metric in DISTANCE_METRICS:
                    predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    
                    assert(len(predictors) == 1)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {predictors[0]}',
                        data=df_tmp
                    ).fit()

                    anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                    f = anova_results.ssr[0] / anova_results.ssr[1]
                    df0 = OLS_baseline.df_model
                    df1 = OLS_model.df_model
                    p_value = stats.f.cdf(f, df0, df1)

                    print('f', p_value)
                    print('anova', anova_results['Pr(>F)'][1])
                    

                    results_horizon_layer_combinations.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": "full", 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": anova_results.ssr[1],
                        "anova_delta_ss": anova_results.ss_diff[1],
                        "anova_p": anova_results['Pr(>F)'][1],
                        "surprisal_coef": OLS_baseline.params[f"{model}_surprisal"],
                        "ias_coef": OLS_model.params[predictors[0]],
                    })

        # ---------------------------------------------------
        # # 10-fold bootstrapping for IAS models
        # # ---------------------------------------------------

        # kf = KFold(n_splits=10, random_state=42, shuffle=True)
        # kf.get_n_splits(df_tmp)

        # for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
        #     df_tmp_fold = df_tmp.iloc[split_indices]

        #     # first fit baseline model including surprisal
        #     OLS_baseline = smf.ols(
        #         formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
        #         data=df_tmp_fold
        #     ).fit()     

            # # fit IAS models by horizon-layer combination
            # for layer_idx, layer in enumerate(LAYERS[model]):
            #     for horizon in range(1, 11):
            #         for dist_metric in DISTANCE_METRICS:
            #             predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]

            #             assert(len(predictors) == 1)

            #             OLS_model = smf.ols(
            #                 formula=f'{predicted_var} ~ {baseline_predictors_str} + {predictors[0]}',
            #                 data=df_tmp_fold
            #             ).fit()

            #             anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

            #             results_horizon_layer_combinations.append({
            #                 "y": predicted_var, 
            #                 "metric": "Information value", 
            #                 "model": model, 
            #                 "layer": layer,
            #                 "layer_idx": layer_idx,
            #                 "horizon": horizon,
            #                 "aggregation": "mean",
            #                 "dist_metric": dist_metric,
            #                 "fold": fold, 
            #                 "loglik": OLS_model.llf,
            #                 "rsquared": OLS_model.rsquared,
            #                 "rsquared_adj": OLS_model.rsquared_adj,
            #                 "aic": OLS_model.aic,
            #                 "bic": OLS_model.bic,
            #                 "delta_loglik": OLS_model.llf - OLS_baseline.llf,
            #                 "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
            #                 "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
            #                 "delta_aic": OLS_model.aic - OLS_baseline.aic,
            #                 "delta_bic": OLS_model.bic - OLS_baseline.bic,
            #                 "anova_rss": anova_results.ssr[1],
            #                 "anova_delta_ss": anova_results.ss_diff[1],
            #                 "anova_p": anova_results['Pr(>F)'][1],
            #                 "surprisal_coef": OLS_baseline.params[f"{model}_surprisal"],
            #                 "ias_coef": OLS_model.params[predictors[0]],
            #             })

results_horizon_layer_combinations_df = pd.DataFrame(results_horizon_layer_combinations)


  0%|          | 0/16 [00:00<?, ?it/s]

f 0.36146465368110464
anova nan
f 0.35546565825046655
anova nan
f 0.37585152275572864
anova nan
f 0.364289308547202
anova nan
f 0.355672290754783
anova nan
f 0.349000039883552
anova nan
f 0.34296388892394297
anova nan
f 0.3360554269085063
anova nan
f 0.3298644989352419
anova nan
f 0.32636824571083334
anova nan
f 0.36454927872301324
anova nan
f 0.3448341677023346
anova nan
f 0.3496124023734903
anova nan
f 0.34036427612445824
anova nan
f 0.3301198271433326
anova nan
f 0.32191691576511944
anova nan
f 0.318100196616814
anova nan
f 0.314149239207099
anova nan
f 0.3100009448035633
anova nan
f 0.30789976293033033
anova nan
f 0.34637103120290114
anova nan
f 0.32895294915173023
anova nan
f 0.3464947862359848
anova nan
f 0.33959119801267157
anova nan
f 0.3312546075360631
anova nan
f 0.32578109437782093
anova nan
f 0.3223141513395691
anova nan
f 0.3185160175585196
anova nan
f 0.3145666013398269
anova nan
f 0.31217064810773404
anova nan
f 0.3352148769357823
anova nan
f 0.3191269399160349
anova nan

KeyboardInterrupt: 

In [11]:
results_horizon_layer_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_horizon_layer_replace_surprisal.csv",
    index=False
)

### Layer-level predictors 

#### Against surprisal baseline

In [48]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_layer_combinations_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models layer-wise
        for layer_idx, layer in enumerate(LAYERS[model]):
            for dist_metric in DISTANCE_METRICS:
                predictors = [p for p in IAS_PREDICTORS if f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

                assert(len(predictors) == 10)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                results_layer_combinations_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": layer,
                    "layer_idx": layer_idx,
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_results.ssr[1],
                    "anova_delta_ss": anova_results.ss_diff[1],
                    "anova_p": anova_results['Pr(>F)'][1],
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })
        
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
                data=df_tmp_fold
            ).fit()     

            for layer_idx, layer in enumerate(LAYERS[model]):
                for dist_metric in DISTANCE_METRICS:
                    
                    predictors = [p for p in IAS_PREDICTORS if f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    assert(len(predictors) == 10)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()
                    
                    results_layer_combinations_df.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": "All",
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": "",
                        "anova_delta_ss": "",
                        "anova_p": ""
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })

       
results_layer_combinations_df = pd.DataFrame(results_layer_combinations_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [49]:
results_layer_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_layer_against_surprisal.csv",
    index=False
)

#### Against control baseline

In [18]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_layer_combinations_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models layer-wise
        for layer_idx, layer in enumerate(LAYERS[model]):
            for dist_metric in DISTANCE_METRICS:
                predictors = [p for p in IAS_PREDICTORS if f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

                assert(len(predictors) == 10)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str}  + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                results_layer_combinations_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": layer,
                    "layer_idx": layer_idx,
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_results.ssr[1],
                    "anova_delta_ss": anova_results.ss_diff[1],
                    "anova_p": anova_results['Pr(>F)'][1],
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })
        
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str}',
                data=df_tmp_fold
            ).fit()     

            for layer_idx, layer in enumerate(LAYERS[model]):
                for dist_metric in DISTANCE_METRICS:
                    
                    predictors = [p for p in IAS_PREDICTORS if f"_L{layer}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    assert(len(predictors) == 10)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()
                    
                    results_layer_combinations_df.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": layer,
                        "layer_idx": layer_idx,
                        "horizon": "All",
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": "",
                        "anova_delta_ss": "",
                        "anova_p": ""
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })

       
results_layer_combinations_df = pd.DataFrame(results_layer_combinations_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [19]:
results_layer_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_layer_against_control.csv",
    index=False
)

### Horizon-level predictors

#### Against surprisal baseline

In [53]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_horizon_combinations_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models layer-wise
        for horizon in range(1, 11):
            for dist_metric in DISTANCE_METRICS:
                predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

                assert(len(predictors) == 13)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                results_horizon_combinations_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": horizon,
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_results.ssr[1],
                    "anova_delta_ss": anova_results.ss_diff[1],
                    "anova_p": anova_results['Pr(>F)'][1],
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })
        
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
                data=df_tmp_fold
            ).fit()     

            for horizon in range(1, 11):
                for dist_metric in DISTANCE_METRICS:
                    
                    predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    assert(len(predictors) == 13)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()
                    
                    results_horizon_combinations_df.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": "All",
                        "layer_idx": "All",
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": "",
                        "anova_delta_ss": "",
                        "anova_p": ""
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })

       
results_horizon_combinations_df = pd.DataFrame(results_horizon_combinations_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [54]:
results_horizon_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_horizon_against_surprisal.csv",
    index=False
)

#### Against control baseline

In [20]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_horizon_combinations_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models layer-wise
        for horizon in range(1, 11):
            for dist_metric in DISTANCE_METRICS:
                predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

                assert(len(predictors) == 13)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str}  + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

                results_horizon_combinations_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": horizon,
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": anova_results.ssr[1],
                    "anova_delta_ss": anova_results.ss_diff[1],
                    "anova_p": anova_results['Pr(>F)'][1],
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })
        
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str}',
                data=df_tmp_fold
            ).fit()     

            for horizon in range(1, 11):
                for dist_metric in DISTANCE_METRICS:
                    
                    predictors = [p for p in IAS_PREDICTORS if f"_H{horizon}_" in p and f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                    assert(len(predictors) == 13)

                    OLS_model = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()
                    
                    results_horizon_combinations_df.append({
                        "y": predicted_var, 
                        "metric": "Information value", 
                        "model": model, 
                        "layer": "All",
                        "layer_idx": "All",
                        "horizon": horizon,
                        "aggregation": "mean",
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_model.llf / OLS_model.nobs,
                        "rsquared": OLS_model.rsquared,
                        "rsquared_adj": OLS_model.rsquared_adj,
                        "aic": OLS_model.aic,
                        "bic": OLS_model.bic,
                        "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                        "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                        "delta_aic": OLS_model.aic - OLS_baseline.aic,
                        "delta_bic": OLS_model.bic - OLS_baseline.bic,
                        "anova_rss": "",
                        "anova_delta_ss": "",
                        "anova_p": ""
                    # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                    # "ias_coef": OLS_model.params[predictors[0]],
                })

       
results_horizon_combinations_df = pd.DataFrame(results_horizon_combinations_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [21]:
results_horizon_combinations_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_horizon_against_control.csv",
    index=False
)

### Full model 

#### Against surprisal baseline

In [60]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_full_model_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models 
        for dist_metric in DISTANCE_METRICS:
            predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

            assert(len(predictors) == 130)

            OLS_model = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                data=df_tmp
            ).fit()

            anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

            results_full_model_df.append({
                "y": predicted_var, 
                "metric": "Information value", 
                "model": model, 
                "layer": "All",
                "layer_idx": "All",
                "horizon": "All",
                "aggregation": "mean",
                "dist_metric": dist_metric,
                "fold": "full", 
                "loglik": OLS_model.llf / OLS_model.nobs,
                "rsquared": OLS_model.rsquared,
                "rsquared_adj": OLS_model.rsquared_adj,
                "aic": OLS_model.aic,
                "bic": OLS_model.bic,
                "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                "delta_aic": OLS_model.aic - OLS_baseline.aic,
                "delta_bic": OLS_model.bic - OLS_baseline.bic,
                "anova_rss": anova_results.ssr[1],
                "anova_delta_ss": anova_results.ss_diff[1],
                "anova_p": anova_results['Pr(>F)'][1],
                # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                # "ias_coef": OLS_model.params[predictors[0]],
            })
    
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
                data=df_tmp_fold
            ).fit()     


            for dist_metric in DISTANCE_METRICS:
                
                predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                assert(len(predictors) == 130)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal + {"+".join(predictors)}',
                    data=df_tmp_fold
                ).fit()
                
                results_full_model_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": fold, 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": "",
                    "anova_delta_ss": "",
                    "anova_p": ""
                # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                # "ias_coef": OLS_model.params[predictors[0]],
            })

       
results_full_model_df = pd.DataFrame(results_full_model_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [61]:
results_full_model_df.to_csv(
    "results/ols_aligned_ias_cosine_std_full_model_against_surprisal.csv",
    index=False
)

#### Against control baseline

In [62]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_full_model_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    for model in MODEL_NAMES:
        
        # ------------------------------------------
        # ANOVA for IAS models
        # ------------------------------------------
        
        # first fit baseline model
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}', 
            data=df_tmp
        ).fit()        
            
        # fit IAS models 
        for dist_metric in DISTANCE_METRICS:
            predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")] 

            assert(len(predictors) == 130)

            OLS_model = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                data=df_tmp
            ).fit()

            anova_results = sm.stats.anova_lm(OLS_baseline, OLS_model)

            results_full_model_df.append({
                "y": predicted_var, 
                "metric": "Information value", 
                "model": model, 
                "layer": "All",
                "layer_idx": "All",
                "horizon": "All",
                "aggregation": "mean",
                "dist_metric": dist_metric,
                "fold": "full", 
                "loglik": OLS_model.llf / OLS_model.nobs,
                "rsquared": OLS_model.rsquared,
                "rsquared_adj": OLS_model.rsquared_adj,
                "aic": OLS_model.aic,
                "bic": OLS_model.bic,
                "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                "delta_aic": OLS_model.aic - OLS_baseline.aic,
                "delta_bic": OLS_model.bic - OLS_baseline.bic,
                "anova_rss": anova_results.ssr[1],
                "anova_delta_ss": anova_results.ss_diff[1],
                "anova_p": anova_results['Pr(>F)'][1],
                # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                # "ias_coef": OLS_model.params[predictors[0]],
            })
    
    
        # ---------------------------------------------------
        # 10-fold bootstrapping for IAS models
        # ---------------------------------------------------
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        kf.get_n_splits(df_tmp) 

        for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
            df_tmp_fold = df_tmp.iloc[split_indices]

            # first fit baseline model including surprisal
            OLS_baseline = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str}', 
                data=df_tmp_fold
            ).fit()     


            for dist_metric in DISTANCE_METRICS:
                
                predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith("Smean")]
                assert(len(predictors) == 130)

                OLS_model = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                    data=df_tmp_fold
                ).fit()
                
                results_full_model_df.append({
                    "y": predicted_var, 
                    "metric": "Information value", 
                    "model": model, 
                    "layer": "All",
                    "layer_idx": "All",
                    "horizon": "All",
                    "aggregation": "mean",
                    "dist_metric": dist_metric,
                    "fold": fold, 
                    "loglik": OLS_model.llf / OLS_model.nobs,
                    "rsquared": OLS_model.rsquared,
                    "rsquared_adj": OLS_model.rsquared_adj,
                    "aic": OLS_model.aic,
                    "bic": OLS_model.bic,
                    "delta_loglik": OLS_model.llf / OLS_model.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "delta_rsquared": OLS_model.rsquared - OLS_baseline.rsquared, 
                    "delta_rsquared_adj": OLS_model.rsquared_adj - OLS_baseline.rsquared_adj, 
                    "delta_aic": OLS_model.aic - OLS_baseline.aic,
                    "delta_bic": OLS_model.bic - OLS_baseline.bic,
                    "anova_rss": "",
                    "anova_delta_ss": "",
                    "anova_p": ""
                # "surprisal_coef": OLS_model.params[f"{model}_surprisal"],
                # "ias_coef": OLS_model.params[predictors[0]],
                })

       
results_full_model_df = pd.DataFrame(results_full_model_df)


  0%|          | 0/16 [00:00<?, ?it/s]

In [63]:
results_full_model_df.to_csv(
    "results/ols_aligned_ias_cosine_std_full_model_against_control.csv",
    index=False
)

### Surprisal together with incremental information value

In [22]:
predicted_variables = ALL_PREDICTED_VARIABLES
all_predictors = ALL_INFORMATION_PREDICTORS

baseline_predictors = BASELINE_PREDICTORS
baseline_predictors_str = " + ".join(baseline_predictors) + " + " + " + ".join([f"{p[0]}:{p[1]}" for p in all_pairs(baseline_predictors)])
    

results_comparison_df = []

for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()
    
    # baseline model
    OLS_baseline = smf.ols(
        formula=f'{predicted_var} ~ {baseline_predictors_str}', 
        data=df_tmp
    ).fit()

    results_comparison_df.append({
        "y": predicted_var, 
        "metric": "Baseline", 
        "model": "", 
        "aggregation": "",
        "dist_metric": "",
        "fold": "full", 
        "loglik": OLS_baseline.llf / OLS_baseline.nobs,
        "delta_loglik": "",
        "rsquared": OLS_baseline.rsquared,
        "delta_rsquared": "",
        "rsquared_adj": OLS_baseline.rsquared_adj,
        "delta_rsquared_adj": "",
        "aic": OLS_baseline.aic,
        "bic": OLS_baseline.bic,
        "anova_p_vs_baseline": "",
        "anova_p_ias_vs_surprisal": "",
        "anova_p_surprisal_vs_ias": "",
    })
    
    for model in MODEL_NAMES:

        OLS_surprisal = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
            data=df_tmp
        ).fit()

        anova_results_surprisal_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_surprisal)

        results_comparison_df.append({
            "y": predicted_var, 
            "metric": "Surprisal", 
            "model": model, 
            "aggregation": "",
            "dist_metric": "",
            "fold": "full", 
            "loglik": OLS_surprisal.llf / OLS_surprisal.nobs,
            "delta_loglik": OLS_surprisal.llf / OLS_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
            "rsquared": OLS_surprisal.rsquared,
            "delta_rsquared": OLS_surprisal.rsquared - OLS_baseline.rsquared,
            "rsquared_adj": OLS_surprisal.rsquared_adj,
            "delta_rsquared_adj": OLS_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
            "aic": OLS_surprisal.aic,
            "bic": OLS_surprisal.bic,
            "anova_p_vs_baseline": anova_results_surprisal_vs_baseline['Pr(>F)'][1],
            "anova_p_ias_vs_surprisal": "",
            "anova_p_surprisal_vs_ias": "",
            "anova_p_both_vs_ias": "",
            "anova_p_both_vs_surprisal": "",
        })
            
        # fit IAS models 
        for dist_metric in DISTANCE_METRICS:
            for aggregation in ["Smean", "Smin"]:
                predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith(aggregation)]

                assert(len(predictors) == 130)

                OLS_ias = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                    data=df_tmp
                ).fit()

                anova_results_ias_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_ias)
                anova_results_ias_vs_surprisal = sm.stats.anova_lm(OLS_surprisal, OLS_ias)
                anova_results_surprisal_vs_ias = sm.stats.anova_lm(OLS_ias, OLS_surprisal)

                results_comparison_df.append({
                    "y": predicted_var, 
                    "metric": f"IAS ({aggregation[1:]})",
                    "model": model, 
                    "aggregation": aggregation,
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_ias.llf / OLS_ias.nobs,
                    "delta_loglik": OLS_ias.llf / OLS_ias.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "rsquared": OLS_ias.rsquared,
                    "delta_rsquared": OLS_ias.rsquared - OLS_baseline.rsquared,
                    "rsquared_adj": OLS_ias.rsquared_adj,
                    "delta_rsquared_adj": OLS_ias.rsquared_adj - OLS_baseline.rsquared_adj,
                    "aic": OLS_ias.aic,
                    "bic": OLS_ias.bic,
                    "anova_p_vs_baseline": anova_results_ias_vs_baseline['Pr(>F)'][1],
                    "anova_p_ias_vs_surprisal": anova_results_ias_vs_surprisal['Pr(>F)'][1],
                    "anova_p_surprisal_vs_ias": anova_results_surprisal_vs_ias['Pr(>F)'][1],
                    "anova_p_both_vs_ias": "",
                    "anova_p_both_vs_surprisal": "",
                })

                OLS_ias_surprisal = smf.ols(
                    formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)} + {model}_surprisal',
                    data=df_tmp
                ).fit()

                anova_results_both_vs_baseline = sm.stats.anova_lm(OLS_baseline, OLS_ias_surprisal)
                anova_results_both_vs_ias = sm.stats.anova_lm(OLS_ias, OLS_ias_surprisal)
                anova_results_both_vs_surprisal = sm.stats.anova_lm(OLS_surprisal, OLS_ias_surprisal)

                results_comparison_df.append({
                    "y": predicted_var, 
                    "metric": f"Surprisal + IAS ({aggregation[1:]})",
                    "model": model, 
                    "aggregation": aggregation,
                    "dist_metric": dist_metric,
                    "fold": "full", 
                    "loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs,
                    "delta_loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                    "rsquared": OLS_ias_surprisal.rsquared,
                    "delta_rsquared": OLS_ias_surprisal.rsquared - OLS_baseline.rsquared,
                    "rsquared_adj": OLS_ias_surprisal.rsquared_adj,
                    "delta_rsquared_adj": OLS_ias_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
                    "aic": OLS_ias_surprisal.aic,
                    "bic": OLS_ias_surprisal.bic,
                    "anova_p_vs_baseline": anova_results_both_vs_baseline['Pr(>F)'][1],
                    "anova_p_ias_vs_surprisal": "",
                    "anova_p_surprisal_vs_ias": "",
                    "anova_p_both_vs_ias": anova_results_both_vs_ias['Pr(>F)'][1],
                    "anova_p_both_vs_surprisal": anova_results_both_vs_surprisal['Pr(>F)'][1],
                })
            
    
    
for predicted_var in tqdm(predicted_variables):

    df_tmp = aligned_norm[[predicted_var] + baseline_predictors + all_predictors].dropna()

    # ---------------------------------------------------
    # 10-fold bootstrapping 
    # ---------------------------------------------------
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    kf.get_n_splits(df_tmp) 

    for fold, (split_indices, _) in enumerate(kf.split(df_tmp)):
        df_tmp_fold = df_tmp.iloc[split_indices]

        # first fit baseline model including surprisal
        OLS_baseline = smf.ols(
            formula=f'{predicted_var} ~ {baseline_predictors_str}', 
            data=df_tmp_fold
        ).fit()     

        results_comparison_df.append({
            "y": predicted_var, 
            "metric": "Baseline", 
            "model": "", 
            "aggregation": "",
            "dist_metric": "",
            "fold": fold, 
            "loglik": OLS_baseline.llf / OLS_baseline.nobs,
            "delta_loglik": "",
            "rsquared": OLS_baseline.rsquared,
            "delta_rsquared": "",
            "rsquared_adj": OLS_baseline.rsquared_adj,
            "delta_rsquared_adj": "",
            "aic": OLS_baseline.aic,
            "bic": OLS_baseline.bic,
            "anova_p_vs_baseline": "",
            "anova_p_ias_vs_surprisal": "",
            "anova_p_surprisal_vs_ias": "",
        })

        for model in MODEL_NAMES:

            OLS_surprisal = smf.ols(
                formula=f'{predicted_var} ~ {baseline_predictors_str} + {model}_surprisal', 
                data=df_tmp_fold
            ).fit()

            results_comparison_df.append({
                "y": predicted_var, 
                "metric": "Surprisal", 
                "model": model, 
                "aggregation": "",
                "dist_metric": "",
                "fold": fold, 
                "loglik": OLS_surprisal.llf / OLS_surprisal.nobs,
                "delta_loglik": OLS_surprisal.llf / OLS_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                "rsquared": OLS_surprisal.rsquared,
                "delta_rsquared": OLS_surprisal.rsquared - OLS_baseline.rsquared,
                "rsquared_adj": OLS_surprisal.rsquared_adj,
                "delta_rsquared_adj": OLS_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
                "aic": OLS_surprisal.aic,
                "bic": OLS_surprisal.bic,
                "anova_p_vs_baseline": "",
                "anova_p_ias_vs_surprisal": "",
                "anova_p_surprisal_vs_ias": "",
                "anova_p_both_vs_ias": "",
                "anova_p_both_vs_surprisal": "",
            })

            # fit IAS models 
            for dist_metric in DISTANCE_METRICS:
                for aggregation in ["Smean", "Smin"]:
                    predictors = [p for p in IAS_PREDICTORS if f"_D{dist_metric}_S" in p and p.startswith(model) and p.endswith(aggregation)]

                    assert(len(predictors) == 130)

                    OLS_ias = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)}',
                        data=df_tmp_fold
                    ).fit()

                    results_comparison_df.append({
                        "y": predicted_var, 
                        "metric": f"IAS ({aggregation[1:]})", 
                        "model": model, 
                        "aggregation": aggregation,
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_ias.llf / OLS_ias.nobs,
                        "delta_loglik": OLS_ias.llf / OLS_ias.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "rsquared": OLS_ias.rsquared,
                        "delta_rsquared": OLS_ias.rsquared - OLS_baseline.rsquared,
                        "rsquared_adj": OLS_ias.rsquared_adj,
                        "delta_rsquared_adj": OLS_ias.rsquared_adj - OLS_baseline.rsquared_adj,
                        "aic": OLS_ias.aic,
                        "bic": OLS_ias.bic,
                        "anova_p_vs_baseline": "",
                        "anova_p_ias_vs_surprisal": "",
                        "anova_p_surprisal_vs_ias": "",
                        "anova_p_both_vs_ias": "",
                        "anova_p_both_vs_surprisal": "",
                    })

                    OLS_ias_surprisal = smf.ols(
                        formula=f'{predicted_var} ~ {baseline_predictors_str} + {"+".join(predictors)} + {model}_surprisal',
                        data=df_tmp_fold
                    ).fit()

                    results_comparison_df.append({
                        "y": predicted_var, 
                        "metric": f"Surprisal + IAS ({aggregation[1:]})",
                        "model": model, 
                        "aggregation": aggregation,
                        "dist_metric": dist_metric,
                        "fold": fold, 
                        "loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs,
                        "delta_loglik": OLS_ias_surprisal.llf / OLS_ias_surprisal.nobs - OLS_baseline.llf / OLS_baseline.nobs,
                        "rsquared": OLS_ias_surprisal.rsquared,
                        "delta_rsquared": OLS_ias_surprisal.rsquared - OLS_baseline.rsquared,
                        "rsquared_adj": OLS_ias_surprisal.rsquared_adj,
                        "delta_rsquared_adj": OLS_ias_surprisal.rsquared_adj - OLS_baseline.rsquared_adj,
                        "aic": OLS_ias_surprisal.aic,
                        "bic": OLS_ias_surprisal.bic,
                        "anova_p_vs_baseline": "",
                        "anova_p_ias_vs_surprisal": "",
                        "anova_p_surprisal_vs_ias": "",
                        "anova_p_both_vs_ias": "",
                        "anova_p_both_vs_surprisal": "",
                    })



results_comparison_df = pd.DataFrame(results_comparison_df)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [23]:
results_comparison_df.to_csv(
    "results_final/ols_aligned_ias_cosine_std_comparison_all.csv",
    index=False
)
