# Correlations: Information Value and Surprisal vs. Reading Times (Provo, Natural Stories, Brown)

In [1]:
### Import and utils
import ast
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from collections import defaultdict, Counter
from scipy.stats import kendalltau, linregress
from tqdm.notebook import tqdm
from typing import List, Dict, Tuple, Optional, Union
from ast import literal_eval
import statsmodels.api as sm
import statsmodels.formula.api as smf


# For local imports
import sys
sys.path.append('../../')
from utils import load_jsonl, load_surprise_data


# Plots
# Configure the seaborn theme in ONE call. `sns.set(...)` re-applies the whole
# theme, so calling it last with only `font_scale` reset the context, style and
# palette selected by the preceding `set_context` / `set_style` / `set_palette`
# calls back to seaborn's defaults. Passing everything together keeps the
# intended "paper" context, "whitegrid" style and "colorblind" palette.
sns.set(context="paper", style="whitegrid", palette="colorblind", font_scale=1.8)

## Load data and preprocess

In [2]:
# Corpus keys iterated by the loading cells; each key is used as a
# sub-directory name under the data folders.
DATASETS = ['brown', 'ns', 'provo']

# Corpus key -> relative path of a JSONL file for that corpus.
# NOTE(review): not referenced in the visible part of this notebook — confirm
# whether it is still needed.
DATASET_CONTEXT_MAP = {
    'switchboard': 'switchboard/switchboard_results_is.jsonl',
    'dailydialog': 'dailydialog/dailydialog_results_is.jsonl',
    'BLL2018': 'BLL2018/processed_ratings.jsonl',
    'brown': 'RTs_5jun/brown_rt.jsonl',
    'ns': 'RTs_5jun/ns_rt.jsonl',
    'provo': 'RTs_5jun/provo_rt.jsonl',
}

# Corpus key -> display name stored in the `corpus` column.
CORPUS_NAMES = {
    'switchboard': 'Switchboard',
    'dailydialog': 'Dailydialog',
    'BLL2018': 'Clasp',
    'provo': 'Provo',
    'brown': 'Brown',
    'ns': 'Natural Stories',
}

# Model identifier -> display name stored in the `model` column.
# Both underscore and hyphen spellings of an identifier map to the same name.
MODEL_NAME_MAP = {
    'gpt2': 'GPT-2 Small',
    'gpt2-ft': 'GPT-2 Small Finetuned', # todo: remove
    'gpt2_medium': 'GPT-2 Medium',
    'gpt2-medium': 'GPT-2 Medium',
    'gpt2_large': 'GPT-2 Large',
    'gpt2-large': 'GPT-2 Large',
    'dialogpt_small': 'DialoGPT Small',
    'dialogpt-small': 'DialoGPT Small',
    'dialogpt_medium': 'DialoGPT Medium',
    'dialogpt-medium': 'DialoGPT Medium',
    'dialogpt_large': 'DialoGPT Large',
    'dialogpt-large': 'DialoGPT Large',
    'opt_125m': 'OPT 125M',
    'opt-125m': 'OPT 125M',
    'opt_350m': 'OPT 350M',
    'opt-350m': 'OPT 350M',
    'opt_1.3b': 'OPT 1.3B',
    'opt-1.3b': 'OPT 1.3B',
    'gpt_neo_125m': 'GPT-Neo 125M',
    'gpt-neo-125m': 'GPT-Neo 125M',
    'gpt_neo_1.3b': 'GPT-Neo 1.3B',
    'gpt-neo-1.3b': 'GPT-Neo 1.3B',
    'gpt-neo-1.3B': 'GPT-Neo 1.3B',
}

# Sampling-parameter code (digits without the decimal point) -> readable
# decimal string; "None" is passed through unchanged.
SAMPLING_PARAM_MAP = {
    "None": "None", "02": "0.2", "03": "0.3", "075": "0.75", "08": "0.8",
    "085": "0.85", "09": "0.9", "095": "0.95", "125": "1.25"
}

# Columns holding per-token surprisal estimates that are parsed and aggregated
# by `load_surprisal_data`; the commented-out names are currently disabled.
SURPRISAL_COLUMNS = [
    'in_context_surprisal', #'out_of_context_surprisal', 'in_context_surprisal_rnd',
       # 'in_context_entropies', 'out_of_context_entropies', 'in_context_entropies_rnd',
       # 'in_context_deviations', 'out_of_context_deviations', 'in_context_deviations_rnd',
]


In [3]:
### Load data
# Load the information-value ("surprise") estimates of every corpus, prettify
# the categorical columns and stack all corpora into one frame.
dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprise/{dataset}"

    df = load_surprise_data(DATA_DIR)

    # Prettify column and value names
    df["corpus"] = df["corpus"].map(CORPUS_NAMES)
    df["sampling"] = df["sampling"].str.capitalize()
    df["sampling"] = df["sampling"].str.replace("Temp", "Temperature")

    df["model"] = df["model"].map(MODEL_NAME_MAP)

    # Use the shared lookup table rather than re-declaring an identical
    # dictionary on every iteration.
    df["sampling_param"] = df["sampling_param"].map(SAMPLING_PARAM_MAP)

    dfs.append(df)

surprise = pd.concat(dfs, axis=0, ignore_index=True)

# Set unique data point id for grouping.
# Vectorised string conversion: same values as formatting `context_id` row by
# row, without a Python-level pass over every row of the frame.
surprise["item_id"] = surprise["context_id"].astype(str)

# Merge sampling and sampling param column in surprise
surprise['sampling'] = surprise['sampling'].astype(str)
surprise['sampling_param'] = surprise['sampling_param'].astype(str)
surprise['sampling_strategy'] = surprise['sampling'] + '_' + surprise['sampling_param']


Loading data for brown...
Size of dataset: 396000 rows
Loading data for ns...
Size of dataset: 377720 rows
Loading data for provo...
Size of dataset: 119680 rows


In [4]:
# Exponents used for the power-sum aggregates: 0.5, 0.75, ..., 5.0.
# The stop value 5.01 is slightly above 5.0 so that 5.0 itself is included.
POWER_RANGE = np.arange(0.5, 5.01, 0.25)

def local_diff(x):
    """Sum of absolute differences between consecutive elements of ``x`` (from RUID).

    Returns 0 when ``x`` has fewer than two elements.
    """
    last_start = len(x) - 1
    return sum(abs(x[pos + 1] - x[pos]) for pos in range(last_start))

def power(x, y):
    """Sum of the elements of ``x`` raised to the power ``y``, ignoring NaNs (from RUID).

    # Arguments:
        x: sequence or array of numbers (a plain Python list is accepted).
        y: exponent applied element-wise.
    # Returns:
        ``np.nansum(x ** y)``; NaN terms contribute 0 to the sum.
    """
    # Convert first: `list ** y` is only defined when `y` is a NumPy scalar, so
    # a plain Python exponent used to raise TypeError. `asanyarray` passes
    # ndarray subclasses (e.g. masked arrays) through unchanged.
    values = np.asanyarray(x)
    return np.nansum(values ** y)

def _parse_surprisal_string(s):
    """Parse a whitespace-separated, bracketed string of numbers into a list."""
    stripped_tokens = (token.strip(' []') for token in s.split())
    return [ast.literal_eval(token) for token in stripped_tokens if token]


def load_surprisal_data(data_dir, columns, surprisal_columns):
    """
    Load data from a directory of CSV files containing surprisal estimates.

    Every ``*.csv`` file in ``data_dir`` is read, tagged with a ``corpus`` and a
    ``model`` column, and concatenated. The name of ``data_dir`` must be a key
    of ``CORPUS_NAMES`` and each file name (without ``.csv``) a key of
    ``MODEL_NAME_MAP``. For every column whose name contains ``"in_"`` or
    ``"out_"``, sequence-level aggregates are added as new columns
    (``<col>_agg_sum``, ``_agg_variance``, ``_agg_max``, ``_agg_mean``,
    ``_agg_ldiff`` and one ``_agg_power_<p>`` per value in ``POWER_RANGE``).

    # Arguments:
        data_dir (str): Path to directory containing CSV files.
        columns (list): columns to load from CSV files. If None, all columns are loaded.
        surprisal_columns (list): columns to format with ast (ugly but fine for now)
    # Returns:
        pandas.DataFrame with the loaded rows and the aggregate columns.
    # Raises:
        FileNotFoundError: if ``data_dir`` contains no ``.csv`` file.
        KeyError: if the directory or a file name has no entry in the name maps.
    """
    # The corpus key is the name of the directory holding the CSV files.
    corpus_key = os.path.basename(os.path.normpath(data_dir))

    frames = []
    # `os.listdir` returns entries in arbitrary order; sort for reproducible row order.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(data_dir, file), usecols=columns)

        # Store with fields
        df['corpus'] = CORPUS_NAMES[corpus_key]
        df['model'] = MODEL_NAME_MAP[os.path.splitext(file)[0]]

        # literal eval
        for column in surprisal_columns:
            df[column] = df[column].apply(_parse_surprisal_string)

        frames.append(df)

    if not frames:
        raise FileNotFoundError(f"No .csv files found in {data_dir!r}")

    # Concatenate once. The previous `"data" in locals()` test was always true
    # (the name was pre-assigned to None), so the first concat silently
    # received a None entry. Row indices of the individual files are kept.
    data = pd.concat(frames)

    print(f"Size of dataset: {data.shape} rows, cols")

    # Add sequence surprisal aggregates
    surprise_cols = [c for c in data.columns if "in_" in c or "out_" in c]
    for col in surprise_cols:
        data[f'{col}_agg_sum'] = data[col].apply(np.sum)
        data[f'{col}_agg_variance'] = data[col].apply(np.var)
        data[f'{col}_agg_max'] = data[col].apply(np.amax)
        data[f'{col}_agg_mean'] = data[col].apply(np.mean)
        data[f'{col}_agg_ldiff'] = data[col].apply(local_diff)
        for p in POWER_RANGE:
            # Bind `p` as a default so the lambda does not depend on the loop variable.
            data[f'{col}_agg_power_' + str(p).replace('.', '_')] = data[col].apply(
                lambda x, p=p: power(x, p)
            )

    print(f"Size of dataset: {data.shape} rows, cols")
    return data


In [5]:
# Load the surprisal data
# Columns read from every CSV besides the surprisal columns; the names after
# the inline `#` are currently disabled.
cols = [
    'context_id', 'time_mean_NO'#, 'judgements', 'mean_acceptability', 'median_acceptability', 'real'
]
all_cols = cols + SURPRISAL_COLUMNS

# One frame per corpus (each already carries the aggregate columns added by
# `load_surprisal_data`), stacked into a single frame afterwards.
all_surprisal_dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprisal/{dataset}"

    surprisal_df = load_surprisal_data(DATA_DIR, all_cols, SURPRISAL_COLUMNS)
    all_surprisal_dfs.append(surprisal_df)

# `ignore_index=True` gives the combined frame a fresh 0..n-1 index.
surprisal = pd.concat(all_surprisal_dfs, axis=0, ignore_index=True)


Loading data for brown...
Size of dataset: (3600, 5) rows, cols
Size of dataset: (3600, 29) rows, cols
Loading data for ns...
Size of dataset: (3976, 5) rows, cols
Size of dataset: (3976, 29) rows, cols
Loading data for provo...
Size of dataset: (1088, 5) rows, cols
Size of dataset: (1088, 29) rows, cols


In [6]:
# Number of rows in the `surprise` frame per row of the `surprisal` frame.
len(surprise)/len(surprisal)

103.11634349030471

## Measure correlations

In [7]:
# Spearman correlation between every predictor and the mean reading time
# (`time_mean_NO`), computed separately for each configuration.
#
# The former `assert(len(_df) == 100, "...")` checks asserted a parenthesised,
# non-empty tuple, which is always true (Python echoes a warning for it), so
# they never validated anything. The hard-coded 100 is not enforced here
# because subset sizes appear to be corpus-dependent (presumably a leftover
# from another dataset — TODO confirm). Empty subsets are recorded with NaN
# instead of being passed to `spearmanr`.
corrs = []

# Aggregate-column suffixes created by `load_surprisal_data` (loop-invariant).
agg_names = ['agg_sum', 'agg_variance', 'agg_max', 'agg_mean', 'agg_ldiff'] + [
    f"agg_power_{str(p).replace('.', '_')}" for p in POWER_RANGE
]
surprisal_models = surprisal.model.unique()

for corpus in tqdm(surprisal.corpus.unique()):
    for model in surprisal_models:
        # The subset depends only on (corpus, model): select it once instead
        # of once per measure/aggregate combination.
        _df = surprisal[
            (surprisal['model'] == model) &
            (surprisal['corpus'] == corpus)
        ]

        for measure in SURPRISAL_COLUMNS:
            for agg in agg_names:

                if measure == 'mutual_info':
                    measure_name = measure
                else:
                    measure_name = f'{measure}_{agg}'

                if _df.empty:
                    rho, p = np.nan, np.nan
                else:
                    rho, p = spearmanr(_df[measure_name], _df['time_mean_NO'], nan_policy="omit")

                corrs.append({
                    'corpus': corpus,
                    'model': model,
                    'n_samples': 'None',
                    'sampling': 'None',
                    'measure': measure,
                    'agg': agg,
                    'corr': rho,
                    'p-value': p,
                    'type': 'surprisal'
                })


iv_measures = [
    'surprise_mean_1gram', 'surprise_mean_2gram', 'surprise_mean_3gram',
    'surprise_mean_1gram_pos', 'surprise_mean_2gram_pos', 'surprise_mean_3gram_pos',
    'surprise_mean_cosine', 'surprise_mean_euclidean',
    'surprise_min_1gram', 'surprise_min_2gram', 'surprise_min_3gram',
    'surprise_min_1gram_pos', 'surprise_min_2gram_pos', 'surprise_min_3gram_pos',
    'surprise_min_cosine', 'surprise_min_euclidean'
]

# Loop-invariant value lists, computed once.
iv_models = surprise.model.unique()
iv_n_samples = surprise.n_samples.unique()
iv_samplings = surprise.sampling_strategy.unique()

for corpus in surprise.corpus.unique():
    # Narrow the frame level by level so each inner filter scans a smaller
    # frame; the final `_df` holds the same rows, in the same order, as
    # filtering the full frame on all four conditions at once.
    corpus_rows = surprise[surprise['corpus'] == corpus]
    for model in iv_models:
        print(model)
        model_rows = corpus_rows[corpus_rows['model'] == model]
        for n_samples in tqdm(iv_n_samples):
            n_rows = model_rows[model_rows['n_samples'] == n_samples]
            for sampling in iv_samplings:
                _df = n_rows[n_rows['sampling_strategy'] == sampling]

                for measure in iv_measures:
                    if _df.empty:
                        rho, p = np.nan, np.nan
                    else:
                        rho, p = spearmanr(_df[measure], _df['time_mean_NO'], nan_policy="omit")

                    corrs.append({
                        'corpus': corpus,
                        'model': model,
                        'n_samples': n_samples,
                        'sampling': sampling,
                        'measure': measure,
                        'agg': 'None',
                        'corr': rho,
                        'p-value': p,
                        'type': 'iv'
                    })

corrs_df = pd.DataFrame(corrs)


  assert(len(_df) == 100, f"Expected 100 rows, got {len(_df)}")


  0%|          | 0/3 [00:00<?, ?it/s]

GPT-2 Large


  assert(len(_df) == 100, f"Expected 100 rows, got {len(_df)}")


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 125M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Small


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 350M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Medium


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 125M


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Large


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 125M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Small


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 350M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Medium


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 125M


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Large


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 125M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Small


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 350M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Medium


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 125M


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

In [52]:
# Save dataframe
# Write to a path relative to the notebook instead of a user-specific absolute
# path, so the cell also runs on other machines.
# NOTE(review): assumes the kernel's working directory is the notebook folder
# (the other data paths in this notebook are relative as well) — confirm.
OUTPUT_DIR = "dataframes"
os.makedirs(OUTPUT_DIR, exist_ok=True)
corrs_df.to_csv(os.path.join(OUTPUT_DIR, "correlations_rt.csv"), index=False)

In [9]:
## Show rankings

In [10]:
# Provo, information value: the 20 lowest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Provo") & corrs_df["type"].eq("iv")]
    .sort_values(by="corr", ascending=True)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
32793,Provo,GPT-Neo 1.3B,40,Typical_0.95,surprise_min_2gram,,-0.04623,0.593035,iv
33145,Provo,GPT-Neo 1.3B,60,Typical_0.95,surprise_min_2gram,,-0.041634,0.630334,iv
33977,Provo,GPT-Neo 1.3B,100,Typical_0.3,surprise_min_2gram,,-0.035625,0.680524,iv
35881,Provo,OPT 350M,10,Nucleus_0.9,surprise_min_2gram,,-0.026317,0.761031,iv
32617,Provo,GPT-Neo 1.3B,30,Typical_0.95,surprise_min_2gram,,-0.025358,0.769491,iv
35802,Provo,OPT 350M,10,Temperature_1.25,surprise_min_3gram,,-0.023986,0.781642,iv
35794,Provo,OPT 350M,10,Temperature_1.25,surprise_mean_3gram,,-0.023742,0.783804,iv
32969,Provo,GPT-Neo 1.3B,50,Typical_0.95,surprise_min_2gram,,-0.016997,0.844293,iv
39374,Provo,OPT 125M,10,Nucleus_0.85,surprise_min_cosine,,-0.013401,0.876942,iv
39375,Provo,OPT 125M,10,Nucleus_0.85,surprise_min_euclidean,,-0.013401,0.876942,iv


In [11]:
# Provo, information value: the 20 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Provo") & corrs_df["type"].eq("iv")]
    .sort_values(by="corr", ascending=False)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
34141,Provo,GPT-2 Small,10,Nucleus_0.95,surprise_min_3gram_pos,,0.420734,3.39605e-07,iv
39429,Provo,OPT 125M,10,Typical_0.3,surprise_mean_3gram_pos,,0.396429,1.773201e-06,iv
34133,Provo,GPT-2 Small,10,Nucleus_0.95,surprise_mean_3gram_pos,,0.392141,2.342071e-06,iv
39437,Provo,OPT 125M,10,Typical_0.3,surprise_min_3gram_pos,,0.389302,2.809909e-06,iv
38469,Provo,GPT-2 Medium,60,Temperature_0.75,surprise_mean_3gram_pos,,0.382382,4.349235e-06,iv
39432,Provo,OPT 125M,10,Typical_0.3,surprise_min_1gram,,0.379163,5.311087e-06,iv
38645,Provo,GPT-2 Medium,70,Temperature_0.75,surprise_mean_3gram_pos,,0.378522,5.525353e-06,iv
28829,Provo,GPT-2 Large,10,Nucleus_0.8,surprise_min_3gram_pos,,0.37833,5.591134e-06,iv
41805,Provo,OPT 1.3B,50,Typical_0.2,surprise_min_3gram_pos,,0.376833,6.129758e-06,iv
38285,Provo,GPT-2 Medium,50,Typical_0.2,surprise_min_3gram_pos,,0.376482,6.263017e-06,iv


In [12]:
# Provo, surprisal aggregates: the 20 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Provo") & corrs_df["type"].eq("surprisal")]
    .sort_values(by="corr", ascending=False)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
529,Provo,GPT-Neo 125M,,,in_context_surprisal,agg_variance,0.495325,8.758864e-10,surprisal
385,Provo,GPT-2 Medium,,,in_context_surprisal,agg_variance,0.481103,3.054629e-09,surprisal
531,Provo,GPT-Neo 125M,,,in_context_surprisal,agg_mean,0.470908,7.225927e-09,surprisal
387,Provo,GPT-2 Medium,,,in_context_surprisal,agg_mean,0.459153,1.884242e-08,surprisal
555,Provo,OPT 125M,,,in_context_surprisal,agg_mean,0.458556,1.97623e-08,surprisal
483,Provo,GPT-Neo 1.3B,,,in_context_surprisal,agg_mean,0.454525,2.721084e-08,surprisal
459,Provo,OPT 350M,,,in_context_surprisal,agg_mean,0.448857,4.236598e-08,surprisal
435,Provo,GPT-2 Small,,,in_context_surprisal,agg_mean,0.446167,5.212939e-08,surprisal
411,Provo,OPT 1.3B,,,in_context_surprisal,agg_mean,0.445861,5.336466e-08,surprisal
505,Provo,GPT-2 Large,,,in_context_surprisal,agg_variance,0.441248,7.580989e-08,surprisal


In [13]:
# Natural Stories, information value: the 20 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Natural Stories") & corrs_df["type"].eq("iv")]
    .sort_values(by="corr", ascending=False)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
17153,Natural Stories,GPT-Neo 125M,50,Typical_0.2,surprise_mean_2gram,,0.309871,1.602066e-12,iv
16977,Natural Stories,GPT-Neo 125M,40,Typical_0.2,surprise_mean_2gram,,0.307502,2.408093e-12,iv
17505,Natural Stories,GPT-Neo 125M,70,Typical_0.2,surprise_mean_2gram,,0.306789,2.720287e-12,iv
18192,Natural Stories,GPT-Neo 1.3B,10,Temperature_1.25,surprise_mean_1gram,,0.305824,3.206338e-12,iv
18033,Natural Stories,GPT-Neo 125M,100,Typical_0.2,surprise_mean_2gram,,0.304159,4.252988e-12,iv
18368,Natural Stories,GPT-Neo 1.3B,20,Temperature_1.25,surprise_mean_1gram,,0.304127,4.276135e-12,iv
17488,Natural Stories,GPT-Neo 125M,70,Temperature_1.25,surprise_mean_1gram,,0.303965,4.394579e-12,iv
17329,Natural Stories,GPT-Neo 125M,60,Typical_0.2,surprise_mean_2gram,,0.30374,4.564679e-12,iv
16608,Natural Stories,GPT-Neo 125M,20,Temperature_1.25,surprise_mean_1gram,,0.302789,5.357964e-12,iv
17857,Natural Stories,GPT-Neo 125M,90,Typical_0.2,surprise_mean_2gram,,0.302758,5.385915e-12,iv


In [14]:
# Natural Stories, surprisal aggregates: the 20 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Natural Stories") & corrs_df["type"].eq("surprisal")]
    .sort_values(by="corr", ascending=False)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
339,Natural Stories,GPT-Neo 125M,,,in_context_surprisal,agg_mean,0.392923,8.51689e-20,surprisal
291,Natural Stories,GPT-Neo 1.3B,,,in_context_surprisal,agg_mean,0.390716,1.419814e-19,surprisal
243,Natural Stories,GPT-2 Small,,,in_context_surprisal,agg_mean,0.379489,1.802647e-18,surprisal
315,Natural Stories,GPT-2 Large,,,in_context_surprisal,agg_mean,0.37175,9.822118e-18,surprisal
195,Natural Stories,GPT-2 Medium,,,in_context_surprisal,agg_mean,0.360891,9.827840000000001e-17,surprisal
363,Natural Stories,OPT 125M,,,in_context_surprisal,agg_mean,0.322905,1.593011e-13,surprisal
267,Natural Stories,OPT 350M,,,in_context_surprisal,agg_mean,0.302556,5.572388e-12,surprisal
219,Natural Stories,OPT 1.3B,,,in_context_surprisal,agg_mean,0.295432,1.814631e-11,surprisal
337,Natural Stories,GPT-Neo 125M,,,in_context_surprisal,agg_variance,0.232546,1.577692e-07,surprisal
289,Natural Stories,GPT-Neo 1.3B,,,in_context_surprisal,agg_variance,0.230097,2.142759e-07,surprisal


In [15]:
# Brown, information value: the 80 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Brown") & corrs_df["type"].eq("iv")]
    .sort_values(by="corr", ascending=False)
    .head(80)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
7401,Brown,GPT-2 Small,90,Typical_0.3,surprise_min_2gram,,0.223212,0.000002,iv
3017,Brown,GPT-Neo 125M,40,Ancestral_None,surprise_min_2gram,,0.221030,0.000002,iv
7577,Brown,GPT-2 Small,100,Typical_0.3,surprise_min_2gram,,0.220205,0.000002,iv
6873,Brown,GPT-2 Small,60,Typical_0.3,surprise_min_2gram,,0.218814,0.000003,iv
7225,Brown,GPT-2 Small,80,Typical_0.3,surprise_min_2gram,,0.206637,0.000010,iv
...,...,...,...,...,...,...,...,...,...
11433,Brown,OPT 125M,20,Nucleus_0.95,surprise_min_2gram,,0.174542,0.000199,iv
10377,Brown,GPT-2 Medium,60,Nucleus_0.95,surprise_min_2gram,,0.174460,0.000200,iv
1161,Brown,GPT-2 Large,40,Temperature_0.75,surprise_min_2gram,,0.174305,0.000203,iv
7977,Brown,OPT 350M,30,Typical_0.95,surprise_min_2gram,,0.173933,0.000209,iv


In [16]:
# Brown, surprisal aggregates: the 20 highest correlations.
(
    corrs_df
    .loc[corrs_df["corpus"].eq("Brown") & corrs_df["type"].eq("surprisal")]
    .sort_values(by="corr", ascending=False)
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
99,Brown,GPT-Neo 1.3B,,,in_context_surprisal,agg_mean,0.219881,2e-06,surprisal
51,Brown,GPT-2 Small,,,in_context_surprisal,agg_mean,0.216187,4e-06,surprisal
147,Brown,GPT-Neo 125M,,,in_context_surprisal,agg_mean,0.212365,6e-06,surprisal
3,Brown,GPT-2 Medium,,,in_context_surprisal,agg_mean,0.199709,2e-05,surprisal
123,Brown,GPT-2 Large,,,in_context_surprisal,agg_mean,0.193166,3.7e-05,surprisal
171,Brown,OPT 125M,,,in_context_surprisal,agg_mean,0.18827,5.9e-05,surprisal
75,Brown,OPT 350M,,,in_context_surprisal,agg_mean,0.163418,0.000501,surprisal
170,Brown,OPT 125M,,,in_context_surprisal,agg_max,0.149453,0.001476,surprisal
27,Brown,OPT 1.3B,,,in_context_surprisal,agg_mean,0.146486,0.001836,surprisal
74,Brown,OPT 350M,,,in_context_surprisal,agg_max,0.134265,0.004328,surprisal


## Find best estimator per corpus and linguistic level

In [42]:
# Provo: top 2 IV measures whose name contains none of
# "cosine", "euclidean" or "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Provo")
        & corrs_df["type"].eq("iv")
        & ~corrs_df["measure"].str.contains("cosine|euclidean|pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
39432,Provo,OPT 125M,10,Typical_0.3,surprise_min_1gram,,0.379163,5e-06,iv
37584,Provo,GPT-2 Medium,10,Temperature_0.75,surprise_mean_1gram,,0.374863,7e-06,iv


In [43]:
# Provo: top 2 IV measures whose name contains "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Provo")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
34141,Provo,GPT-2 Small,10,Nucleus_0.95,surprise_min_3gram_pos,,0.420734,3.39605e-07,iv
39429,Provo,OPT 125M,10,Typical_0.3,surprise_mean_3gram_pos,,0.396429,1.773201e-06,iv


In [44]:
# Provo: top 2 IV measures whose name contains "cosine" or "euclidean".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Provo")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("cosine|euclidean")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
41007,Provo,OPT 125M,100,Nucleus_0.95,surprise_min_euclidean,,0.181198,0.034762,iv
41006,Provo,OPT 125M,100,Nucleus_0.95,surprise_min_cosine,,0.181198,0.034762,iv


In [51]:
# Natural Stories: top 2 IV measures whose name contains none of
# "cosine", "euclidean" or "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Natural Stories")
        & corrs_df["type"].eq("iv")
        & ~corrs_df["measure"].str.contains("cosine|euclidean|pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
17153,Natural Stories,GPT-Neo 125M,50,Typical_0.2,surprise_mean_2gram,,0.309871,1.602066e-12,iv
16977,Natural Stories,GPT-Neo 125M,40,Typical_0.2,surprise_mean_2gram,,0.307502,2.408093e-12,iv


In [40]:
# Natural Stories: top 2 IV measures whose name contains "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Natural Stories")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
16507,Natural Stories,GPT-Neo 125M,10,Nucleus_0.8,surprise_min_1gram_pos,,0.261986,3.033707e-09,iv
25707,Natural Stories,OPT 125M,30,Typical_0.3,surprise_min_1gram_pos,,0.259018,4.623416e-09,iv


In [45]:
# Natural Stories: top 2 IV measures whose name contains "cosine" or "euclidean".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Natural Stories")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("cosine|euclidean")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
23391,Natural Stories,OPT 350M,100,Nucleus_0.9,surprise_min_euclidean,,0.147226,0.000995,iv
23390,Natural Stories,OPT 350M,100,Nucleus_0.9,surprise_min_cosine,,0.147226,0.000995,iv


In [46]:
# Brown: top 2 IV measures whose name contains none of
# "cosine", "euclidean" or "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Brown")
        & corrs_df["type"].eq("iv")
        & ~corrs_df["measure"].str.contains("cosine|euclidean|pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
7401,Brown,GPT-2 Small,90,Typical_0.3,surprise_min_2gram,,0.223212,2e-06,iv
3017,Brown,GPT-Neo 125M,40,Ancestral_None,surprise_min_2gram,,0.22103,2e-06,iv


In [47]:
# Brown: top 2 IV measures whose name contains "pos".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Brown")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("pos")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
9509,Brown,GPT-2 Medium,10,Typical_0.3,surprise_mean_3gram_pos,,0.18484,8.6e-05,iv
10827,Brown,GPT-2 Medium,90,Typical_0.2,surprise_min_1gram_pos,,0.175202,0.000188,iv


In [50]:
# Brown: top 2 IV measures whose name contains "cosine" or "euclidean".
(
    corrs_df
    .loc[
        corrs_df["corpus"].eq("Brown")
        & corrs_df["type"].eq("iv")
        & corrs_df["measure"].str.contains("cosine|euclidean")
    ]
    .sort_values(by="corr", ascending=False)
    .head(2)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
4046,Brown,GPT-Neo 125M,100,Nucleus_0.95,surprise_min_cosine,,0.048443,0.305186,iv
4047,Brown,GPT-Neo 125M,100,Nucleus_0.95,surprise_min_euclidean,,0.048443,0.305186,iv
