# Correlations: Information Value and Surprisal vs. Acceptability (Clasp)

In [1]:
### Import and utils
import ast
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from collections import defaultdict, Counter
from scipy.stats import kendalltau, linregress
from tqdm.notebook import tqdm
from typing import List, Dict, Tuple, Optional, Union
from ast import literal_eval
import statsmodels.api as sm
import statsmodels.formula.api as smf


# For local imports
import sys
sys.path.append('../../')
from utils import load_jsonl, load_surprise_data


# Plots
# Configure the theme in a single call. `sns.set(...)` re-applies seaborn's
# defaults for every argument that is not passed, so calling set_context /
# set_style / set_palette first and `sns.set(font_scale=1.8)` afterwards
# silently discarded the "paper" context, "whitegrid" style and "colorblind"
# palette that were requested.
sns.set(context="paper", style="whitegrid", palette="colorblind", font_scale=1.8)

## Load data and preprocess

In [2]:
# Dataset keys to process. Each key is interpolated into the data directory
# paths used by the loading cells of this notebook.
DATASETS = ['BLL2018']

# Dataset key -> relative path of a per-dataset JSONL file.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
DATASET_CONTEXT_MAP = {
    'switchboard': 'switchboard/switchboard_results_is.jsonl',
    'dailydialog': 'dailydialog/dailydialog_results_is.jsonl',
    'BLL2018': 'BLL2018/processed_ratings.jsonl',
    'brown': 'RTs/brown_rt.jsonl',
    'ns': 'RTs/ns_rt.jsonl',
    'provo': 'RTs/provo_rt.jsonl',
}

# Dataset key -> display name stored in the `corpus` column.
CORPUS_NAMES = {
    'switchboard': 'Switchboard',
    'dailydialog': 'Dailydialog',
    'BLL2018': 'Clasp',
    'provo': 'Provo',
    'brown': 'Brown',
    'ns': 'Natural Stories',
}

# Raw model identifier -> display name stored in the `model` column.
# Both underscore and hyphen spellings are listed so that either form of a
# raw identifier resolves to the same display name.
MODEL_NAME_MAP = {
    'gpt2': 'GPT-2 Small',
    'gpt2-ft': 'GPT-2 Small Finetuned', # todo: remove
    'gpt2_medium': 'GPT-2 Medium',
    'gpt2-medium': 'GPT-2 Medium',
    'gpt2_large': 'GPT-2 Large',
    'gpt2-large': 'GPT-2 Large',
    'dialogpt_small': 'DialoGPT Small',
    'dialogpt-small': 'DialoGPT Small',
    'dialogpt_medium': 'DialoGPT Medium',
    'dialogpt-medium': 'DialoGPT Medium',
    'dialogpt_large': 'DialoGPT Large',
    'dialogpt-large': 'DialoGPT Large',
    'opt_125m': 'OPT 125M',
    'opt-125m': 'OPT 125M',
    'opt_350m': 'OPT 350M',
    'opt-350m': 'OPT 350M',
    'opt_1.3b': 'OPT 1.3B',
    'opt-1.3b': 'OPT 1.3B',
    'gpt_neo_125m': 'GPT-Neo 125M',
    'gpt-neo-125m': 'GPT-Neo 125M',
    'gpt_neo_1.3b': 'GPT-Neo 1.3B',
    'gpt-neo-1.3b': 'GPT-Neo 1.3B',
    'gpt-neo-1.3B': 'GPT-Neo 1.3B',
}

# Encoded sampling-parameter string -> decimal string ("075" -> "0.75").
# "None" maps to itself.
SAMPLING_PARAM_MAP = {
    "None": "None", "02": "0.2", "03": "0.3", "075": "0.75", "08": "0.8",
    "085": "0.85", "09": "0.9", "095": "0.95", "125": "1.25"
}

# Columns holding per-item surprisal sequences; these are the columns that get
# parsed, aggregated and correlated. The commented-out names are alternative
# measures that are currently disabled.
SURPRISAL_COLUMNS = [
    'in_context_surprisal', #'out_of_context_surprisal', 'in_context_surprisal_rnd',
       # 'in_context_entropies', 'out_of_context_entropies', 'in_context_entropies_rnd',
       # 'in_context_deviations', 'out_of_context_deviations', 'in_context_deviations_rnd',
]


In [3]:
# Load the information-value ("surprise") tables for every dataset and replace
# raw identifiers with display names.
dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprise/{dataset}"

    df = load_surprise_data(DATA_DIR)

    # Series.map turns every value missing from the mapping into NaN, and the
    # astype(str) calls further down would then hide it as the string "nan".
    # Report such values instead of losing them silently.
    for column, mapping in (
        ("corpus", CORPUS_NAMES),
        ("model", MODEL_NAME_MAP),
        ("sampling_param", SAMPLING_PARAM_MAP),
    ):
        unmapped = set(df[column].dropna().unique()) - set(mapping)
        if unmapped:
            print(f"Warning: unmapped {column} values become NaN: {sorted(map(str, unmapped))}")

    # Prettify column and value names
    df["corpus"] = df["corpus"].map(CORPUS_NAMES)
    df["sampling"] = df["sampling"].str.capitalize()
    # Literal (non-regex) replacement; stated explicitly because the default
    # of `regex` differs between pandas versions.
    df["sampling"] = df["sampling"].str.replace("Temp", "Temperature", regex=False)

    df["model"] = df["model"].map(MODEL_NAME_MAP)

    # Use the notebook-level SAMPLING_PARAM_MAP instead of rebuilding an
    # identical dictionary on every iteration.
    df["sampling_param"] = df["sampling_param"].map(SAMPLING_PARAM_MAP)

    dfs.append(df)

surprise = pd.concat(dfs, axis=0, ignore_index=True)

# Merge sampling and sampling param column in surprise
surprise['sampling'] = surprise['sampling'].astype(str)
surprise['sampling_param'] = surprise['sampling_param'].astype(str)
surprise['sampling_strategy'] = surprise['sampling'] + '_' + surprise['sampling_param']


Loading data for BLL2018...
Size of dataset: 440000 rows


In [4]:
# Exponents for the power-sum aggregates: 0.5, 0.75, ..., 5.0. The stop value
# 5.01 keeps 5.0 inside np.arange's half-open interval. Aggregate column names
# are built from str(p), so the exact float values matter.
POWER_RANGE = np.arange(0.5, 5.01, 0.25)

def local_diff(x):
    """
    Total absolute change between consecutive elements of a sequence
    (original note: "from RUID").

    # Arguments:
        x: indexable sequence of numbers supporting len().
    # Returns:
        Sum of |x[i+1] - x[i]| over all adjacent pairs; 0 when x has fewer
        than two elements.
    """
    return sum(abs(x[i + 1] - x[i]) for i in range(len(x) - 1))

def power(x, y):
    """
    NaN-ignoring power sum of a sequence (original note: "from RUID").

    # Arguments:
        x: sequence of numbers (list, tuple or NumPy array).
        y: exponent applied element-wise.
    # Returns:
        Sum of x_i ** y over all non-NaN elements; 0.0 for an empty input.

    The input is converted to an array first. Raising a plain Python list to
    a power only works by accident when `y` is a NumPy scalar; with a Python
    int or float exponent it raises TypeError. np.asanyarray (rather than
    np.asarray) keeps ndarray subclasses such as masked arrays intact.

    This is a sum, not a mean: a NaN-mean variant existed in the original
    only as disabled code.
    """
    return np.nansum(np.asanyarray(x) ** y)

def _parse_surprisal_string(s):
    """Parse a whitespace-separated, bracketed number string such as "[1.5 2.0]" into a list."""
    return [
        ast.literal_eval(token.strip(' []'))
        for token in s.split() if token.strip(' []')
    ]


def load_surprisal_data(data_dir, columns, surprisal_columns):
    """
    Load data from a directory of CSV files containing surprisal estimates.

    Every ``*.csv`` file in ``data_dir`` is read and tagged with a ``corpus``
    column (CORPUS_NAMES looked up by the directory name) and a ``model``
    column (MODEL_NAME_MAP looked up by the file name without ``.csv``). The
    frames are concatenated and sequence-level aggregates are added for every
    column whose name contains "in_" or "out_" but not "acceptability":
    sum, variance, max, mean, local difference, and one power sum per
    exponent in POWER_RANGE.

    # Arguments:
        data_dir (str): Path to directory containing CSV files.
        columns (list): columns to load from CSV files. If None, all columns are loaded.
        surprisal_columns (list): columns whose string values are parsed into
            lists of numbers.
    # Returns:
        pandas.DataFrame with the loaded columns, ``corpus``, ``model`` and
        the ``<column>_agg_*`` aggregate columns. Each file's own row index is
        kept, so index labels repeat across files.
    # Raises:
        FileNotFoundError: if ``data_dir`` contains no CSV file.
        KeyError: if the directory name or a file stem has no entry in
            CORPUS_NAMES / MODEL_NAME_MAP.
    """
    # The corpus key is the last component of the directory path.
    corpus_key = os.path.basename(os.path.normpath(data_dir))

    frames = []
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the result is reproducible across machines and runs.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(data_dir, file), usecols=columns)

        # Store with fields
        df['corpus'] = CORPUS_NAMES[corpus_key]
        df['model'] = MODEL_NAME_MAP[os.path.splitext(file)[0]]

        # literal eval
        for column in surprisal_columns:
            df[column] = df[column].apply(_parse_surprisal_string)

        frames.append(df)

    if not frames:
        raise FileNotFoundError(f"No CSV files found in {data_dir!r}")

    # Concatenate once. The previous incremental version started from
    # `data = None` and only worked because pd.concat drops None entries.
    data = pd.concat(frames)

    print(f"Size of dataset: {data.shape} rows, cols")

    # Add sequence surprisal aggregates
    surprise_cols = [c for c in data.columns if ("in_" in c or "out_" in c) and "acceptability" not in c]
    for col in surprise_cols:
        try:
            data[f'{col}_agg_sum'] = data[col].apply(np.sum)
            data[f'{col}_agg_variance'] = data[col].apply(np.var)
            data[f'{col}_agg_max'] = data[col].apply(np.amax)
            data[f'{col}_agg_mean'] = data[col].apply(np.mean)
            data[f'{col}_agg_ldiff'] = data[col].apply(local_diff)
        except TypeError:
            # Best effort: report the offending column and carry on.
            print(f"TypeError for {col}")
            print(data[col].head())
        for p in POWER_RANGE:
            # The column suffix encodes the exponent, e.g. 0.5 -> "0_5".
            data[f'{col}_agg_power_' + str(p).replace('.', '_')] = data[col].apply(lambda x: power(x, p))

    print(f"Size of dataset: {data.shape} rows, cols")
    return data


In [5]:
# cols = ['context_id', 'random_context_id', 'time_sum_mean', 'time_sum_list',
       # 'time_count_nonzero', 'time_mean', 'time_sum_mean_NO',
       # 'time_sum_list_NO', 'time_count_nonzero_NO', 'time_mean_NO',]

# Load the surprisal tables of every dataset and stack them into one frame.
all_surprisal_dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprisal/{dataset}"

    # Identifier columns and the human rating, followed by the raw surprisal columns.
    all_cols = [
        'context_id', 'target_id', 'mean_acceptability_in_context',
        *SURPRISAL_COLUMNS,
    ]

    surprisal_df = load_surprisal_data(
        data_dir=DATA_DIR,
        columns=all_cols,
        surprisal_columns=SURPRISAL_COLUMNS,
    )
    all_surprisal_dfs += [surprisal_df]

surprisal = pd.concat(all_surprisal_dfs, ignore_index=True)

# surprisal['mutual_info'] = surprisal['in_context_surprisal_agg_mean'] - surprisal['out_of_context_surprisal_agg_mean']


Loading data for BLL2018...
Size of dataset: (4000, 6) rows, cols
Size of dataset: (4000, 30) rows, cols


In [6]:
# Sanity check: number of rows in `surprise` per row in `surprisal`.
len(surprise)/len(surprisal)

110.0

## Measure correlations

In [7]:
# Spearman correlation between every estimator and the mean in-context
# acceptability rating. One record is produced per configuration.
#
# Note on the sanity checks: the earlier form `assert(cond, msg)` asserted a
# two-element tuple, which is always truthy, so the check could never fail
# (Python reports it as a SyntaxWarning). The checks below are real
# assertions. The hard-coded group size of 100 is replaced by a non-empty
# check because the number of rows per group depends on the dataset.

# Names of the aggregate columns, e.g. "agg_power_0_5" for exponent 0.5.
surprisal_aggs = ['agg_sum', 'agg_variance', 'agg_max', 'agg_mean', 'agg_ldiff'] + [
    f"agg_power_{str(p).replace('.', '_')}" for p in POWER_RANGE
]

corrs = []
for corpus in tqdm(surprisal.corpus.unique()):
    for model in surprisal.model.unique():
        # The subset depends only on (corpus, model): select it once instead
        # of once per measure/aggregate combination.
        _df = surprisal[
            (surprisal['model'] == model) &
            (surprisal['corpus'] == corpus)
        ]
        assert len(_df) > 0, f"No surprisal rows for corpus={corpus!r}, model={model!r}"

        for measure in SURPRISAL_COLUMNS:
            for agg in surprisal_aggs:
                if measure == 'mutual_info':
                    measure_name = measure
                else:
                    measure_name = f'{measure}_{agg}'

                rho, p_value = spearmanr(_df[measure_name], _df['mean_acceptability_in_context'], nan_policy="omit")

                corrs.append({
                    'corpus': corpus,
                    'model': model,
                    'n_samples': 'None',
                    'sampling': 'None',
                    'measure': measure,
                    'agg': agg,
                    'corr': rho,
                    'p-value': p_value,
                    'type': 'surprisal'
                })


iv_measures = [
    'surprise_mean_1gram', 'surprise_mean_2gram', 'surprise_mean_3gram',
    'surprise_mean_1gram_pos', 'surprise_mean_2gram_pos', 'surprise_mean_3gram_pos',
    'surprise_mean_cosine', 'surprise_mean_euclidean',
    'surprise_min_1gram', 'surprise_min_2gram', 'surprise_min_3gram',
    'surprise_min_1gram_pos', 'surprise_min_2gram_pos', 'surprise_min_3gram_pos',
    'surprise_min_cosine', 'surprise_min_euclidean'
]

for corpus in surprise.corpus.unique():
    for model in surprise.model.unique():
        print(model)
        # Narrow the large frame step by step so each filter runs once per
        # loop level rather than once per measure.
        _model_df = surprise[
            (surprise['model'] == model) &
            (surprise['corpus'] == corpus)
        ]
        for n_samples in tqdm(surprise.n_samples.unique()):
            _n_samples_df = _model_df[_model_df['n_samples'] == n_samples]
            for sampling in surprise.sampling_strategy.unique():
                _df = _n_samples_df[_n_samples_df['sampling_strategy'] == sampling]
                assert len(_df) > 0, (
                    f"No surprise rows for corpus={corpus!r}, model={model!r}, "
                    f"n_samples={n_samples!r}, sampling={sampling!r}"
                )

                for measure in iv_measures:
                    rho, p_value = spearmanr(_df[measure], _df['mean_acceptability_in_context'], nan_policy="omit")

                    corrs.append({
                        'corpus': corpus,
                        'model': model,
                        'n_samples': n_samples,
                        'sampling': sampling,
                        'measure': measure,
                        'agg': 'None',
                        'corr': rho,
                        'p-value': p_value,
                        'type': 'iv'
                    })

corrs_df = pd.DataFrame(corrs)


  assert(len(_df) == 100, f"Expected 100 rows, got {len(_df)}")


  0%|          | 0/1 [00:00<?, ?it/s]

  assert(len(_df) == 100, f"Expected 100 rows, got {len(_df)}")


GPT-Neo 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Medium


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-Neo 125M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Large


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 350M


  0%|          | 0/10 [00:00<?, ?it/s]

GPT-2 Small


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 125M


  0%|          | 0/10 [00:00<?, ?it/s]

OPT 1.3B


  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
# Save the correlation table relative to the notebook instead of to a
# user-specific absolute path, so the cell runs on any machine.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# directory (the one that contains `dataframes/`) — confirm.
output_dir = "dataframes"
os.makedirs(output_dir, exist_ok=True)
corrs_df.to_csv(os.path.join(output_dir, "correlations_clasp.csv"), index=False)


## Show rankings

In [9]:
# Surprisal-based estimators on Clasp, most negative correlation first.
(
    corrs_df
    .loc[corrs_df['corpus'].eq('Clasp') & corrs_df['type'].eq('surprisal')]
    .sort_values(by='corr')
    .head(30)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
171,Clasp,OPT 125M,,,in_context_surprisal,agg_mean,-0.558615,2.268065e-42,surprisal
27,Clasp,OPT 1.3B,,,in_context_surprisal,agg_mean,-0.547677,1.777142e-40,surprisal
75,Clasp,OPT 350M,,,in_context_surprisal,agg_mean,-0.545489,4.170835e-40,surprisal
123,Clasp,GPT-2 Large,,,in_context_surprisal,agg_mean,-0.52962,1.690744e-37,surprisal
99,Clasp,GPT-Neo 1.3B,,,in_context_surprisal,agg_mean,-0.518461,9.566251e-36,surprisal
3,Clasp,GPT-2 Medium,,,in_context_surprisal,agg_mean,-0.498946,7.842289e-33,surprisal
51,Clasp,GPT-2 Small,,,in_context_surprisal,agg_mean,-0.430965,4.967378e-24,surprisal
147,Clasp,GPT-Neo 125M,,,in_context_surprisal,agg_mean,-0.411792,6.923619e-22,surprisal
40,Clasp,OPT 1.3B,,,in_context_surprisal,agg_power_3_25,-0.406959,2.2879499999999998e-21,surprisal
41,Clasp,OPT 1.3B,,,in_context_surprisal,agg_power_3_5,-0.405595,3.1946509999999998e-21,surprisal


In [10]:
# Information-value estimators on Clasp, most negative correlation first.
(
    corrs_df
    .loc[corrs_df['corpus'].eq('Clasp') & corrs_df['type'].eq('iv')]
    .sort_values(by='corr')
    .head(30)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
7084,Clasp,GPT-2 Large,100,Nucleus_0.95,surprise_min_2gram_pos,,-0.233694,1.251341e-07,iv
7020,Clasp,GPT-2 Large,90,Nucleus_0.85,surprise_min_2gram_pos,,-0.22253,4.991703e-07,iv
1084,Clasp,GPT-Neo 1.3B,60,Temperature_1.25,surprise_min_2gram_pos,,-0.221416,5.708349e-07,iv
14094,Clasp,OPT 1.3B,90,Temperature_0.75,surprise_min_cosine,,-0.221016,5.989753e-07,iv
14095,Clasp,OPT 1.3B,90,Temperature_0.75,surprise_min_euclidean,,-0.221016,5.989753e-07,iv
14271,Clasp,OPT 1.3B,100,Temperature_0.75,surprise_min_euclidean,,-0.218799,7.802537e-07,iv
14270,Clasp,OPT 1.3B,100,Temperature_0.75,surprise_min_cosine,,-0.218799,7.802537e-07,iv
1260,Clasp,GPT-Neo 1.3B,70,Temperature_1.25,surprise_min_2gram_pos,,-0.217021,9.627191e-07,iv
13918,Clasp,OPT 1.3B,80,Temperature_0.75,surprise_min_cosine,,-0.216036,1.080875e-06,iv
13919,Clasp,OPT 1.3B,80,Temperature_0.75,surprise_min_euclidean,,-0.216036,1.080875e-06,iv


## Find best estimator per corpus and linguistic level

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
17822,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_cosine,,-0.702414,3.832227e-16,iv
17823,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_euclidean,,-0.702414,3.832227e-16,iv
17647,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_euclidean,,-0.692922,1.375102e-15,iv
17646,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_cosine,,-0.692922,1.375102e-15,iv
24703,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_euclidean,,-0.692855,1.387327e-15,iv
24702,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_cosine,,-0.692855,1.387327e-15,iv
24878,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_cosine,,-0.688118,2.576984e-15,iv
24879,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_euclidean,,-0.688118,2.576984e-15,iv
19742,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_cosine,,-0.681657,5.885868e-15,iv
19743,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_euclidean,,-0.681657,5.885868e-15,iv


In [12]:
# Information-value estimators on Clasp whose measure name contains none of
# "cosine", "euclidean" or "pos", most negative correlation first.
(
    corrs_df
    .loc[
        corrs_df['corpus'].eq('Clasp')
        & corrs_df['type'].eq('iv')
        & ~corrs_df['measure'].str.contains('cosine|euclidean|pos')
    ]
    .sort_values(by='corr')
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
6890,Clasp,GPT-2 Large,90,Temperature_1.25,surprise_min_3gram,,-0.21,2e-06,iv
11418,Clasp,OPT 125M,40,Nucleus_0.85,surprise_min_3gram,,-0.205824,4e-06,iv
7066,Clasp,GPT-2 Large,100,Temperature_1.25,surprise_min_3gram,,-0.204071,4e-06,iv
6882,Clasp,GPT-2 Large,90,Temperature_1.25,surprise_mean_3gram,,-0.202738,5e-06,iv
11594,Clasp,OPT 125M,50,Nucleus_0.85,surprise_min_3gram,,-0.200002,7e-06,iv
11410,Clasp,OPT 125M,40,Nucleus_0.85,surprise_mean_3gram,,-0.199666,7e-06,iv
11242,Clasp,OPT 125M,30,Nucleus_0.85,surprise_min_3gram,,-0.195138,1.1e-05,iv
7058,Clasp,GPT-2 Large,100,Temperature_1.25,surprise_mean_3gram,,-0.194668,1.2e-05,iv
12122,Clasp,OPT 125M,80,Nucleus_0.85,surprise_min_3gram,,-0.194549,1.2e-05,iv
11586,Clasp,OPT 125M,50,Nucleus_0.85,surprise_mean_3gram,,-0.194251,1.2e-05,iv


In [14]:
# Information-value estimators on Clasp whose measure name contains "pos",
# most negative correlation first.
(
    corrs_df
    .loc[
        corrs_df['corpus'].eq('Clasp')
        & corrs_df['type'].eq('iv')
        & corrs_df['measure'].str.contains('pos')
    ]
    .sort_values(by='corr')
    .head(10)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
7084,Clasp,GPT-2 Large,100,Nucleus_0.95,surprise_min_2gram_pos,,-0.233694,1.251341e-07,iv
7020,Clasp,GPT-2 Large,90,Nucleus_0.85,surprise_min_2gram_pos,,-0.22253,4.991703e-07,iv
1084,Clasp,GPT-Neo 1.3B,60,Temperature_1.25,surprise_min_2gram_pos,,-0.221416,5.708349e-07,iv
1260,Clasp,GPT-Neo 1.3B,70,Temperature_1.25,surprise_min_2gram_pos,,-0.217021,9.627191e-07,iv
11596,Clasp,OPT 125M,50,Nucleus_0.85,surprise_min_2gram_pos,,-0.215548,1.144311e-06,iv
11420,Clasp,OPT 125M,40,Nucleus_0.85,surprise_min_2gram_pos,,-0.21543,1.160251e-06,iv
14156,Clasp,OPT 1.3B,100,Nucleus_0.9,surprise_min_2gram_pos,,-0.215385,1.16631e-06,iv
1436,Clasp,GPT-Neo 1.3B,80,Temperature_1.25,surprise_min_2gram_pos,,-0.2143,1.323438e-06,iv
909,Clasp,GPT-Neo 1.3B,50,Temperature_1.25,surprise_min_3gram_pos,,-0.213865,1.392076e-06,iv
6908,Clasp,GPT-2 Large,90,Nucleus_0.95,surprise_min_2gram_pos,,-0.213347,1.47803e-06,iv


In [16]:
# Information-value estimators on Clasp whose measure name contains "cosine"
# or "euclidean", most negative correlation first.
(
    corrs_df
    .loc[
        corrs_df['corpus'].eq('Clasp')
        & corrs_df['type'].eq('iv')
        & corrs_df['measure'].str.contains('cosine|euclidean')
    ]
    .sort_values(by='corr')
    .head(20)
)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
14094,Clasp,OPT 1.3B,90,Temperature_0.75,surprise_min_cosine,,-0.221016,5.989753e-07,iv
14095,Clasp,OPT 1.3B,90,Temperature_0.75,surprise_min_euclidean,,-0.221016,5.989753e-07,iv
14271,Clasp,OPT 1.3B,100,Temperature_0.75,surprise_min_euclidean,,-0.218799,7.802537e-07,iv
14270,Clasp,OPT 1.3B,100,Temperature_0.75,surprise_min_cosine,,-0.218799,7.802537e-07,iv
13918,Clasp,OPT 1.3B,80,Temperature_0.75,surprise_min_cosine,,-0.216036,1.080875e-06,iv
13919,Clasp,OPT 1.3B,80,Temperature_0.75,surprise_min_euclidean,,-0.216036,1.080875e-06,iv
13742,Clasp,OPT 1.3B,70,Temperature_0.75,surprise_min_cosine,,-0.208115,2.686657e-06,iv
13743,Clasp,OPT 1.3B,70,Temperature_0.75,surprise_min_euclidean,,-0.208115,2.686657e-06,iv
14238,Clasp,OPT 1.3B,100,Nucleus_0.85,surprise_min_cosine,,-0.204211,4.154967e-06,iv
14239,Clasp,OPT 1.3B,100,Nucleus_0.85,surprise_min_euclidean,,-0.204211,4.154967e-06,iv
