In [1]:
### Import and utils
import ast
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from collections import defaultdict, Counter
from scipy.stats import kendalltau, linregress
from tqdm.notebook import tqdm
from typing import List, Dict, Tuple, Optional, Union
from ast import literal_eval
import statsmodels.api as sm
import statsmodels.formula.api as smf


# For local imports
import sys
sys.path.append('../../')
from utils import load_jsonl, load_surprise_data


# Plots
# Configure seaborn in ONE call. `sns.set(...)` (alias of `sns.set_theme`)
# re-applies its own defaults for every argument that is not passed
# (context="notebook", style="darkgrid", palette="deep"), so calling it after
# set_context / set_style / set_palette silently discards those settings.
sns.set(context="paper", style="whitegrid", palette="colorblind", font_scale=1.8)

## Load data and preprocess

In [2]:
# Corpora processed by the loading cells below.
DATASETS = ["dailydialog", "switchboard"]

# Corpus key -> relative path of the corresponding JSONL file.
DATASET_CONTEXT_MAP = {
    "switchboard": "switchboard/switchboard_results_is.jsonl",
    "dailydialog": "dailydialog/dailydialog_results_is.jsonl",
    "BLL2018": "BLL2018/processed_ratings.jsonl",
    "brown": "RTs/brown_rt.jsonl",
    "ns": "RTs/ns_rt.jsonl",
    "provo": "RTs/provo_rt.jsonl",
}

# Corpus key -> display name used in tables.
CORPUS_NAMES = {
    "switchboard": "Switchboard",
    "dailydialog": "Dailydialog",
    "BLL2018": "Clasp",
    "provo": "Provo",
    "brown": "Brown",
    "ns": "Natural Stories",
}

# Model identifier -> display name. Most models are listed under both their
# underscore and hyphen spellings.
MODEL_NAME_MAP = {
    # GPT-2
    "gpt2": "GPT-2 Small",
    "gpt2-ft": "GPT-2 Small Finetuned",  # todo: remove
    "gpt2_medium": "GPT-2 Medium", "gpt2-medium": "GPT-2 Medium",
    "gpt2_large": "GPT-2 Large", "gpt2-large": "GPT-2 Large",
    # DialoGPT
    "dialogpt_small": "DialoGPT Small", "dialogpt-small": "DialoGPT Small",
    "dialogpt_medium": "DialoGPT Medium", "dialogpt-medium": "DialoGPT Medium",
    "dialogpt_large": "DialoGPT Large", "dialogpt-large": "DialoGPT Large",
    # OPT
    "opt_125m": "OPT 125M", "opt-125m": "OPT 125M",
    "opt_350m": "OPT 350M", "opt-350m": "OPT 350M",
    "opt_1.3b": "OPT 1.3B", "opt-1.3b": "OPT 1.3B",
    # GPT-Neo (the 1.3B model also appears with an upper-case "B")
    "gpt_neo_125m": "GPT-Neo 125M", "gpt-neo-125m": "GPT-Neo 125M",
    "gpt_neo_1.3b": "GPT-Neo 1.3B", "gpt-neo-1.3b": "GPT-Neo 1.3B",
    "gpt-neo-1.3B": "GPT-Neo 1.3B",
}

# Sampling parameter as encoded in the data (decimal point dropped) -> readable value.
SAMPLING_PARAM_MAP = {
    "None": "None",
    "02": "0.2",
    "03": "0.3",
    "075": "0.75",
    "08": "0.8",
    "085": "0.85",
    "09": "0.9",
    "095": "0.95",
    "125": "1.25",
}

# Token-level columns to load and aggregate. Currently disabled alternatives:
#   out_of_context_surprisal, in_context_surprisal_rnd,
#   in_context_entropies, out_of_context_entropies, in_context_entropies_rnd,
#   in_context_deviations, out_of_context_deviations, in_context_deviations_rnd
SURPRISAL_COLUMNS = ["in_context_surprisal"]


In [3]:
# Load the surprise data of every dataset and prettify its label columns.
# NOTE(review): the returned frame is assumed to carry "corpus", "sampling",
# "model" and "sampling_param" columns -- confirm against utils.load_surprise_data.
dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprise/{dataset}"

    df = load_surprise_data(DATA_DIR)

    # Prettify column and value names
    df["corpus"] = df["corpus"].map(CORPUS_NAMES)
    # regex=False: "Temp" is a literal prefix, and the default of `regex`
    # differs between pandas versions.
    df["sampling"] = (
        df["sampling"].str.capitalize().str.replace("Temp", "Temperature", regex=False)
    )

    df["model"] = df["model"].map(MODEL_NAME_MAP)

    # Use the shared constant instead of rebuilding an identical dict per dataset.
    df["sampling_param"] = df["sampling_param"].map(SAMPLING_PARAM_MAP)

    dfs.append(df)

surprise = pd.concat(dfs, axis=0, ignore_index=True)

# Merge sampling and sampling param column in surprise
# (cast to str first so the concatenation also works for non-string values)
surprise['sampling'] = surprise['sampling'].astype(str)
surprise['sampling_param'] = surprise['sampling_param'].astype(str)
surprise['sampling_strategy'] = surprise['sampling'] + '_' + surprise['sampling_param']


Loading data for dailydialog...
Size of dataset: 88000 rows
Loading data for switchboard...
Size of dataset: 88000 rows


In [4]:
# Exponents 0.5, 0.75, ..., 5.0 (19 values, step 0.25) used for the power aggregates.
POWER_RANGE = 0.25 * np.arange(2, 21)

def local_diff(x):
    """Sum of absolute differences between consecutive elements of `x` (from RUID).

    Returns 0 when `x` has fewer than two elements.
    """
    return sum(abs(x[i + 1] - x[i]) for i in range(len(x) - 1))

def power(x, y):
    """Sum of the elements of `x` raised to the power `y`, ignoring NaNs (from RUID).

    # Arguments:
        x: sequence or array of numbers (a plain Python list is accepted).
        y: exponent applied element-wise.
    # Returns:
        `np.nansum(x ** y)`; NaN terms contribute nothing, so an empty or
        all-NaN input yields 0.
    """
    # Convert explicitly: `list ** scalar` only works when the scalar is a
    # numpy type (via its reflected operator) and raises TypeError for a plain
    # Python int/float. asanyarray keeps ndarray subclasses (e.g. masked arrays).
    return np.nansum(np.asanyarray(x) ** y)

def _parse_surprisal_string(s):
    """Parse a bracketed, whitespace-separated string such as "[1.5 2.0 3.25]" into a list of literals."""
    tokens = (tok.strip(' []') for tok in s.split())
    return [ast.literal_eval(tok) for tok in tokens if tok]


def load_surprisal_data(data_dir, columns, surprisal_columns):
    """
    Load data from a directory of CSV files containing surprisal estimates.

    Every `*.csv` file in `data_dir` is read, tagged with a `corpus` label
    (looked up in CORPUS_NAMES from the directory name) and a `model` label
    (looked up in MODEL_NAME_MAP from the file name without extension), and all
    files are stacked row-wise; the per-file row index is kept. Sequence-level
    aggregates (sum, variance, max, mean, local difference, and one power sum
    per exponent in POWER_RANGE) are then added for every column whose name
    contains "in_" or "out_" but not "acceptability".

    # Arguments:
        data_dir (str): Path to directory containing CSV files.
        columns (list): columns to load from CSV files. If None, all columns are loaded.
        surprisal_columns (list): columns holding stringified lists, parsed with ast
    # Returns:
        pandas.DataFrame with the loaded columns, `corpus`, `model` and the
        `<col>_agg_*` aggregate columns.
    # Raises:
        FileNotFoundError: if `data_dir` contains no CSV file.
        KeyError: if the directory or file name has no entry in the name maps.
    """
    # Name of the last path component, independent of trailing separators and OS.
    corpus_key = os.path.basename(os.path.normpath(data_dir))

    frames = []
    # Sorted so the row (and hence model) order is reproducible; os.listdir
    # returns entries in arbitrary order.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(data_dir, file), usecols=columns)

        # Store with fields
        df['corpus'] = CORPUS_NAMES[corpus_key]
        df['model'] = MODEL_NAME_MAP[os.path.splitext(file)[0]]

        # literal eval
        for column in surprisal_columns:
            df[column] = df[column].apply(_parse_surprisal_string)

        frames.append(df)

    if not frames:
        raise FileNotFoundError(f"No CSV files found in {data_dir}")

    # Concatenate once instead of growing the frame inside the loop.
    data = pd.concat(frames)

    print(f"Size of dataset: {data.shape} rows, cols")

    # Add sequence surprisal aggregates
    surprise_cols = [c for c in data.columns if ("in_" in c or "out_" in c) and "acceptability" not in c]
    for col in surprise_cols:
        try:
            data[f'{col}_agg_sum'] = data[col].apply(np.sum)
            data[f'{col}_agg_variance'] = data[col].apply(np.var)
            data[f'{col}_agg_max'] = data[col].apply(np.amax)
            data[f'{col}_agg_mean'] = data[col].apply(np.mean)
            data[f'{col}_agg_ldiff'] = data[col].apply(local_diff)
        except TypeError:
            # Best effort: report the offending column and keep going.
            print(f"TypeError for {col}")
            print(data[col].head())
        for p in POWER_RANGE:
            # Bind p as a default so the lambda does not depend on the loop variable.
            data[f'{col}_agg_power_' + str(p).replace('.', '_')] = data[col].apply(
                lambda x, p=p: power(x, p)
            )

    print(f"Size of dataset: {data.shape} rows, cols")
    return data


In [5]:
# cols = ['context_id', 'random_context_id', 'time_sum_mean', 'time_sum_list',
       # 'time_count_nonzero', 'time_mean', 'time_sum_mean_NO',
       # 'time_sum_list_NO', 'time_count_nonzero_NO', 'time_mean_NO',]

# Load the per-model surprisal CSVs of every dataset and stack them into one frame.
all_surprisal_dfs = []
for dataset in DATASETS:
    print(f"Loading data for {dataset}...")
    DATA_DIR = f"../../../data/surprisal/{dataset}"

    # Identifier and rating columns first, then the token-level surprisal columns.
    all_cols = ['context_id', 'target_id', 'mean_acceptability', *SURPRISAL_COLUMNS]

    surprisal_df = load_surprisal_data(
        data_dir=DATA_DIR,
        columns=all_cols,
        surprisal_columns=SURPRISAL_COLUMNS,
    )
    all_surprisal_dfs += [surprisal_df]

surprisal = pd.concat(all_surprisal_dfs, ignore_index=True)

# surprisal['mutual_info'] = surprisal['in_context_surprisal_agg_mean'] - surprisal['out_of_context_surprisal_agg_mean']


Loading data for dailydialog...
Size of dataset: (800, 6) rows, cols
Size of dataset: (800, 30) rows, cols
Loading data for switchboard...
Size of dataset: (800, 6) rows, cols
Size of dataset: (800, 30) rows, cols


In [6]:
# Number of `surprise` rows per `surprisal` row
surprise.shape[0] / surprisal.shape[0]

110.0

## Measure correlations

In [53]:
# Spearman correlation between every estimator and the mean acceptability
# rating, computed separately for each corpus / model (/ sampling setup).

# Suffixes of the aggregate columns added by load_surprisal_data.
agg_names = (
    ['agg_sum', 'agg_variance', 'agg_max', 'agg_mean', 'agg_ldiff']
    + [f"agg_power_{str(p).replace('.', '_')}" for p in POWER_RANGE]
)

corrs = []
for corpus in tqdm(surprisal.corpus.unique()):
    for model in surprisal.model.unique():
        # The subset depends only on (corpus, model): filter once, not per aggregate.
        _df = surprisal[
            (surprisal['model'] == model) &
            (surprisal['corpus'] == corpus)
        ]
        # No parentheses around "condition, message": `assert (cond, msg)` tests a
        # non-empty tuple, which is always true, so the check could never fail.
        assert len(_df) == 100, f"Expected 100 rows, got {len(_df)}"

        for measure in SURPRISAL_COLUMNS:
            for agg in agg_names:

                if measure == 'mutual_info':
                    measure_name = measure
                else:
                    measure_name = f'{measure}_{agg}'

                rho, p = spearmanr(_df[measure_name], _df['mean_acceptability'], nan_policy="omit")

                corrs.append({
                    'corpus': corpus,
                    'model': model,
                    'n_samples': 'None',
                    'sampling': 'None',
                    'measure': measure,
                    'agg': agg,
                    'corr': rho,
                    'p-value': p,
                    'type': 'surprisal'
                })


iv_measures = [
    'surprise_mean_1gram', 'surprise_mean_2gram', 'surprise_mean_3gram',
    'surprise_mean_1gram_pos', 'surprise_mean_2gram_pos', 'surprise_mean_3gram_pos',
    'surprise_mean_cosine', 'surprise_mean_euclidean',
    'surprise_min_1gram', 'surprise_min_2gram', 'surprise_min_3gram',
    'surprise_min_1gram_pos', 'surprise_min_2gram_pos', 'surprise_min_3gram_pos',
    'surprise_min_cosine', 'surprise_min_euclidean'
]

for corpus in surprise.corpus.unique():
    for model in surprise.model.unique():
        print(model)
        for n_samples in tqdm(surprise.n_samples.unique()):
            for sampling in surprise.sampling_strategy.unique():
                # Filter once per configuration; the subset is the same for every measure.
                _df = surprise[
                    (surprise['model'] == model) &
                    (surprise['corpus'] == corpus) &
                    (surprise['n_samples'] == n_samples) &
                    (surprise['sampling_strategy'] == sampling)
                ]
                assert len(_df) == 100, f"Expected 100 rows, got {len(_df)}"

                for measure in iv_measures:
                    rho, p = spearmanr(_df[measure], _df['mean_acceptability'], nan_policy="omit")

                    corrs.append({
                        'corpus': corpus,
                        'model': model,
                        'n_samples': n_samples,
                        'sampling': sampling,
                        'measure': measure,
                        'agg': 'None',
                        'corr': rho,
                        'p-value': p,
                        'type': 'iv'
                    })

corrs_df = pd.DataFrame(corrs)


In [None]:
# Write to a path relative to the working directory instead of a user-specific
# absolute path, so the cell also runs on other machines.
# NOTE(review): assumes the notebook is run from its own directory
# (code/notebooks/correlations) -- confirm.
CORRELATIONS_DIR = "dataframes"
os.makedirs(CORRELATIONS_DIR, exist_ok=True)
corrs_df.to_csv(os.path.join(CORRELATIONS_DIR, "correlations_dialogue.csv"), index=False)

## Show rankings

In [9]:
# Switchboard, surprisal-based estimators: 30 lowest (most negative) correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Switchboard') & corrs_df['type'].eq('surprisal')
].sort_values(by='corr').head(30)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
211,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_4_0,-0.505806,7.962361e-08,surprisal
212,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_4_25,-0.505067,8.375239e-08,surprisal
213,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_4_5,-0.501352,1.077773e-07,surprisal
210,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_3_75,-0.499512,1.219779e-07,surprisal
214,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_4_75,-0.499368,1.23164e-07,surprisal
280,Switchboard,GPT-2 Large,,,in_context_surprisal,agg_power_3_25,-0.499284,1.23861e-07,surprisal
281,Switchboard,GPT-2 Large,,,in_context_surprisal,agg_power_3_5,-0.495274,1.617825e-07,surprisal
282,Switchboard,GPT-2 Large,,,in_context_surprisal,agg_power_3_75,-0.493681,1.797245e-07,surprisal
209,Switchboard,GPT-2 Medium,,,in_context_surprisal,agg_power_3_5,-0.493543,1.81368e-07,surprisal
283,Switchboard,GPT-2 Large,,,in_context_surprisal,agg_power_4_0,-0.493224,1.852099e-07,surprisal


In [10]:
# Switchboard, 'iv' estimators: 30 lowest (most negative) correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Switchboard') & corrs_df['type'].eq('iv')
].sort_values(by='corr').head(30)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
17823,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_euclidean,,-0.702414,3.832227e-16,iv
17822,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_cosine,,-0.702414,3.832227e-16,iv
17646,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_cosine,,-0.692922,1.375102e-15,iv
17647,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_euclidean,,-0.692922,1.375102e-15,iv
24703,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_euclidean,,-0.692855,1.387327e-15,iv
24702,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_cosine,,-0.692855,1.387327e-15,iv
24879,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_euclidean,,-0.688118,2.576984e-15,iv
24878,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_cosine,,-0.688118,2.576984e-15,iv
19742,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_cosine,,-0.681657,5.885868e-15,iv
19743,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_euclidean,,-0.681657,5.885868e-15,iv


In [37]:
# Dailydialog, surprisal-based estimators: 30 lowest (most negative) correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Dailydialog') & corrs_df['type'].eq('surprisal')
].sort_values(by='corr').head(30)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
5375,Dailydialog,GPT-2 Large,90,Nucleus_0.9,surprise_min_euclidean,,-0.584379,1.736895e-10,iv
5374,Dailydialog,GPT-2 Large,90,Nucleus_0.9,surprise_min_cosine,,-0.584379,1.736895e-10,iv
5551,Dailydialog,GPT-2 Large,100,Nucleus_0.9,surprise_min_euclidean,,-0.578579,2.889064e-10,iv
5550,Dailydialog,GPT-2 Large,100,Nucleus_0.9,surprise_min_cosine,,-0.578579,2.889064e-10,iv
5198,Dailydialog,GPT-2 Large,80,Nucleus_0.9,surprise_min_cosine,,-0.577742,3.106707e-10,iv
5199,Dailydialog,GPT-2 Large,80,Nucleus_0.9,surprise_min_euclidean,,-0.577742,3.106707e-10,iv
11918,Dailydialog,GPT-Neo 1.3B,60,Nucleus_0.8,surprise_min_cosine,,-0.575127,3.892641e-10,iv
11919,Dailydialog,GPT-Neo 1.3B,60,Nucleus_0.8,surprise_min_euclidean,,-0.575127,3.892641e-10,iv
5022,Dailydialog,GPT-2 Large,70,Nucleus_0.9,surprise_min_cosine,,-0.574599,4.072965e-10,iv
5023,Dailydialog,GPT-2 Large,70,Nucleus_0.9,surprise_min_euclidean,,-0.574599,4.072965e-10,iv


In [44]:
# Dailydialog, 'iv' estimators: 30 lowest (most negative) correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Dailydialog') & corrs_df['type'].eq('iv')
].sort_values(by='corr').head(30)

## Find best estimator per corpus and linguistic level

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
17822,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_cosine,,-0.702414,3.832227e-16,iv
17823,Switchboard,DialoGPT Large,100,Temperature_1.25,surprise_min_euclidean,,-0.702414,3.832227e-16,iv
17647,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_euclidean,,-0.692922,1.375102e-15,iv
17646,Switchboard,DialoGPT Large,90,Temperature_1.25,surprise_min_cosine,,-0.692922,1.375102e-15,iv
24703,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_euclidean,,-0.692855,1.387327e-15,iv
24702,Switchboard,GPT-2 Small,90,Typical_0.95,surprise_min_cosine,,-0.692855,1.387327e-15,iv
24878,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_cosine,,-0.688118,2.576984e-15,iv
24879,Switchboard,GPT-2 Small,100,Typical_0.95,surprise_min_euclidean,,-0.688118,2.576984e-15,iv
19742,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_cosine,,-0.681657,5.885868e-15,iv
19743,Switchboard,GPT-2 Large,100,Ancestral_None,surprise_min_euclidean,,-0.681657,5.885868e-15,iv


In [46]:
# Switchboard, 'iv' measures whose name contains "cosine" or "euclidean": 20 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Switchboard')
    & corrs_df['type'].eq('iv')
    & corrs_df['measure'].str.contains('cosine|euclidean')
].sort_values(by='corr').head(20)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
20809,Switchboard,DialoGPT Medium,70,Temperature_1.25,surprise_min_2gram,,-0.43646,1.7e-05,iv
20985,Switchboard,DialoGPT Medium,80,Temperature_1.25,surprise_min_2gram,,-0.407725,6.6e-05,iv
21161,Switchboard,DialoGPT Medium,90,Temperature_1.25,surprise_min_2gram,,-0.402159,8.5e-05,iv
21337,Switchboard,DialoGPT Medium,100,Temperature_1.25,surprise_min_2gram,,-0.401664,8.7e-05,iv
20633,Switchboard,DialoGPT Medium,60,Temperature_1.25,surprise_min_2gram,,-0.392563,0.00013,iv
20457,Switchboard,DialoGPT Medium,50,Temperature_1.25,surprise_min_2gram,,-0.369605,0.000337,iv
21482,Switchboard,DialoGPT Medium,100,Nucleus_0.95,surprise_min_3gram,,-0.354729,0.000748,iv
21130,Switchboard,DialoGPT Medium,80,Nucleus_0.95,surprise_min_3gram,,-0.345498,0.001047,iv
21642,Switchboard,GPT-Neo 125M,10,Typical_0.3,surprise_min_3gram,,-0.340502,0.001251,iv
21496,Switchboard,DialoGPT Medium,100,Ancestral_None,surprise_min_1gram,,-0.3403,0.000532,iv


In [40]:
# Switchboard, 'iv' measures whose name contains none of "cosine", "euclidean", "pos":
# 20 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Switchboard')
    & corrs_df['type'].eq('iv')
    & ~corrs_df['measure'].str.contains('cosine|euclidean|pos')
].sort_values(by='corr').head(20)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
28540,Switchboard,DialoGPT Small,100,Ancestral_None,surprise_min_2gram_pos,,-0.44038,1.4e-05,iv
28364,Switchboard,DialoGPT Small,90,Ancestral_None,surprise_min_2gram_pos,,-0.406467,7e-05,iv
26461,Switchboard,GPT-Neo 1.3B,90,Typical_0.95,surprise_min_3gram_pos,,-0.392518,0.00017,iv
28188,Switchboard,DialoGPT Small,80,Ancestral_None,surprise_min_2gram_pos,,-0.391849,0.000134,iv
23261,Switchboard,GPT-Neo 125M,100,Ancestral_None,surprise_min_3gram_pos,,-0.391002,0.000181,iv
23260,Switchboard,GPT-Neo 125M,100,Ancestral_None,surprise_min_2gram_pos,,-0.374333,0.000278,iv
26637,Switchboard,GPT-Neo 1.3B,100,Typical_0.95,surprise_min_3gram_pos,,-0.372024,0.000388,iv
22908,Switchboard,GPT-Neo 125M,80,Ancestral_None,surprise_min_2gram_pos,,-0.371991,0.000306,iv
23084,Switchboard,GPT-Neo 125M,90,Ancestral_None,surprise_min_2gram_pos,,-0.361208,0.000469,iv
24860,Switchboard,GPT-2 Small,100,Temperature_1.25,surprise_min_2gram_pos,,-0.360643,0.00048,iv


In [None]:
# Switchboard, 'iv' measures whose name contains "pos": 10 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Switchboard')
    & corrs_df['type'].eq('iv')
    & corrs_df['measure'].str.contains('pos')
].sort_values(by='corr').head(10)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
5374,Dailydialog,GPT-2 Large,90,Nucleus_0.9,surprise_min_cosine,,-0.584379,1.736895e-10,iv
5375,Dailydialog,GPT-2 Large,90,Nucleus_0.9,surprise_min_euclidean,,-0.584379,1.736895e-10,iv
5551,Dailydialog,GPT-2 Large,100,Nucleus_0.9,surprise_min_euclidean,,-0.578579,2.889064e-10,iv
5550,Dailydialog,GPT-2 Large,100,Nucleus_0.9,surprise_min_cosine,,-0.578579,2.889064e-10,iv
5199,Dailydialog,GPT-2 Large,80,Nucleus_0.9,surprise_min_euclidean,,-0.577742,3.106707e-10,iv
5198,Dailydialog,GPT-2 Large,80,Nucleus_0.9,surprise_min_cosine,,-0.577742,3.106707e-10,iv
11918,Dailydialog,GPT-Neo 1.3B,60,Nucleus_0.8,surprise_min_cosine,,-0.575127,3.892641e-10,iv
11919,Dailydialog,GPT-Neo 1.3B,60,Nucleus_0.8,surprise_min_euclidean,,-0.575127,3.892641e-10,iv
5023,Dailydialog,GPT-2 Large,70,Nucleus_0.9,surprise_min_euclidean,,-0.574599,4.072965e-10,iv
5022,Dailydialog,GPT-2 Large,70,Nucleus_0.9,surprise_min_cosine,,-0.574599,4.072965e-10,iv


In [51]:
# Dailydialog, 'iv' measures whose name contains "cosine" or "euclidean": 10 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Dailydialog')
    & corrs_df['type'].eq('iv')
    & corrs_df['measure'].str.contains('cosine|euclidean')
].sort_values(by='corr').head(10)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
14104,Dailydialog,DialoGPT Small,80,Ancestral_None,surprise_min_1gram,,-0.383088,8.4e-05,iv
6232,Dailydialog,DialoGPT Medium,40,Temperature_0.75,surprise_min_1gram,,-0.381579,9e-05,iv
14456,Dailydialog,DialoGPT Small,100,Ancestral_None,surprise_min_1gram,,-0.377049,0.00011,iv
5544,Dailydialog,GPT-2 Large,100,Nucleus_0.9,surprise_min_1gram,,-0.376168,0.000115,iv
3832,Dailydialog,DialoGPT Large,100,Nucleus_0.85,surprise_min_1gram,,-0.372974,0.000133,iv
14280,Dailydialog,DialoGPT Small,90,Ancestral_None,surprise_min_1gram,,-0.37196,0.000139,iv
6408,Dailydialog,DialoGPT Medium,50,Temperature_0.75,surprise_min_1gram,,-0.371375,0.000142,iv
5016,Dailydialog,GPT-2 Large,70,Nucleus_0.9,surprise_min_1gram,,-0.370335,0.000149,iv
12456,Dailydialog,GPT-Neo 1.3B,90,Nucleus_0.85,surprise_min_1gram,,-0.367728,0.000167,iv
12632,Dailydialog,GPT-Neo 1.3B,100,Nucleus_0.85,surprise_min_1gram,,-0.367337,0.00017,iv


In [52]:
# Dailydialog, 'iv' measures whose name contains none of "cosine", "euclidean", "pos":
# 10 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Dailydialog')
    & corrs_df['type'].eq('iv')
    & ~corrs_df['measure'].str.contains('cosine|euclidean|pos')
].sort_values(by='corr').head(10)

Unnamed: 0,corpus,model,n_samples,sampling,measure,agg,corr,p-value,type
3565,Dailydialog,DialoGPT Large,90,Temperature_1.25,surprise_min_3gram_pos,,-0.358922,0.000284,iv
1757,Dailydialog,GPT-2 Medium,80,Typical_0.3,surprise_min_3gram_pos,,-0.346284,0.000417,iv
3741,Dailydialog,DialoGPT Large,100,Temperature_1.25,surprise_min_3gram_pos,,-0.346038,0.000483,iv
1581,Dailydialog,GPT-2 Medium,70,Typical_0.3,surprise_min_3gram_pos,,-0.339266,0.000554,iv
14109,Dailydialog,DialoGPT Small,80,Ancestral_None,surprise_min_3gram_pos,,-0.333806,0.000782,iv
3213,Dailydialog,DialoGPT Large,70,Temperature_1.25,surprise_min_3gram_pos,,-0.329093,0.000937,iv
14461,Dailydialog,DialoGPT Small,100,Ancestral_None,surprise_min_3gram_pos,,-0.325886,0.001058,iv
3037,Dailydialog,DialoGPT Large,60,Temperature_1.25,surprise_min_3gram_pos,,-0.325052,0.001092,iv
14285,Dailydialog,DialoGPT Small,90,Ancestral_None,surprise_min_3gram_pos,,-0.323572,0.001154,iv
1933,Dailydialog,GPT-2 Medium,90,Typical_0.3,surprise_min_3gram_pos,,-0.3134,0.001498,iv


In [None]:
# Dailydialog, 'iv' measures whose name contains "pos": 10 lowest correlations
corrs_df.loc[
    corrs_df['corpus'].eq('Dailydialog')
    & corrs_df['type'].eq('iv')
    & corrs_df['measure'].str.contains('pos')
].sort_values(by='corr').head(10)