In [1]:
import numpy as np
import pandas as pd

from ast import literal_eval
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from string import punctuation
from tqdm.notebook import tqdm


### Load Aligned measurements

In [2]:
aligned_df = pd.read_csv('../data/corpora/aligned/all_measures.csv')
aligned_df = aligned_df[aligned_df.columns.drop(
    list(aligned_df.filter(regex='GPT')) + 
    list(aligned_df.filter(regex='gram')) + 
    list(aligned_df.filter(regex='psg')) + 
    list(aligned_df.filter(regex='rnn')) +
    ['is_start_end']
    )]
aligned_df.head(3)

Unnamed: 0,item,word,word2,sentence,context_length,sent_id,item_id,list,rating_mean,rating_sd,...,EPNP,P600,PNP,RTfirstfix,RTfirstpass,RTrightbound,RTgopast,self_paced_reading_time,Subtlex_log10,length
0,Arthur placed,the,THE,Arthur placed the bars of chocolate on the cou...,2,96,577,1,4.770492,0.559567,...,-0.660875,1.005231,1.627545,71.142857,75.809524,86.571429,102.952381,296.166667,14.222247,3
1,Finally Maria sat down with a,cup,CUP,Finally Maria sat down with a cup of tea and a...,6,152,1093,1,3.803279,1.029881,...,-0.921127,1.257081,0.465343,129.52381,129.52381,131.52381,167.333333,248.941176,7.876259,3
2,He smiled again and felt like a,man,MAN,He smiled again and felt like a man and not ju...,7,183,1440,1,3.098361,1.179052,...,-0.112034,0.260007,0.870971,129.619048,133.333333,138.666667,182.47619,256.225352,11.452464,3


### Load estimates of surprisal and information value

In [3]:
# Constants

MODEL_NAMES = ['gpt2-small', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'] 

FORECAST_HORIZONS = list(range(1, 11))

LAYERS = {
    "gpt2-small": list(range(0, 13)),
    "gpt2-medium": list(range(0, 25, 2)),
    "gpt2-large": list(range(0, 37, 3)),
    "gpt2-xl": list(range(0, 49, 4))
}

SUMMARY_FNS = ['mean', 'max', 'min']

UNIT_SIZE = 1

In [4]:
# Surprisal

aligned_surprisal_path = "../data/estimates/aligned/surprisal"
aligned_surprisal_df = pd.DataFrame()

for model_name in MODEL_NAMES:
    tmp_df = pd.read_csv(f'{aligned_surprisal_path}/{model_name}_surprisal.csv')
    tmp_df['model'] = model_name
    aligned_surprisal_df = pd.concat([aligned_surprisal_df, tmp_df], axis=0)

# drop the entropy and deviation columns
aligned_surprisal_df = aligned_surprisal_df.drop(columns=['entropy', 'deviation'])

print(len(aligned_surprisal_df), "rows")
aligned_surprisal_df.head()

820 rows


Unnamed: 0,id,surprisal,tokens,model
0,1,[10.79873657 10.94567871 3.94667959 3.007099...,['Anne' 'lost' 'control' 'and' 'laughed'],gpt2-small
1,2,[10.19297791 8.59801865 4.61171198 2.989566...,['Billy' 'wrote' 'on' 'the' 'envelope'],gpt2-small
2,3,[5.94910049 5.62654495 7.64922857 3.51602721 2...,['He' 'called' 'over' 'his' 'shoulder'],gpt2-small
3,4,[5.94910049 7.5254631 7.93282318 1.08017111 3...,['He' 'stayed' 'against' 'the' 'wall'],gpt2-small
4,5,[10.8987447 10.02528667 3.68532658 1.169715...,['Helen' 'ran' 'to' 'the' 'toilet'],gpt2-small


In [5]:
# Incremental information value

aligned_iv_path = "../data/estimates/aligned/iv_k50"
aligned_iv_df = pd.DataFrame()

for model_name in MODEL_NAMES:
    tmp_df = pd.read_csv(f'{aligned_iv_path}/{model_name}_iv_n{UNIT_SIZE}.csv')
    tmp_df['model'] = model_name
    tmp_df['n'] = UNIT_SIZE
    aligned_iv_df = pd.concat([aligned_iv_df, tmp_df], axis=0)

print(f"{len(aligned_iv_df)} rows.")
aligned_iv_df.head()

319800 rows.


Unnamed: 0,id,horizon,layer,summary,score,tokens,model,n
0,1,1,0,mean,"[3.706263303756714, 3.7051799297332764, 2.1487...","['lost', 'control', 'and', 'laughed']",gpt2-small,1
1,2,1,0,mean,"[3.742562770843506, 2.409949779510498, 2.53718...","['wrote', 'on', 'the', 'envelope']",gpt2-small,1
2,3,1,0,mean,"[3.4552929401397705, 2.9963159561157227, 2.635...","['called', 'over', 'his', 'shoulder']",gpt2-small,1
3,4,1,0,mean,"[3.823862314224243, 3.4062201976776123, 1.1650...","['stayed', 'against', 'the', 'wall']",gpt2-small,1
4,5,1,0,mean,"[3.8114445209503174, 2.3981292247772217, 1.736...","['ran', 'to', 'the', 'toilet']",gpt2-small,1


In [6]:
# Preprocess score column

def get_list(s):
    """
    Transform "score" field (a string) into a list of floats.
    """
    try:
        return literal_eval(s)
    except SyntaxError:
        return list(map(float, s[1:-1].split()))
    except ValueError:
        return list(map(float, s[1:-1].split()))
    
# transform "scores" in surprisal and incremental information value dataframes into lists of floats
aligned_iv_df['score'] = aligned_iv_df['score'].apply(get_list)
aligned_surprisal_df['surprisal'] = aligned_surprisal_df['surprisal'].apply(get_list)


### Annotate data points in Aligned with surprisal and incremental information value estimates

In [7]:
estimates = defaultdict(list)

# Surprisal
for _, row in aligned_df.iterrows():
    sent_id = row['sent_id']
    word_position = row['context_length']  
    
    for model_name in MODEL_NAMES:
        df_tmp = aligned_surprisal_df[
            (aligned_surprisal_df['id'] == sent_id) & 
            (aligned_surprisal_df['model'] == model_name)
        ]
        estimates[F'{model_name}_surprisal'].append(
            df_tmp['surprisal'].values[0][word_position]
        )

# Incremental information value
for _, row in tqdm(aligned_df.iterrows(), total=len(aligned_df)):
    sent_id = row['sent_id']
    word_position = row['context_length'] - 1 # because we are using the estimates computed without the bos token 
    
    tmp_df_grouped = aligned_iv_df[aligned_iv_df["id"] == sent_id].groupby(
        ['model', 'n', 'horizon', 'layer', 'summary']
    )
    for group, df_group in tmp_df_grouped:
        model_name, unit_size, horizon, layer, summary_fn = group
        estimates[f'{model_name}_iv_{unit_size}_H{horizon}_L{layer}_S{summary_fn}'].append(
            df_group['score'].values[0][word_position]
        )
    
# Add estimates to the main dataframe
for k, v in estimates.items():
    aligned_df[k] = v


  0%|          | 0/1726 [00:00<?, ?it/s]

  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v
  aligned_df[k] = v


### Preprocessing following De Varda et al. 2023

In [8]:
# Punctuation
# From Frank: "Words attached to a comma, clitics, sentence-initial, and sentence-final words 
#              were discarded from further analysis [...]."
contains_punct = []
def contains_punct(word):
    if "." in word:
        return 1
    elif "," in word:
        return 1
    elif "'" in word:
        return 1
    else:
        return 0

aligned_df["contains_punct"] = aligned_df["word"].apply(contains_punct)
aligned_df_clean = aligned_df[aligned_df.contains_punct == 0]


  aligned_df["contains_punct"] = aligned_df["word"].apply(contains_punct)


### Create a normalised version of the dataframe

In [9]:
BASELINE_PREDICTORS = ['Subtlex_log10', 'length', 'zone']
SURPRISAL_PREDICTORS = [col for col in aligned_df_clean if '_surprisal' in col]
IV_PREDICTORS = [col for col in aligned_df_clean if '_iv_' in col]

scaler = MinMaxScaler()

# this step can take a while...
aligned_df_clean_norm = aligned_df_clean.copy()
aligned_df_clean_norm[IV_PREDICTORS + SURPRISAL_PREDICTORS] = scaler.fit_transform(
    aligned_df_clean_norm[IV_PREDICTORS + SURPRISAL_PREDICTORS]
)


### Save dataframes

In [10]:
aligned_df_clean.to_csv(
    'preprocessed_corpora/aligned_preprocessed.csv', 
    index=False
)

aligned_df_clean_norm.to_csv(
    'preprocessed_corpora/aligned_preprocessed_normalised.csv', 
    index=False
)