In [1]:
from ast import literal_eval
from collections import defaultdict
from pathlib import Path
from string import punctuation

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm


### Load Natural Stories reading times

In [2]:
# Tab-separated file of per-participant, per-word reading times.
ns_rt_path = '../data/corpora/naturalstories/processed_RTs.tsv'
ns_df = pd.read_csv(ns_rt_path, sep='\t')
ns_df.head()

Unnamed: 0,WorkerId,WorkTimeInSeconds,correct,item,zone,RT,word,nItem,meanItemRT,sdItemRT,gmeanItemRT,gsdItemRT
0,A3QJPB0NZU5PY1,3960,6,1,1,924,If,84,369.011905,160.579935,340.566023,1.490513
1,A2RPQGUWVZPX7U,2431,5,1,1,474,If,84,369.011905,160.579935,340.566023,1.490513
2,A11KMPAZSE5Q0Q,1287,5,1,1,272,If,84,369.011905,160.579935,340.566023,1.490513
3,A1U1QL617G5DU3,2074,6,1,1,354,If,84,369.011905,160.579935,340.566023,1.490513
4,ACTW5YEWV9OR0,2213,6,1,1,577,If,84,369.011905,160.579935,340.566023,1.490513


### Load estimates of surprisal and incremental information value

In [3]:
# Constants

# Language models for which estimates are loaded.
MODEL_NAMES = ['gpt2-small', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']

# Forecast horizons 1..10 (inclusive).
FORECAST_HORIZONS = [horizon for horizon in range(1, 11)]

# Layer indices per model: from 0 up to `last_layer` in steps of `stride`,
# which yields 13 indices for every model.
LAYERS = {
    name: list(range(0, last_layer + 1, stride))
    for name, last_layer, stride in [
        ("gpt2-small", 12, 1),
        ("gpt2-medium", 24, 2),
        ("gpt2-large", 36, 3),
        ("gpt2-xl", 48, 4),
    ]
}

# Names of the summary functions appearing in the estimate files.
SUMMARY_FNS = ['mean', 'max', 'min']

# Unit size `n` used in the information value file names.
UNIT_SIZE = 1

In [4]:
# Surprisal

ns_surprisal_path = "../data/estimates/naturalstories/surprisal"

# Read one CSV per model, tag its rows with the model name, and concatenate
# all frames in a single call. Collecting the frames first avoids re-copying
# the accumulated rows on every iteration and avoids concatenating onto an
# empty placeholder frame. The per-file row index is kept (no ignore_index).
surprisal_frames = [
    pd.read_csv(
        f'{ns_surprisal_path}/{model_name}_surprisal_400words.csv'
    ).assign(model=model_name)
    for model_name in MODEL_NAMES
]
ns_surprisal_df = pd.concat(surprisal_frames, axis=0)

# drop the entropy and deviation columns
ns_surprisal_df = ns_surprisal_df.drop(columns=['entropy', 'deviation'])

print(len(ns_surprisal_df), "rows")
ns_surprisal_df.head()

40 rows


Unnamed: 0,id,surprisal,tokens,model
0,1,[5.37758064e+00 5.60565889e-01 3.75532675e+00 ...,['If' 'you' 'were' 'to' 'journey' 'to' 'the' '...,gpt2-small
1,2,[3.94234681e+00 8.24601555e+00 2.85127807e+00 ...,['A' 'clear' 'and' 'joyous' 'day' 'it' 'was' '...,gpt2-small
2,3,[4.69777489e+00 2.15857363e+00 7.92688084e+00 ...,['It' 'was' 'cold' 'and' 'nearly' 'dark' 'on' ...,gpt2-small
3,4,[7.58915520e+00 2.35619020e+00 1.54661678e-03 ...,['Once' 'upon' 'a' 'time' 'the' 'birds' 'took'...,gpt2-small
4,5,[6.06746483e+00 8.77776718e+00 1.15439475e+00 ...,"['At' 'ten' 'years' 'old,' 'I' 'could' 'not' '...",gpt2-small


In [5]:
# Incremental information value

ns_iv_path = "../data/estimates/naturalstories/iv_k50"

# Read one CSV per model, tag its rows with the model name and the unit size,
# and concatenate all frames in a single call rather than growing a frame
# inside the loop (which re-copies the accumulated rows every iteration).
# The per-file row index is kept (no ignore_index).
iv_frames = [
    pd.read_csv(
        f'{ns_iv_path}/{model_name}_iv_n{UNIT_SIZE}_400words.csv'
    ).assign(model=model_name, n=UNIT_SIZE)
    for model_name in MODEL_NAMES
]
ns_iv_df = pd.concat(iv_frames, axis=0)

print(f"{len(ns_iv_df)} rows.")
ns_iv_df.head()

15600 rows.


Unnamed: 0,id,horizon,layer,summary,score,tokens,model,n
0,1,1,0,mean,"[2.900254487991333, 0.9754052758216858, 2.7851...","['If', 'you', 'were', 'to', 'journey', 'to', '...",gpt2-small,1
1,2,1,0,mean,"[2.7230708599090576, 3.9138975143432617, 2.982...","['A', 'clear', 'and', 'joyous', 'day', 'it', '...",gpt2-small,1
2,3,1,0,mean,"[2.6567976474761963, 2.16483736038208, 3.67803...","['It', 'was', 'cold', 'and', 'nearly', 'dark',...",gpt2-small,1
3,4,1,0,mean,"[3.459038257598877, 3.1826250553131104, 0.0039...","['Once', 'upon', 'a', 'time', 'the', 'birds', ...",gpt2-small,1
4,5,1,0,mean,"[2.9816982746124268, 3.4702093601226807, 2.442...","['At', 'ten', 'years', 'old,', 'I', 'could', '...",gpt2-small,1


In [6]:
# Preprocess score column

def get_list(s):
    """
    Transform a serialised "score" field into a list of numbers.

    Two textual layouts are supported:
      * Python-list style, e.g. "[2.9, 0.97, 2.78]", parsed with
        `literal_eval`;
      * whitespace-separated style, e.g. "[5.37e+00 5.60e-01]" (no commas),
        which `literal_eval` rejects and which is parsed token by token.

    Values that are not strings (i.e. already parsed) are returned as a
    list, so the cell that applies this function can be re-run safely.

    Parameters
    ----------
    s : str or sequence
        The serialised scores, including the enclosing brackets, or an
        already-parsed sequence.

    Returns
    -------
    list
        The parsed values.

    Raises
    ------
    ValueError
        If a token in the fallback path cannot be converted to float.
    """
    if not isinstance(s, str):
        # Already parsed on a previous run of the cell.
        return list(s)
    try:
        return literal_eval(s)
    except (SyntaxError, ValueError):
        # SyntaxError: space-separated values such as "[1.0 2.0]".
        # ValueError: nodes literal_eval refuses, e.g. "[5.0 -2.0]" (parsed as
        # a subtraction) or names such as nan/inf.
        # Commas are treated as separators so that "[1.0, nan]" also parses.
        inner = s[1:-1].replace(',', ' ')
        return [float(token) for token in inner.split()]
    
# Parse the serialised score strings of both estimate tables into lists:
# the "score" column for information value, the "surprisal" column for surprisal.
for estimates_frame, score_column in (
    (ns_iv_df, 'score'),
    (ns_surprisal_df, 'surprisal'),
):
    estimates_frame[score_column] = estimates_frame[score_column].apply(get_list)


### Annotate data points in Natural Stories with surprisal and information value estimates

In [7]:
# Skip items with context lengths larger than 400
# (no estimates obtained for these; context too large for LM's context window)
ns_df = ns_df[ns_df['zone'] <= 400]


# column name -> {(item, zone): estimate}
estimates = defaultdict(lambda: defaultdict(float))

# The score sequences depend only on the item (and model / settings), not on
# the zone, so they are looked up once here instead of re-filtering and
# re-grouping the estimate tables for every (item, zone) pair.

# (model, item) -> surprisal sequence; the first matching row wins.
surprisal_by_model_item = {}
for model, story_id, scores in zip(
    ns_surprisal_df['model'], ns_surprisal_df['id'], ns_surprisal_df['surprisal']
):
    surprisal_by_model_item.setdefault((model, story_id), scores)

# item -> {column name: information value sequence}. Groups are visited in
# sorted key order, so within an item the columns keep the order
# (model, n, horizon, layer, summary); the first row of each group is used.
iv_scores_by_item = defaultdict(dict)
iv_groups = ns_iv_df.groupby(['id', 'model', 'n', 'horizon', 'layer', 'summary'])
for (story_id, model, n, horizon, layer, summary), rows in iv_groups:
    col = f'{model}_iv_{n}_H{horizon}_L{layer}_S{summary}'
    iv_scores_by_item[story_id][col] = rows['score'].values[0]

print('Extract estimates...')
df_grouped = ns_df.groupby(['item', 'zone'])
for (item, zone), _rows in tqdm(df_grouped, total=len(df_grouped)):
    # zone numbering starts at 1; the score sequences are 0-indexed
    position = zone - 1

    for model_name in MODEL_NAMES:
        scores = surprisal_by_model_item[(model_name, item)]
        estimates[f'{model_name}_surprisal'][(item, zone)] = scores[position]

    for col, scores in iv_scores_by_item.get(item, {}).items():
        estimates[col][(item, zone)] = scores[position]

print('Re-arrange estimates for dataframe...')
# One value per row of ns_df, in row order, for every estimate column.
items_zones = list(zip(ns_df['item'].values, ns_df['zone'].values))
estimates_rearranged = {}
for k, v in tqdm(estimates.items(), total=len(estimates)):
    estimates_rearranged[k] = [v[(item, zone)] for item, zone in items_zones]

print('Add estimates to main dataframe...')
ns_df = pd.concat(
    [
        ns_df,
        pd.DataFrame(estimates_rearranged, index=ns_df.index)
    ], axis=1
)


Extract estimates...


  0%|          | 0/4000 [00:00<?, ?it/s]

Re-arrange estimates for dataframe...


  0%|          | 0/1564 [00:00<?, ?it/s]

Add estimates to main dataframe...


### Add control predictors

In [8]:
# Word length (number of characters)
ns_df['length'] = ns_df['word'].apply(len)

# Re-arrange columns: put 'length' at position 5 (directly after 'zone' in the
# output below). The column is placed by name rather than by taking "the last
# column", so re-running this cell does not move an unrelated column.
cols = [col for col in ns_df.columns if col != 'length']
cols.insert(5, 'length')
ns_df = ns_df[cols]
ns_df.head()

Unnamed: 0,WorkerId,WorkTimeInSeconds,correct,item,zone,length,RT,word,nItem,meanItemRT,...,gpt2-xl_iv_1_H10_L36_Smin,gpt2-xl_iv_1_H10_L40_Smax,gpt2-xl_iv_1_H10_L40_Smean,gpt2-xl_iv_1_H10_L40_Smin,gpt2-xl_iv_1_H10_L44_Smax,gpt2-xl_iv_1_H10_L44_Smean,gpt2-xl_iv_1_H10_L44_Smin,gpt2-xl_iv_1_H10_L48_Smax,gpt2-xl_iv_1_H10_L48_Smean,gpt2-xl_iv_1_H10_L48_Smin
0,A3QJPB0NZU5PY1,3960,6,1,1,2,924,If,84,369.011905,...,94.727402,560.528442,218.050339,113.002907,836.843384,285.529846,130.084595,29.212479,19.739662,10.837491
1,A2RPQGUWVZPX7U,2431,5,1,1,2,474,If,84,369.011905,...,94.727402,560.528442,218.050339,113.002907,836.843384,285.529846,130.084595,29.212479,19.739662,10.837491
2,A11KMPAZSE5Q0Q,1287,5,1,1,2,272,If,84,369.011905,...,94.727402,560.528442,218.050339,113.002907,836.843384,285.529846,130.084595,29.212479,19.739662,10.837491
3,A1U1QL617G5DU3,2074,6,1,1,2,354,If,84,369.011905,...,94.727402,560.528442,218.050339,113.002907,836.843384,285.529846,130.084595,29.212479,19.739662,10.837491
4,ACTW5YEWV9OR0,2213,6,1,1,2,577,If,84,369.011905,...,94.727402,560.528442,218.050339,113.002907,836.843384,285.529846,130.084595,29.212479,19.739662,10.837491


In [9]:
# Subtlex log frequency
subtlex_df = pd.read_excel('../data/SUBTLEXusExcel2007.xlsx')
# SUBTLEX word -> log10 word frequency (a later duplicate entry overrides an earlier one)
subtlex = dict(zip(subtlex_df['Word'], subtlex_df['Lg10WF']))

# Natural Stories vocabulary
vocab = ns_df['word'].unique()


def _subtlex_candidates(word):
    """
    Yield the spellings of `word` to try against SUBTLEX, in priority order:
    the word as is, lower-cased, capitalised, and then the same three
    variants after stripping leading/trailing punctuation.
    """
    yield word
    yield word.lower()
    yield word.capitalize()
    bare = word.strip(punctuation)
    yield bare
    yield bare.lower()
    yield bare.capitalize()


# Look every vocabulary word up once: word -> log10 frequency (NaN if absent)
log_freq_by_word = {}
for word in vocab:
    for candidate in _subtlex_candidates(word):
        if candidate in subtlex:
            freq = subtlex[candidate]
            break
    else:
        print(f"Word {word} not found in SUBTLEX")
        freq = np.nan
    log_freq_by_word[word] = freq

# Add log frequency to the dataframe in a single pass over the 'word' column
ns_df['Subtlex_log10'] = ns_df['word'].map(log_freq_by_word)

# Re-arrange columns: put 'Subtlex_log10' at position 6. The column is placed
# by name rather than by taking "the last column", so re-running this cell
# does not move an unrelated column.
cols = [col for col in ns_df.columns if col != 'Subtlex_log10']
cols.insert(6, 'Subtlex_log10')
ns_df = ns_df[cols]

# Drop rows for which Subtlex log frequency is not available
ns_df_clean = ns_df.copy()
ns_df_clean = ns_df_clean.dropna(subset=['Subtlex_log10'])


Word Bradford, not found in SUBTLEX
Word clattered not found in SUBTLEX
Word long-bearded not found in SUBTLEX
Word boar's not found in SUBTLEX
Word Bradford. not found in SUBTLEX
Word Bradford not found in SUBTLEX
Word beast's not found in SUBTLEX
Word mother's not found in SUBTLEX
Word marshmallow-like not found in SUBTLEX
Word Aqua's not found in SUBTLEX
Word gelid not found in SUBTLEX
Word It's not found in SUBTLEX
Word Year's, not found in SUBTLEX
Word o'clock not found in SUBTLEX
Word Correthers, not found in SUBTLEX
Word Elvis's not found in SUBTLEX
Word x-ray not found in SUBTLEX
Word Abby's not found in SUBTLEX
Word 'He's not found in SUBTLEX
Word he's not found in SUBTLEX
Word didn't not found in SUBTLEX
Word wouldn't not found in SUBTLEX
Word 'Where's not found in SUBTLEX
Word Josephs not found in SUBTLEX
Word dejectedly, not found in SUBTLEX
Word 'I'm not found in SUBTLEX
Word 'Haven't not found in SUBTLEX
Word Lucy's not found in SUBTLEX
Word mom's not found in SUBTLEX
Wor

### Further preprocessing following De Varda et al. 2023

In [11]:
# Punctuation
# From Frank: "Words attached to a comma, clitics, sentence-initial, and sentence-final words 
#              were discarded from further analysis [...]."
def contains_punct(word):
    """
    Flag words containing a period, a comma or an apostrophe.

    Returns 1 if any of the three characters occurs anywhere in `word`,
    and 0 otherwise. Other punctuation marks (e.g. "?", "!", "-") are not
    checked.
    """
    flagged_marks = (".", ",", "'")
    return int(any(mark in word for mark in flagged_marks))

# Flag every word (1 = contains ".", "," or "'"), store the flag as a column,
# and keep only the unflagged rows. The flag column stays in the dataframe.
punct_flags = ns_df_clean["word"].apply(contains_punct)
ns_df_clean["contains_punct"] = punct_flags
ns_df_clean = ns_df_clean[ns_df_clean["contains_punct"] == 0]


### Create a normalised version of the dataframe

In [12]:
# Predictor groups, identified by column name.
BASELINE_PREDICTORS = ['Subtlex_log10', 'length', 'zone']
SURPRISAL_PREDICTORS = [name for name in ns_df_clean.columns if '_surprisal' in name]
IV_PREDICTORS = [name for name in ns_df_clean.columns if '_iv_' in name]

# Only the information value and surprisal columns are rescaled;
# the baseline predictors are left untouched.
scaled_columns = IV_PREDICTORS + SURPRISAL_PREDICTORS

ns_df_clean_norm = ns_df_clean.copy()

# MinMaxScaler with default settings, fitted on the cleaned data itself.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(ns_df_clean_norm[scaled_columns])
ns_df_clean_norm[scaled_columns] = scaled_values


### Save dataframes

In [13]:
# this can take a while... (runtime: 19m 56s)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
OUTPUT_DIR = Path('preprocessed_corpora')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Cleaned dataframe with the estimates on their original scale
ns_df_clean.to_csv(
    OUTPUT_DIR / 'naturalstories_preprocessed.csv',
    index=False
)

# Same dataframe with the surprisal / information value columns min-max scaled
ns_df_clean_norm.to_csv(
    OUTPUT_DIR / 'naturalstories_preprocessed_normalised.csv',
    index=False
)