In [4]:
import ast
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk import ngrams
from tqdm import tqdm
from transformers import GPT2Tokenizer


In [5]:
# Load the chains file; the parsed JSON is iterated below as
# {dialogue id: collection of construction strings}.
with open('chains_all.json') as chains_file:
    chains = json.load(chains_file)
len(chains)


186

In [6]:
# Surprisal table; later cells read its 'Tokens' and 'Surprisal' columns row by row.
df = pd.read_csv('surprisal_SBNC_gpt2_50_1e-3_agg.csv')

# Distinct dialogue identifiers present in the table.
dialogue_ids = set(df['Dialogue ID'].tolist())
print('{} dialogues'.format(len(dialogue_ids)))


186 dialogues


In [7]:
def facilitating_effect(turn_surprisal_values, construction_indices, window=10):
    """Log2 ratio of context surprisal to construction surprisal in one turn.

    The "locus" is every token position inside the context that is not part
    of the construction span. The result is
    ``log2(mean(locus surprisal) / mean(construction surprisal))``, so it is
    positive when the construction is less surprising than its surroundings.

    Args:
        turn_surprisal_values: Sequence of per-token surprisal values for the
            turn (must support ``len`` and integer indexing).
        construction_indices: ``(start, end)`` token span of the construction,
            with ``end`` exclusive.
        window: Number of tokens on each side of the construction to use as
            context, clipped to the turn boundaries. Any falsy value
            (``None`` or ``0``) means the whole turn is the context.

    Returns:
        The log2 ratio as a float, or ``1.`` when the context contains no
        token outside the construction. If the construction span selects no
        token the result is NaN (mean of an empty list).
    """
    start_constr, end_constr = construction_indices
    n_tokens = len(turn_surprisal_values)

    if window:
        start_ctx = max(start_constr - window, 0)
        end_ctx = min(end_constr + window, n_tokens)
    else:
        start_ctx, end_ctx = 0, n_tokens

    # Membership in a range object is O(1), and the context bounds are already
    # clipped to [0, n_tokens], so the values can be picked by direct indexing
    # instead of re-scanning the whole turn for every candidate index.
    constr_span = range(start_constr, end_constr)
    locus_values = [
        turn_surprisal_values[i]
        for i in range(start_ctx, end_ctx)
        if i not in constr_span
    ]

    if not locus_values:
        return 1.

    # Clip the construction span to valid positions before indexing.
    constr_values = [
        turn_surprisal_values[i]
        for i in range(max(start_constr, 0), min(end_constr, n_tokens))
    ]

    surprisal_wo_constr = np.mean(locus_values)
    surprisal_constr = np.mean(constr_values)

    return np.log2(surprisal_wo_constr / surprisal_constr)


def std_surprisal(turn_surprisal_values, construction_indices, window=None):
    """Standardise the construction's mean surprisal against its context.

    Args:
        turn_surprisal_values: Sequence of per-token surprisal values for the turn.
        construction_indices: ``(start, end)`` token span, ``end`` exclusive.
        window: Tokens of context on each side of the construction, clipped to
            the turn boundaries; a falsy value uses the whole turn.

    Returns:
        ``(mean(construction) - mean(context)) / std(context)``, where the
        context slice includes the construction tokens themselves.
    """
    first, last_excl = construction_indices
    span = range(first, last_excl)

    inside = []
    for position, value in enumerate(turn_surprisal_values):
        if position in span:
            inside.append(value)
    construction_mean = np.mean(inside)

    n_tokens = len(turn_surprisal_values)
    if window:
        ctx_lo = max(first - window, 0)
        ctx_hi = min(last_excl + window, n_tokens)
    else:
        ctx_lo, ctx_hi = 0, n_tokens

    context = turn_surprisal_values[ctx_lo: ctx_hi]
    context_mean = np.mean(context)
    context_std = np.std(context)

    return (construction_mean - context_mean) / context_std


def surprisal(turn_surprisal_values, construction_indices):
    """Return the mean surprisal of the tokens inside the construction span.

    Args:
        turn_surprisal_values: Iterable of per-token surprisal values for the turn.
        construction_indices: ``(start, end)`` token span, ``end`` exclusive.
    """
    first, last_excl = construction_indices
    span = range(first, last_excl)

    selected = []
    for position, value in enumerate(turn_surprisal_values):
        if position in span:
            selected.append(value)

    return np.mean(selected)



In [8]:
# Counts keyed by sequence length in tokens (3-7); a later cell uses them as
# the number of sequences to sample for each length.
len_distr = {3: 53196, 4: 6301, 5: 791, 6: 182, 7: 34}
n_types = sum(len_distr.values())

# Share of the total held by each length, in the dict's key order.
len_p_distr = [len_distr[length] / n_types for length in len_distr]
len_p_distr


[0.8792145973819913,
 0.10414187491736084,
 0.013073515800608225,
 0.0030080655824408302,
 0.0005619463175988365]

In [9]:
# Load the pretrained 'gpt2' tokenizer; the next cell uses it to tokenize construction strings.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [10]:
# Collect every distinct tokenisation of every construction string. Each
# construction is tokenised twice, with and without a leading space, because
# the two forms can tokenise differently.
all_constructions = []

# Tuples of tokens already collected: gives O(1) duplicate detection instead
# of scanning the growing list of lists on every insertion.
seen_constructions = set()

for d_id in chains:
    for constr in chains[d_id]:
        # Same order as before: leading-space variant first, then the bare string.
        for text in (' ' + constr, constr):
            constr_tokens = tokenizer.convert_ids_to_tokens(tokenizer(text)['input_ids'])
            key = tuple(constr_tokens)
            if key not in seen_constructions:
                seen_constructions.add(key)
                all_constructions.append(constr_tokens)


In [None]:
# For every turn, enumerate all contiguous token sequences of length 3-7 that
# are NOT known constructions and record their facilitating effect and mean
# surprisal, grouped by sequence length.
seqs_s = defaultdict(list)
seqs_fe = defaultdict(list)

# Hashable copy of the known constructions so the per-n-gram exclusion test is
# a set lookup rather than a linear scan over a list of lists.
known_constructions = {tuple(tokens) for tokens in all_constructions}

for _, row in tqdm(df.iterrows(), total=len(df)):
    # The columns hold Python list literals serialised as strings;
    # literal_eval parses them without executing arbitrary code from the CSV.
    turn_tokens = ast.literal_eval(row['Tokens'])
    if not turn_tokens:
        print('skip')
        continue

    tok_surprisal = ast.literal_eval(row['Surprisal'])
    assert len(tok_surprisal) == len(turn_tokens)

    for n in range(3, 8):
        # Sliding window of width n; the range is empty when the turn is shorter than n.
        for start in range(len(turn_tokens) - n + 1):
            end = start + n

            if tuple(turn_tokens[start: end]) in known_constructions:
                continue

            # window=None: the whole turn serves as context.
            fe = facilitating_effect(tok_surprisal, (start, end), window=None)
            s = surprisal(tok_surprisal, (start, end))

            seqs_fe[n].append(fe)
            seqs_s[n].append(s)
            


In [None]:
# Draw, for each seed, a random sample of non-construction sequences whose
# per-length sizes are given by len_distr, and assemble it into a DataFrame.
#
# NOTE(review): samples_fe, samples_s and seq_df are rebound on every
# iteration and the per-seed export at the bottom is disabled, so after this
# cell only the sample for the LAST seed (95) is available to later cells.
for seed in [4, 6, 13, 17, 95]:
    np.random.seed(seed)
    samples_fe = {}
    samples_s = {}

    for n in range(3, 8):
        print(n)
        assert len(seqs_fe[n]) == len(seqs_s[n])
        print('Sample...')
        # Sampling without replacement raises ValueError if fewer than
        # len_distr[n] sequences of length n are available.
        indices = np.random.choice(len(seqs_fe[n]), len_distr[n], replace=False)
        print(len_distr[n], len(indices), len(seqs_fe[n]))

        # The indices are unique, so indexing in ascending order selects the
        # same items in the same order as filtering the full list would,
        # without scanning the index array once per candidate.
        ordered_indices = np.sort(indices)
        samples_fe[n] = [seqs_fe[n][i] for i in ordered_indices]
        samples_s[n] = [seqs_s[n][i] for i in ordered_indices]

        print(len(samples_fe[n]), len(samples_s[n]))
        print()

    # One row per sampled sequence: (length, facilitating effect, surprisal).
    seq_dataset = []
    for n in range(3, 8):
        assert len(samples_fe[n]) == len(samples_s[n])
        seq_dataset.extend(
            (n, fe, s) for fe, s in zip(samples_fe[n], samples_s[n])
        )

    seq_df = pd.DataFrame(seq_dataset, columns=['Length', 'FE', 'S'])

    # Per-seed export, currently disabled:
#     seq_df.to_csv('non-constructions_all_{}.csv'.format(seed))
    

In [None]:
# Per-length mean and (population, ddof=0) standard deviation of the sampled
# facilitating effect (FE) and surprisal (S) values.
for n in range(3, 8):
    fe_values = samples_fe[n]
    s_values = samples_s[n]

    fe_mean, fe_std = np.mean(fe_values), np.std(fe_values)
    s_mean, s_std = np.mean(s_values), np.std(s_values)

    print('-- {} --'.format(n))
    print('FE: {:.2f} ± {:.2f}'.format(fe_mean, fe_std))
    print(' S: {:.2f} ± {:.2f}'.format(s_mean, s_std))
    print()

In [None]:
# Mean, standard deviation and median of FE over all sampled lengths pooled together.
# Note: pandas Series.std() defaults to ddof=1 (sample std), whereas np.std defaults to ddof=0.
seq_df['FE'].mean(), seq_df['FE'].std(), seq_df['FE'].median()

In [None]:
# Mean, standard deviation and median of S over all sampled lengths pooled together.
# Note: pandas Series.std() defaults to ddof=1 (sample std), whereas np.std defaults to ddof=0.
seq_df['S'].mean(), seq_df['S'].std(), seq_df['S'].median()

In [None]:
# Write the sampled table to disk. seq_df is reassigned on each iteration of the
# seed loop, so this file holds the sample drawn with the last seed only.
seq_df.to_csv('non-constructions.csv')