In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import trange, tqdm
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (6,4)

In [None]:
# Load the per-corpus result tables and discard every row that has a missing
# value in any column.
ptb, pb, bncs = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        '../results_conll/mi/ptb_gpt2-ft.csv',
        '../results_conll/mi/pb_gpt2-ft.csv',
        '../results_conll/mi/bnc_spoken_gpt2-ft.csv',
    )
)


In [None]:
# Give the corpora shared column names so the metric functions below can be
# called the same way on each of them.
# 'position' is only aliased for ptb and pb; bncs is left untouched here.
for frame, position_column in ((ptb, 'position_in_doc'), (pb, 'position_in_dialogue')):
    frame['position'] = frame[position_column]

# 'path' identifies the document / dialogue a row belongs to.
for frame, id_column in ((ptb, 'doc_id'), (pb, 'dialogue_id'), (bncs, 'dialogue_id')):
    frame['path'] = frame[id_column]

# Global centrality

## Using average estimates per position

In [None]:
def global_centrality(df, estimate='normalised_h_doc', ctx_name='path', shuffle=False):
    """Negated within-context variance of an estimate column.

    For every distinct value of ``df[ctx_name]`` the population variance
    (mean squared deviation from the context mean) of the ``estimate`` values
    is computed and negated, so scores closer to zero mean the values sit
    closer to their context mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns ``ctx_name`` and ``estimate``.
    estimate : str
        Name of the numeric column to score.
    ctx_name : str
        Name of the column identifying the context (document / dialogue).
        Rows whose context id is missing are skipped (pandas groupby default).
    shuffle : bool
        If True, permute each context's values with the global ``np.random``
        state before scoring. The variance does not depend on order (up to
        floating-point rounding); the permutation is kept so this function
        consumes random state the same way as the order-sensitive metrics.

    Returns
    -------
    numpy.ndarray
        One score per context, in order of first appearance in ``df``.
    """
    centralities = []
    # One groupby pass instead of re-masking the whole frame for every id;
    # sort=False yields contexts in first-appearance order, which (unlike
    # iterating a set of string ids) is identical across interpreter runs.
    for _, doc_rows in df.groupby(ctx_name, sort=False):
        doc_h_estimates = doc_rows[estimate].to_numpy()
        if shuffle:
            doc_h_estimates = np.random.permutation(doc_h_estimates)
        deviations = doc_h_estimates - doc_h_estimates.mean()
        centralities.append(np.mean(deviations ** 2))
    return -np.array(centralities)

def normalised_global_centrality(df, estimate='normalised_h_doc', ctx_name='path', shuffle=False):
    """Negated within-context variance of mean-normalised estimates.

    Each context's ``estimate`` values are divided by the context mean, and the
    mean squared deviation of those ratios from 1 is negated. A context whose
    mean is zero produces inf/nan, as dictated by NumPy division.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns ``ctx_name`` and ``estimate``.
    estimate : str
        Name of the numeric column to score.
    ctx_name : str
        Name of the column identifying the context (document / dialogue).
        Rows whose context id is missing are skipped (pandas groupby default).
    shuffle : bool
        If True, permute each context's values with the global ``np.random``
        state before scoring (the score itself is order-independent up to
        floating-point rounding).

    Returns
    -------
    numpy.ndarray
        One score per context, in order of first appearance in ``df``.
    """
    centralities = []
    # Single groupby pass (linear in the number of rows) with a deterministic,
    # first-appearance ordering of the contexts.
    for _, doc_rows in df.groupby(ctx_name, sort=False):
        doc_h_estimates = doc_rows[estimate].to_numpy()
        if shuffle:
            doc_h_estimates = np.random.permutation(doc_h_estimates)
        relative_deviation = doc_h_estimates / np.mean(doc_h_estimates) - 1
        centralities.append(np.mean(relative_deviation ** 2))
    return -np.array(centralities)



# Local predictability

In [None]:
def local_predictability(df, estimate='normalised_h_doc', ctx_name='path', shuffle=False):
    """Negated mean squared change between consecutive estimates per context.

    For every context, the squared differences between each value and the one
    before it (in row order) are summed and divided by the number of values in
    the context (not by the number of differences); the result is negated.
    A context with a single value scores 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns ``ctx_name`` and ``estimate``.
        Rows of a context are taken in the order they appear in ``df``.
    estimate : str
        Name of the numeric column to score.
    ctx_name : str
        Name of the column identifying the context (document / dialogue).
        Rows whose context id is missing are skipped (pandas groupby default).
    shuffle : bool
        If True, permute each context's values with the global ``np.random``
        state before differencing; seed ``np.random`` beforehand for a
        repeatable result.

    Returns
    -------
    numpy.ndarray
        One score per context, in order of first appearance in ``df``.
    """
    scores = []
    # Single groupby pass instead of one boolean mask per context; sort=False
    # gives a first-appearance context order that does not depend on string
    # hashing, so a seeded shuffle hits the same contexts in the same order.
    for _, doc_rows in df.groupby(ctx_name, sort=False):
        doc_h_estimates = doc_rows[estimate].to_numpy()
        if shuffle:
            doc_h_estimates = np.random.permutation(doc_h_estimates)
        # Vectorised x[i] - x[i-1] for i = 1..len-1 (empty for one value).
        steps = doc_h_estimates[1:] - doc_h_estimates[:-1]
        scores.append(np.sum(steps ** 2) / len(doc_h_estimates))
    return -np.array(scores)


def normalised_local_predictability(df, estimate='normalised_h_doc', ctx_name='path', shuffle=False):
    """Local predictability scaled by the squared context mean.

    For every context, the sum of squared differences between consecutive
    values is divided by the squared mean of the context and by the number of
    values in the context; the result is negated. A context whose mean is zero
    produces inf/nan, as dictated by NumPy division.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns ``ctx_name`` and ``estimate``.
        Rows of a context are taken in the order they appear in ``df``.
    estimate : str
        Name of the numeric column to score.
    ctx_name : str
        Name of the column identifying the context (document / dialogue).
        Rows whose context id is missing are skipped (pandas groupby default).
    shuffle : bool
        If True, permute each context's values with the global ``np.random``
        state before differencing.

    Returns
    -------
    numpy.ndarray
        One score per context, in order of first appearance in ``df``.
    """
    scores = []
    # Single groupby pass with deterministic first-appearance context order
    # (replaces per-context boolean masking over an unordered set of ids).
    for _, doc_rows in df.groupby(ctx_name, sort=False):
        doc_h_estimates = doc_rows[estimate].to_numpy()
        if shuffle:
            doc_h_estimates = np.random.permutation(doc_h_estimates)
        doc_h_mean = np.mean(doc_h_estimates)
        # Vectorised x[i] - x[i-1] for i = 1..len-1 (empty for one value).
        steps = doc_h_estimates[1:] - doc_h_estimates[:-1]
        normalised_sum = np.sum(steps ** 2) / (doc_h_mean ** 2)
        scores.append(normalised_sum / len(doc_h_estimates))
    return -np.array(scores)


def locally_normalised_local_predictability(df, estimate='normalised_h_doc', ctx_name='path', shuffle=False):
    """Local predictability based on ratios of consecutive estimates.

    For every context, each value is divided by the value before it (in row
    order); the squared deviations of those ratios from 1 are summed, divided
    by the number of values in the context, and negated. A zero predecessor
    produces inf/nan, as dictated by NumPy division.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns ``ctx_name`` and ``estimate``.
        Rows of a context are taken in the order they appear in ``df``.
    estimate : str
        Name of the numeric column to score.
    ctx_name : str
        Name of the column identifying the context (document / dialogue).
        Rows whose context id is missing are skipped (pandas groupby default).
    shuffle : bool
        If True, permute each context's values with the global ``np.random``
        state before taking ratios.

    Returns
    -------
    numpy.ndarray
        One score per context, in order of first appearance in ``df``.
    """
    scores = []
    # Single groupby pass with deterministic first-appearance context order
    # (replaces per-context boolean masking over an unordered set of ids).
    for _, doc_rows in df.groupby(ctx_name, sort=False):
        doc_h_estimates = doc_rows[estimate].to_numpy()
        if shuffle:
            doc_h_estimates = np.random.permutation(doc_h_estimates)
        # Vectorised x[i] / x[i-1] - 1 for i = 1..len-1 (empty for one value).
        relative_steps = doc_h_estimates[1:] / doc_h_estimates[:-1] - 1
        scores.append(np.sum(relative_steps ** 2) / len(doc_h_estimates))
    return -np.array(scores)


In [None]:
# Display names of the uniformity metrics; these match the labels written into
# the 'Uniformity metric' column further down.
metrics = [
    'Local predictability',
    'Global centrality',
]

----

# All corpora

In [None]:
# Collect one (corpus, metric, score) record per document / dialogue, using the
# utterances in their true order.
# Only the two un-normalised metrics are scored; the normalised variants
# defined above are not included in this run.
uid_list = []

for corpus, corpus_name in [(ptb, 'Penn Treebank'), (pb, 'PhotoBook'), (bncs, 'Spoken BNC')]:
    # Keep only rows with a positive 'length'.
    corpus = corpus[corpus['length'] > 0]

    uid_list.extend(
        (corpus_name, 'Global centrality', score)
        for score in global_centrality(corpus, 'normalised_h_doc')
    )
    uid_list.extend(
        (corpus_name, 'Local predictability', score)
        for score in local_predictability(corpus, 'normalised_h_doc')
    )


In [None]:
# Long-format table of the true-order scores: one row per (corpus, metric, score).
df_flat = pd.DataFrame(
    data=uid_list,
    columns=['corpus', 'Uniformity metric', 'score'],
)

In [None]:
# Configure style and font scale BEFORE drawing: seaborn theme settings only
# apply to artists created afterwards, so setting the font scale after the
# barplot left this figure at the default size and reset the grid style for
# later cells.
sns.set(style="whitegrid", font_scale=1.5)

for metric in ['Global centrality']:
    ax = sns.barplot(x='corpus', y='score',
                     data=df_flat[df_flat['Uniformity metric'] == metric], color="teal")
    sns.despine(left=True)
    plt.xlabel('')
    # Raw string: '\l' is not a valid Python escape sequence, so the non-raw
    # literal triggers an invalid-escape warning on recent interpreters.
    plt.ylabel(r'$\leftarrow$ {}'.format(metric))
    # Scores are negated, hence non-positive; flipping the axis draws the bars upwards.
    ax.invert_yaxis()
    plt.tight_layout()
    # NOTE(review): absolute, machine-specific output path -- consider making it configurable.
    plt.savefig('/Users/mario/code/erp-paper/conll2021/figures/wlimit/glob-cent.pdf')

# All corpora shuffled

In [None]:
# Draw the 100 shuffle seeds from a dedicated, fixed-seed generator so that the
# shuffled control is identical after every kernel restart. Drawing them from
# the unseeded global RNG made each run use a different, unrecoverable set.
_seed_rng = np.random.RandomState(0)
seeds = [int(s) for s in _seed_rng.randint(100000, size=100)]

In [None]:
# For every seed, recompute both metrics with the rows of each document /
# dialogue randomly permuted, and keep one score table per seed.
# Only the two un-normalised metrics are scored; the normalised variants
# defined above are not included in this run.
random_dfs = []
for seed in tqdm(seeds):
    uid_list_random = []

    for corpus, corpus_name in [(ptb, 'Penn Treebank'), (pb, 'PhotoBook'), (bncs, 'Spoken BNC')]:
        # Keep only rows with a positive 'length'.
        corpus = corpus[corpus['length'] > 0]

        # The global RNG is reset to the same seed before each corpus.
        np.random.seed(seed)

        uid_list_random.extend(
            (corpus_name, 'Global centrality', score)
            for score in global_centrality(corpus, 'normalised_h_doc', shuffle=True)
        )
        uid_list_random.extend(
            (corpus_name, 'Local predictability', score)
            for score in local_predictability(corpus, 'normalised_h_doc', shuffle=True)
        )

    seed_scores = pd.DataFrame(
        uid_list_random, columns=['corpus', 'Uniformity metric', '_score']
    )
    random_dfs.append(seed_scores)

In [None]:
# Place the per-seed score columns side by side: one column per seed, all of
# them named '_score'.
per_seed_scores = [seed_df['_score'] for seed_df in random_dfs]
df_concat = pd.concat(per_seed_scores, axis=1)

In [None]:
# Copy the row labels from the first seed's table; this relies on every seed's
# table listing its rows in the same order.
first_seed_df = random_dfs[0]
for label_column in ('corpus', 'Uniformity metric'):
    df_concat[label_column] = first_seed_df[label_column].values

In [None]:
# Selecting '_score' returns every per-seed column at once (they all share that
# name), so the row-wise mean is the average control score over seeds.
mean_control_score = df_concat['_score'].mean(axis=1).values
df_concat['score'] = mean_control_score
# The per-seed columns are removed in place, so this cell cannot be re-run
# without first rebuilding df_concat.
df_concat.drop(columns=['_score'], inplace=True)

In [None]:
# Tag each table with its condition: scores from the true row order versus the
# seed-averaged shuffled control.
df_flat['Order'] = 'True'
df_concat['Order'] = "Control"

# Only the last expression of a cell is displayed, so preview the control table.
df_concat.head()

In [None]:
# Stack the true-order and control scores. Both inputs carry their own 0..n-1
# index, so build a fresh index: duplicated row labels are easy to trip over in
# later label-based selection or alignment.
final_df = pd.concat((df_flat, df_concat), ignore_index=True)

In [None]:
# Configure style and font scale BEFORE drawing: seaborn theme settings only
# apply to artists created afterwards, so setting the font scale after the
# barplot made this figure depend on an earlier cell having already set it.
sns.set(style="whitegrid", font_scale=1.5)

# sns.set() resets the palette, so the two hue colours (true order, control)
# are installed afterwards.
colors = ["teal", "lightsteelblue"]
sns.set_palette(sns.color_palette(colors))

ax = sns.barplot(x='corpus', y='score', hue='Order',
                 data=final_df[final_df['Uniformity metric'] == 'Local predictability'])
sns.despine(left=True)
plt.xlabel('')
# Raw string: '\l' is not a valid Python escape sequence, so the non-raw
# literal triggers an invalid-escape warning on recent interpreters.
plt.ylabel(r'$\leftarrow$ {}'.format('Local predictability'))
# Scores are negated, hence non-positive; flipping the axis draws the bars upwards.
ax.invert_yaxis()
plt.tight_layout()

# NOTE(review): absolute, machine-specific output path -- consider making it configurable.
plt.savefig('/Users/mario/code/erp-paper/conll2021/figures/wlimit/loc-pred.pdf')