In [None]:
import pandas as pd

import sys
sys.path.insert(0, '../..')
from src.utils.latex import show_latex, TABLES

# Load data

In [None]:
# Parsed INCEpTION annotations (week 14, part 1); the cell displays the number of rows.
annotated_pkl = '../../data/from_inception_tsv/annotated_df_week14_part1_parsed.pkl'
df = pd.read_pickle(annotated_pkl)
df.shape[0]

In [None]:
# Batch metadata for week 14; read below for its `samp_meth` and `NotitieID` columns.
batch_info_pkl = '../../data/to_inception_conll/week_14.pkl'
batch_info = pd.read_pickle(batch_info_pkl)

In [None]:
# IDs of the notes whose sampling method is 'kwd_iaa'.
is_iaa_sample = batch_info['samp_meth'] == 'kwd_iaa'
iaa = batch_info.loc[is_iaa_sample, 'NotitieID'].unique()

# Keep only the annotations that belong to those notes, then show how many rows remain.
df = df[df['NotitieID'].isin(iaa)]
df.shape[0]

In [None]:
# Columns whose name contains '_lvl'.
levels = [name for name in df.columns if '_lvl' in name]
# The first three characters of each of those column names.
domains = [lvl_name[:3] for lvl_name in levels]
# Remaining annotation columns summarised alongside the domains and levels.
other = [
    'disregard',
    'background',
    'target',
    'plus',
]

In [None]:
# Show every column and never truncate cell contents when displaying frames.
# Fully-qualified option names are required: pandas resolves abbreviated names by
# pattern matching, and 'max_columns' alone is ambiguous on pandas >= 1.4
# (it also matches 'styler.render.max_columns'), which raises an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Document-level

In [None]:
# Every (note, annotator) pair is one group; the three tables below share this grouping.
grouped = df.groupby(['NotitieID', 'annotator'])


def get_levels(g, level):
    """Per group of `g`, list the distinct values of column `level` that equal themselves (i.e. drop NaN)."""
    return g[level].apply(lambda values: [value for value in values.unique() if value == value])


# domain: does the group contain any truthy value in the domain column?
domain_flags = grouped[domains].any()
# stack -> column names become an index level; unstack(1) -> annotators become the columns
domain = domain_flags.stack().unstack(1)
domain = domain.loc[domain.any(axis=1)]

# levels: the distinct level values per group, one column per level
level_lists = pd.concat([get_levels(grouped, level) for level in levels], axis=1)
lvl_labels = level_lists.stack().unstack(1)
lvl_labels = lvl_labels.loc[lvl_labels.any(axis=1)]

# disregard, background, target, plus
other_flags = grouped[other].any()
othr = other_flags.stack().unstack(1)
othr = othr.loc[othr.any(axis=1)]

# overview: the three tables stacked on top of each other, sorted by index
overview = pd.concat([domain, lvl_labels, othr]).sort_index()
overview

### Save one table per note to LaTeX

In [None]:
# Pass each note's slice of the document-level overview to `show_latex`,
# together with a note-specific caption and label.
for noteID in iaa:
    caption = f'{noteID} - overview of annotations (document level)'
    label = f'{noteID}_overview'
    note_overview = overview.loc[noteID]
    show_latex(note_overview, caption, label, cell_format=str)

In [None]:
# Write each entry of TABLES to its own numbered .tex file under ./tables.
prefix = 'iaa_week_14'
for idx, table in enumerate(TABLES):
    tex_path = f'./tables/{prefix}_{idx}.tex'
    with open(tex_path, 'w', encoding='utf8') as tex_file:
        tex_file.write(table)

# Sentence-level

In [None]:
# Every (sentence, annotator) pair is one group; the three tables below share this grouping.
grouped = df.groupby(['sen_id', 'annotator'])


def get_levels(g, level):
    """Per group of `g`, list the distinct values of column `level` that equal themselves (i.e. drop NaN)."""
    return g[level].apply(lambda values: [value for value in values.unique() if value == value])


# domain: does the group contain any truthy value in the domain column?
domain_flags = grouped[domains].any()
# stack -> column names become an index level; unstack(1) -> annotators become the columns
domain = domain_flags.stack().unstack(1)
domain = domain.loc[domain.any(axis=1)]

# levels: the distinct level values per group, one column per level
level_lists = pd.concat([get_levels(grouped, level) for level in levels], axis=1)
lvl_labels = level_lists.stack().unstack(1)
lvl_labels = lvl_labels.loc[lvl_labels.any(axis=1)]

# disregard, background, target, plus
other_flags = grouped[other].any()
othr = other_flags.stack().unstack(1)
othr = othr.loc[othr.any(axis=1)]

# overview: the three tables stacked on top of each other, sorted by index
pd.concat([domain, lvl_labels, othr]).sort_index()

# Token-level

In [None]:
# Index columns and annotation columns used by the token-level comparison below.
idx = ['sen_id', 'tok', 'token']
cols = ['label', 'relation']
# Distinct values of the `annotator` column.
annotators = df['annotator'].unique()

def assign_annotator(df, annotator):
    """Select one annotator's rows and suffix the annotation columns with the annotator's name.

    Relies on the notebook-level lists `idx` (columns to use as the index) and
    `cols` (annotation columns to keep).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an `annotator` column plus the columns named in `idx` and `cols`.
    annotator : str
        Value of the `annotator` column to select.

    Returns
    -------
    pandas.DataFrame
        The selected rows, indexed by `idx`, holding only the `cols` columns,
        each renamed to `<column>_<annotator>`.
    """
    to_rename = {col: f'{col}_{annotator}' for col in cols}
    # `@annotator` passes the value to pandas as a variable instead of splicing it into
    # the query text, so names containing quotes cannot break the expression.
    return df.set_index(idx).query("annotator == @annotator")[cols].rename(columns=to_rename)

# One frame per annotator, placed side by side and aligned on the shared index.
dfs = [assign_annotator(df, ann) for ann in annotators]
overview = pd.concat(dfs, axis=1).reset_index()

# Everything after the first three columns; a cell counts as annotated
# when it equals itself (i.e. is not NaN) and is not the '_' placeholder.
annotation_cells = overview.iloc[:, 3:]
has_annotation = annotation_cells.applymap(lambda cell: cell == cell and cell != '_')
non_empty_rows = has_annotation.any(axis=1)

# Display every row of each sentence that has at least one annotated row.
non_empty_sen_ids = overview.loc[non_empty_rows, 'sen_id'].unique()
overview[overview['sen_id'].isin(non_empty_sen_ids)]

In [None]:
# Display the rows of one sentence as annotated by one annotator.
sen_id, annotator = '428039733_114', 'meskers'
df[(df['sen_id'] == sen_id) & (df['annotator'] == annotator)]