In [1]:
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, '..')
from utils.latex import show_latex, TABLES
from utils.config import PATHS

# Load data

In [2]:
# Annotation batch to analyse; interpolated into the input and output file names below.
week = 'week_18'

In [3]:
# Locate the two parsed annotation pickles inside the INCEpTION export directory.
path = PATHS.getpath('data_from_inception_tsv')
week_file = path / f'annotated_df_{week}_parsed.pkl'
ze_file = path / 'annotated_df_ze_iaa_files_parsed.pkl'

# Stack the weekly annotations on top of the 'ze' IAA annotations (row-wise concat).
ze = pd.read_pickle(ze_file)
df = pd.concat([pd.read_pickle(week_file), ze])
len(df)

245525

In [4]:
# Batch metadata for the selected week (read below for `samp_meth` and `NotitieID`).
path = PATHS.getpath('data_to_inception_conll')
batch_file = path / f'{week}.pkl'
batch_info = pd.read_pickle(batch_file)

In [5]:
# IDs of the notes that were sampled with the 'kwd_iaa' method.
iaa_batch = batch_info.query("samp_meth == 'kwd_iaa'")
iaa = iaa_batch['NotitieID'].unique()

# Restrict the annotations to those notes only.
df = df.query("NotitieID.isin(@iaa)")
len(df)

8987

In [6]:
# Flag columns that are reported next to the domains.
other = ['disregard', 'background', 'target', 'plus']

# Every column whose name contains '_lvl' holds level labels ...
levels = [name for name in df.columns if '_lvl' in name]
# ... and the first three characters of such a name give the matching domain column.
domains = [name[:3] for name in levels]

In [7]:
# Show every column and the full cell contents when a frame is displayed.
# Option names are matched as patterns; the bare 'max_columns' also matches
# 'styler.render.max_columns' on newer pandas and raises OptionError, so the
# fully qualified 'display.*' keys are used.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Document-level

In [8]:
# All three parts of the overview share the same (note, annotator) grouping.
grouped = df.groupby(['NotitieID', 'annotator'])


def get_levels(g, level):
    """For each group in `g`, list the distinct non-NaN values of column `level`."""
    # `i == i` is False only for NaN, so missing values are dropped from the list
    return g[level].apply(lambda s: [i for i in s.unique() if i == i])


# domain: True when the annotator marked the domain anywhere in the note.
# stack/unstack moves the column names into the row index and the annotators into the columns.
domain = grouped[domains].any().stack().unstack(1)
domain = domain.loc[domain.any(axis=1)]  # keep rows marked by at least one annotator

# levels: the distinct level values each annotator assigned per note
per_level = [get_levels(grouped, level) for level in levels]
lvl_labels = pd.concat(per_level, axis=1).stack().unstack(1)
lvl_labels = lvl_labels.loc[lvl_labels.any(axis=1)]  # an empty list counts as "nothing"

# disregard, background, target, plus
othr = grouped[other].any().stack().unstack(1)
othr = othr.loc[othr.any(axis=1)]

# overview: all three parts below each other, ordered by note and then by row label
doc_overview = pd.concat([domain, lvl_labels, othr]).sort_index()
doc_overview

Unnamed: 0_level_0,annotator,avelli,edwin,katsburg,meskers,opsomer,ron,vervaart
NotitieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
413949805,ADM,True,True,True,True,False,False,True
413949805,ADM_lvl,[4.0],[4.0],[4.0],[4.0],[],[],[4.0]
413949805,ENR,True,False,True,True,False,True,True
413949805,ENR_lvl,[4.0],[],[4.0],[4.0],[],[1.0],[4.0]
413949805,disregard,False,False,False,False,True,False,False
433007000,ADM,True,True,True,True,True,False,True
433007000,ADM_lvl,"[4.0, 3.0]",[4.0],[4.0],[4.0],[4.0],[],[4.0]
433007000,ENR,False,False,False,True,False,False,False
433007000,ENR_lvl,[],[],[],[3.0],[],[],[]
433007000,ETN,True,False,True,False,False,True,True


### Save table per note to LaTeX

In [9]:
# Render one LaTeX table per IAA note via `show_latex`.
# `doc_overview` only holds rows with at least one annotation, so a note from `iaa`
# without any annotation is missing from its index and `.loc` would raise KeyError.
annotated_notes = set(doc_overview.index.get_level_values(0))

for noteID in iaa:
    if noteID not in annotated_notes:
        # nothing to tabulate for this note
        continue
    caption = f'{noteID} - overview of annotations (document level)'
    label = f'{noteID}_overview'
    doc_overview.loc[noteID].pipe(show_latex, caption, label, cell_format=str)

In [10]:
# Write every entry of TABLES to its own .tex file
# (TABLES is presumably filled by `show_latex` -- confirm in utils.latex).
prefix = f'iaa_{week}'

# open(..., 'w') does not create missing directories, so make sure the target exists.
out_dir = Path('./tables')
out_dir.mkdir(parents=True, exist_ok=True)

for idx, table in enumerate(TABLES):
    with open(out_dir / f'{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
        f.write(table)

# Sentence-level

In [11]:
# All three parts of the overview share the same (sentence, annotator) grouping.
grouped = df.groupby(['sen_id', 'annotator'])


def get_levels(g, level):
    """Collect, per group of `g`, the distinct non-NaN values found in column `level`."""
    # NaN is the only value for which `i == i` is False
    return g[level].apply(lambda s: [i for i in s.unique() if i == i])


# domain: True when the annotator marked the domain somewhere in the sentence.
# stack/unstack puts the column names in the row index and the annotators in the columns.
domain = grouped[domains].any().stack().unstack(1)
domain = domain.loc[domain.any(axis=1)]  # drop rows nobody marked

# levels: the distinct level values each annotator assigned per sentence
per_level = [get_levels(grouped, level) for level in levels]
lvl_labels = pd.concat(per_level, axis=1).stack().unstack(1)
lvl_labels = lvl_labels.loc[lvl_labels.any(axis=1)]  # empty lists are falsy

# disregard, background, target, plus
othr = grouped[other].any().stack().unstack(1)
othr = othr.loc[othr.any(axis=1)]

# overview: the three parts stacked and ordered by sentence and then by row label
sen_overview = pd.concat([domain, lvl_labels, othr]).sort_index()
sen_overview

Unnamed: 0_level_0,annotator,avelli,edwin,katsburg,meskers,opsomer,ron,vervaart
sen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
413949805_1,disregard,False,False,False,False,True,False,False
413949805_15,ADM,True,True,True,True,False,False,True
413949805_15,ADM_lvl,[],[],[4.0],[4.0],[],[],[]
413949805_16,ADM_lvl,[4.0],[4.0],[4.0],[4.0],[],[],[4.0]
413949805_21,ENR,True,False,True,True,False,True,True
413949805_21,ENR_lvl,[],[],[4.0],[4.0],[],[1.0],[]
413949805_22,ENR_lvl,[4.0],[],[4.0],[4.0],[],[],[4.0]
433007000_10,STM,False,True,False,False,False,False,False
433007000_10,STM_lvl,[],[4.0],[],[],[],[],[]
433007000_19,ADM,True,True,True,True,True,False,True


In [16]:
# all sentences of a note

note_id = '433007000'
# sen_id values look like '<note_id>_<n>'. Matching on the '<note_id>_' prefix, rather
# than an unanchored regex `contains`, avoids also selecting sentences of other notes
# whose ID merely contains `note_id` as a substring.
sen_overview.query(f"sen_id.str.startswith('{note_id}_')")

Unnamed: 0_level_0,annotator,avelli,edwin,katsburg,meskers,opsomer,ron,vervaart
sen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
433007000_10,STM,False,True,False,False,False,False,False
433007000_10,STM_lvl,[],[4.0],[],[],[],[],[]
433007000_19,ADM,True,True,True,True,True,False,True
433007000_20,ADM_lvl,[4.0],[4.0],[4.0],[4.0],[4.0],[],[4.0]
433007000_22,ADM,True,False,False,False,False,False,False
433007000_23,ADM_lvl,[3.0],[],[],[],[],[],[]
433007000_3,ETN,True,False,True,False,False,True,False
433007000_3,ETN_lvl,[3.0],[],[3.0],[],[],[3.0],[]
433007000_5,ETN,True,False,False,False,False,False,False
433007000_5,ETN_lvl,[4.0],[],[],[],[],[],[]


### View the sentence annotations of a specific annotator

In [None]:
# Sentences to inspect and the annotator whose annotations are shown.
# Other annotators to try: avelli, katsburg, meskers, opsomer, swartjes,
# edwin, edith, hinke, ron
sen_id = ['240792918_5', '240792918_6', '240792918_7', '240792918_8']
annotator = 'vervaart'

query = "(sen_id == @sen_id) & (annotator == @annotator)"
selected = df.query(query)

# running text of the selected tokens, followed by an empty line
print(' '.join(selected.token.to_list()), end='\n\n')

selected

### Save table per sentence to LaTeX

In [None]:
# sen_ids = ['404161434_10', '251665715_56']
# for sen_id in sen_ids:
#     caption = f'{sen_id} - overview of annotations (sentence level)'
#     label = f'{sen_id}_overview'
#     sen_overview.query("sen_id == @sen_id").pipe(show_latex, caption, label, cell_format=str)

In [None]:
# prefix = f'iaa_{week}_444788701'
# for idx, table in enumerate(TABLES):
#     with open(f'./tables/{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
#         f.write(table)

# Token-level

In [None]:
# Columns that identify a token; used as the index when aligning annotators.
idx = ['sen_id', 'tok', 'token']
# Annotation columns that are compared between annotators.
cols = ['label', 'relation']
# Every annotator that occurs in the data.
annotators = df['annotator'].unique()

def assign_annotator(df, annotator, index_cols=None, value_cols=None):
    """Return one annotator's annotation columns, indexed by token.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an 'annotator' column plus the index and value columns.
    annotator : str
        Annotator whose rows are selected.
    index_cols : list of str, optional
        Columns that become the index; defaults to the notebook-level `idx`.
    value_cols : list of str, optional
        Columns that are kept; defaults to the notebook-level `cols`.

    Returns
    -------
    pd.DataFrame
        The selected rows, indexed by `index_cols`, with each value column
        renamed to '<column>_<annotator>'.
    """
    index_cols = idx if index_cols is None else index_cols
    value_cols = cols if value_cols is None else value_cols
    to_rename = {col: f'{col}_{annotator}' for col in value_cols}
    # A boolean mask instead of an interpolated query string: the name is compared
    # as a plain value, so names containing quotes cannot break the expression.
    own_rows = df.loc[df['annotator'] == annotator]
    return own_rows.set_index(index_cols)[value_cols].rename(columns=to_rename)

# One frame per annotator, placed side by side and aligned on the token index.
dfs = [assign_annotator(df, ann) for ann in annotators]
overview = pd.concat(dfs, axis=1).reset_index()

# A cell counts as annotated when it is neither missing nor the '_' value.
# Vectorised replacement for the element-wise `applymap` (deprecated in recent
# pandas); unlike the original lambda it also treats None/pd.NA as missing.
# The identifier columns are dropped by name rather than by position.
annotations = overview.drop(columns=idx)
non_empty_rows = (annotations.notna() & annotations.ne('_')).any(axis=1)

# Show every token of each sentence that contains at least one annotated token.
non_empty_sen_ids = overview.loc[non_empty_rows, 'sen_id'].unique()
overview.loc[overview.sen_id.isin(non_empty_sen_ids)]