In [1]:
import random
import pandas as pd
from pathlib import Path

In [2]:
# Show every column when a wide frame is displayed (None = no limit).
# The fully qualified key is used on purpose: pandas resolves abbreviated option
# names by pattern matching, and the bare "max_columns" pattern is ambiguous in
# newer pandas versions (it also matches "styler.render.max_columns"), which
# makes set_option raise OptionError.
pd.set_option("display.max_columns", None)

# Load data

In [3]:
# Directory (relative to the notebook's working directory) that the loading cells
# below glob for the `*_dedup.pkl`, `annotated_df_ze_*.pkl` and `pilot_*.pkl` pickles.
path = Path('../../data/expr_july')

### ZonMw annotations (VUmc + AMC)

In [4]:
# Load the core-team annotations; the pickles are deduplicated during processing.
# The file list is sorted because glob yields paths in filesystem order, which
# would make the row order (and the reset index) of the concatenated frame differ
# between machines and runs.
# NOTE(review): unpickling can execute arbitrary code -- only load files that were
# produced by our own processing pipeline.
annot = pd.concat(
    [pd.read_pickle(fp) for fp in sorted(path.glob('*_dedup.pkl'))],
    ignore_index=True,
)

# Load the ze annotations and remove the IAA files, i.e. drop every row whose
# NotitieID already occurs in `annot`.
ze = pd.concat(
    [pd.read_pickle(fp) for fp in sorted(path.glob('annotated_df_ze_*.pkl'))],
    ignore_index=True,
).query("~NotitieID.isin(@annot.NotitieID)")

# Combine both sets and remove `disregard` notes as a whole: the group-wise max of
# `disregard` per NotitieID is broadcast to all rows of the note before filtering,
# so (assuming a boolean flag) a single flagged row removes the entire note.
df = pd.concat([annot, ze], ignore_index=True).assign(
    disregard_note=lambda d: d.groupby('NotitieID').disregard.transform('max'),
).query("disregard_note != True").drop(columns='disregard_note')

### Pilot annotations (VUmc + AMC)

In [5]:
# Load the pilot annotations and remove `disregard` notes as a whole.
# The file list is sorted because glob yields paths in filesystem order, which
# would make the row order (and the reset index) of the concatenated frame differ
# between machines and runs.
# NOTE(review): unpickling can execute arbitrary code -- only load files that were
# produced by our own processing pipeline.
pilot = pd.concat(
    [pd.read_pickle(fp) for fp in sorted(path.glob('pilot_*.pkl'))],
    ignore_index=True,
).assign(
    # Group-wise max of `disregard` per NotitieID, broadcast to every row of the
    # note, so (assuming a boolean flag) one flagged row removes the entire note.
    disregard_note=lambda d: d.groupby('NotitieID').disregard.transform('max'),
).query("disregard_note != True").drop(columns='disregard_note')

# Stats

### Annotated notes

In [12]:
# Number of unique notes (NotitieID) per year, with an extra 'total' margin row,
# computed the same way for the ZonMw and the pilot annotations.
pivot_kwargs = dict(
    index='year',
    values='NotitieID',
    aggfunc='nunique',
    margins=True,
    margins_name='total',
)
notes_zonmw = df.pivot_table(**pivot_kwargs).rename(columns={'NotitieID': 'n_notes_zonmw'})
notes_pilot = pilot.pivot_table(**pivot_kwargs).rename(columns={'NotitieID': 'n_notes_pilot'})

# join() is a left join on the index: every ZonMw year is kept, and years without
# pilot notes get 0 instead of NaN before the counts are cast back to int.
notes_zonmw.join(notes_pilot).fillna(0).astype(int)

Unnamed: 0_level_0,n_notes_zonmw,n_notes_pilot
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,455,3048
2018,445,0
2020,1691,1583
total,2591,4631


### Annotated sentences (ZonMw): domains, background & target

In [18]:
domains = ['ENR', 'ATT', 'STM', 'ADM', 'INS', 'MBW', 'FAC', 'BER', 'ETN']

# Rows that carry at least one domain label, and, per (year, sentence), which of
# the domains occur in that sentence.
rows_with_domain = df[df[domains].any(axis=1)]
domain_totals_per_sen_id = rows_with_domain.groupby(['year', 'sen_id'])[domains].any()

# Unique sentences per year: all sentences, sentences with at least one domain
# label, and sentences with a row flagged as background or target.
n_sent = df.groupby('year')['sen_id'].nunique()
n_sent_with_label = rows_with_domain.groupby('year')['sen_id'].nunique()
n_backgrnd = (
    df
    .query("(background == True) | (target == True)")
    .groupby('year')['sen_id']
    .nunique()
)

# One column per count, aligned on year, plus a 'total' row with the column sums.
table = pd.concat(
    [
        counts.rename(label)
        for label, counts in (
            ('n_all_sents', n_sent),
            ('n_sents_with_labels', n_sent_with_label),
            ('n_bckgrnd_sents', n_backgrnd),
        )
    ],
    axis=1,
)
table.loc['total'] = table.sum()

# Shares relative to all sentences, as percentages rounded to one decimal.
table.assign(
    prc_sents_with_labels=lambda t: t['n_sents_with_labels'].div(t['n_all_sents']).mul(100).round(1),
    prc_bckgrnd_sents=lambda t: t['n_bckgrnd_sents'].div(t['n_all_sents']).mul(100).round(1),
)

Unnamed: 0_level_0,n_all_sents,n_sents_with_labels,n_bckgrnd_sents,prc_sents_with_labels,prc_bckgrnd_sents
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,22786,1145,811,5.0,3.6
2018,21789,1065,556,4.9,2.6
2020,92465,5688,3510,6.2,3.8
total,137040,7898,4877,5.8,3.6


### Annotated sentences (pilot): background & target

In [19]:
# Unique sentences per year in the pilot set: all sentences, and sentences with a
# row flagged as background or target.
n_sent = pilot.groupby('year')['sen_id'].nunique()
n_backgrnd = (
    pilot
    .query("(background == True) | (target == True)")
    .groupby('year')['sen_id']
    .nunique()
)

# One column per count, aligned on year, plus a 'total' row with the column sums.
table = pd.concat(
    [
        counts.rename(label)
        for label, counts in (
            ('n_all_sents', n_sent),
            ('n_bckgrnd_sents', n_backgrnd),
        )
    ],
    axis=1,
)
table.loc['total'] = table.sum()

# Share of background/target sentences, as a percentage rounded to one decimal.
table.assign(
    prc_bckgrnd_sents=lambda t: t['n_bckgrnd_sents'].div(t['n_all_sents']).mul(100).round(1),
)

Unnamed: 0_level_0,n_all_sents,n_bckgrnd_sents,prc_bckgrnd_sents
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,189049,31182,16.5
2020,86451,16226,18.8
total,275500,47408,17.2
