In [1]:
import sys
sys.path.insert(0, '../..')
from src.utils.latex import add_colname, show_latex, TABLES

In [2]:
import json
import re
import pandas as pd
from pathlib import Path

# Annotated data

## Parse labels

In [3]:
# Load the tag definitions. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('../../tagsets/legacy_stella.json', 'r', encoding='utf-8') as f:
    tagset = json.load(f)['tags']

In [38]:
def categorize_tags(tagset):
    """Group the tag names of a tagset into four categories.

    Parameters
    ----------
    tagset : iterable of dict
        Each item must have a 'tag_name' key. Only the part of the name
        before the first ':' is used.

    Returns
    -------
    dict
        Keys 'domains', 'levels', 'disregard' and 'other', each mapping to a
        list of tag names in their original order:
        - domains: names starting with a literal '.'
        - levels: names starting with three capitals, a space and a digit
        - disregard: names starting with 'disregard_file'
        - other: every name that is in none of the lists above
    """
    tag_names = [i['tag_name'].split(':')[0] for i in tagset]
    # Raw strings keep '\.' and '\d' as regex escapes instead of (invalid)
    # Python string escapes.
    rdomain = re.compile(r'\..*')
    rlevel = re.compile(r'[A-Z]{3} \d')
    rdisregard = re.compile(r'disregard_file')
    # `match` anchors at the start of the tag name.
    domains = [tag for tag in tag_names if rdomain.match(tag)]
    levels = [tag for tag in tag_names if rlevel.match(tag)]
    disregard = [tag for tag in tag_names if rdisregard.match(tag)]
    # Build the lookup once instead of concatenating three lists per tag.
    categorized = set(domains + levels + disregard)
    other = [tag for tag in tag_names if tag not in categorized]
    return dict(
        domains=domains,
        levels=levels,
        disregard=disregard,
        other=other,
    )

def create_parse_index(conversions):
    """Build the index of parsed-label columns.

    For every value in `conversions` (in order) the index contains the value
    itself followed by '<value>_lvl'; 'disregard' and 'other' come last.
    """
    names = []
    for domain in conversions.values():
        names.append(domain)
        names.append(f"{domain}_lvl")
    names.extend(['disregard', 'other'])
    return pd.Index(names)

def parse_label(label, parse_index, cols_to_lbl, reg_others):
    """Parse one label string into a Series indexed by `parse_index`.

    Parameters
    ----------
    label : str
        The raw label string of a single row.
    parse_index : pd.Index
        Names of the entries to produce.
    cols_to_lbl : dict
        Maps an entry name to the substring whose presence in `label` is tested.
    reg_others : compiled regex
        Pattern whose `findall` result is stored for the remaining entries.

    Returns
    -------
    pd.Series (object dtype)
        - entries present in `cols_to_lbl`: bool, whether the substring is in `label`
        - entries containing '_lvl': the digit following '<code> ' in `label`
          as int, where <code> is the entry name up to '_lvl'; left missing
          when there is no such match
        - all other entries: list returned by `reg_others.findall(label)`
    """
    # Explicit object dtype: the values are a mix of bools, ints and lists,
    # and the default dtype of a data-less Series differs between pandas versions.
    s = pd.Series(index=parse_index, dtype=object)
    for idx in s.index:
        if idx in cols_to_lbl:
            s[idx] = cols_to_lbl[idx] in label
        elif '_lvl' in idx:
            # Use the full code before '_lvl' rather than a fixed 3-char prefix.
            code = idx.split('_lvl')[0]
            match = re.search(rf"{re.escape(code)} (\d)", label)
            if match:
                s[idx] = int(match.group(1))
        else:
            s[idx] = reg_others.findall(label)
    return s

def parse_df(df, tagset):
    """Parse the `label` column of `df` and join the parsed columns onto it.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a `label` column. Rows whose label is '_' or missing are
        not parsed and get missing values in the parsed columns.
    tagset : iterable of dict
        Tag definitions, passed to `categorize_tags`.

    Returns
    -------
    pd.DataFrame
        `df` with the columns from `create_parse_index` joined on the index.
    """
    cat_tags = categorize_tags(tagset)
    conversions = {
        '.B152': 'STM',
        '.B455': 'INS',
        '.D450': 'FAC',
        '.D840-859': 'BER',
    }
    parse_index = create_parse_index(conversions)
    # An empty alternation compiles to '' and matches at every position, so
    # fall back to a pattern that never matches when there are no "other" tags.
    # NOTE(review): tag names are used as regex fragments without escaping;
    # confirm none of them contain regex metacharacters.
    other_pattern = '|'.join(cat_tags['other']) if cat_tags['other'] else r'(?!)'
    reg_others = re.compile(other_pattern)
    cols_to_lbl = {v: k for k, v in conversions.items()}
    # The label strings carry an escaped underscore for this tag.
    cols_to_lbl['disregard'] = 'disregard\\_file'

    def parse_label_from_row(row):
        return parse_label(row.label, parse_index, cols_to_lbl, reg_others)

    select_labels = (df.label != '_') & df.label.notna()
    labeled = df.loc[select_labels]
    if labeled.empty:
        # `apply(..., result_type='expand')` on an empty frame returns a copy
        # of the frame itself, which would make the join fail on overlapping
        # columns; build the empty parsed frame explicitly instead.
        parsed = pd.DataFrame(index=labeled.index, columns=parse_index)
    else:
        parsed = labeled.apply(parse_label_from_row, result_type='expand', axis=1)
    return df.join(parsed)

In [5]:
# TO DO: drop duplicate notes

def preprocessing(df):
    """Return a copy of `df` with `sen_id` and `tok` columns added.

    `sen_tok` is split on '-': the first part, prefixed with the note id and
    an underscore, becomes `sen_id`; the second part becomes `tok`.
    """
    sen_tok_parts = df.sen_tok.str.split('-')
    sentence_part = sen_tok_parts.str[0]
    token_part = sen_tok_parts.str[1]
    return df.assign(
        sen_id=df.NotitieID.astype(str) + '_' + sentence_part,
        tok=token_part,
    )

## Non-COVID (2017)

In [4]:
# Directory from which the non-COVID pickles are read; the path is relative
# to the notebook's working directory.
noncovpath = Path('../../../Non_covid_data_15oct/from_inception_tsv')

In [32]:
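# One-off step, kept commented out for provenance: parse the raw annotations
# and cache the result as the '_parsed' pickle.
# noncov = pd.read_pickle(noncovpath / 'annotated_df_Batch1_pilot.pkl').pipe(preprocessing).pipe(parse_df, tagset)
# noncov.to_pickle(noncovpath / 'annotated_df_Batch1_pilot_parsed.pkl')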

In [5]:
# Load the cached, already-parsed non-COVID annotations.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
noncov = pd.read_pickle(noncovpath / 'annotated_df_Batch1_pilot_parsed.pkl')

In [6]:
# Dimensions (rows, columns) of the parsed non-COVID frame.
noncov.shape

(1589820, 24)

In [6]:
# Column names of the parsed non-COVID frame.
noncov.columns

Index(['sen_tok', 'char', 'token', 'label', 'relation', 'annotator',
       'institution', 'MDN', 'NotitieID', 'NotitieCSN', 'batch',
       'legacy_rawfile', 'sen_id', 'tok', 'STM', 'STM_lvl', 'INS', 'INS_lvl',
       'FAC', 'FAC_lvl', 'BER', 'BER_lvl', 'disregard', 'other'],
      dtype='object')

In [22]:
# Check the pattern on a sample label. The leading dot is escaped so it only
# matches a literal '.', and the negative lookahead rejects labels where
# "Stemming" is directly followed by '['.
regex = re.compile(r'\.B152: Stemming(?!\[)')
x = ".B152: Stemming"
regex.search(x)

In [23]:
# Rows whose label contains '.B152: Stemming' not directly followed by '['.
# The leading dot is escaped so it matches a literal '.' rather than any
# character; missing labels count as non-matching (na=False).
noncov.loc[noncov.label.str.contains(r'\.B152: Stemming(?!\[)', na=False)]

Unnamed: 0,sen_tok,char,token,label,relation,annotator,institution,MDN,NotitieID,NotitieCSN,...,STM,STM_lvl,INS,INS_lvl,FAC,FAC_lvl,BER,BER_lvl,disregard,other
2083,48-8,2835-2841,somber,.B152: Stemming,48-7,edwin,VUMC,,148700779,,...,True,,False,,False,,False,,False,[]
9020,7-2,250-261,onzekerheid,.B152: Stemming,7-3[3_0],edwin,VUMC,,161847304,,...,True,,False,,False,,False,,False,[]
11609,24-9,1065-1071,stress,.B152: Stemming,24-7|24-1[2_0],edwin,VUMC,,192507400,,...,True,,False,,False,,False,,False,[]
11617,25-7,1100-1106,stress,.B152: Stemming,24-1[2_0]|25-6,edwin,VUMC,,192507400,,...,True,,False,,False,,False,,False,[]
11731,38-3,1739-1745,stress,.B152: Stemming,_,edwin,VUMC,,192507400,,...,True,,False,,False,,False,,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1572647,7-15,462-468,gevoel,.B152: Stemming,7-14,meskers,VUMC,,201544848,,...,True,,False,,False,,False,,False,[]
1575716,133-2,5105-5113,stemming,.B152: Stemming,133-4,meskers,VUMC,,183067319,,...,True,,False,,False,,False,,False,[]
1575723,133-9,5152-5158,affect,.B152: Stemming,133-8,meskers,VUMC,,183067319,,...,True,,False,,False,,False,,False,[]
1575863,148-2,6024-6032,stemming,.B152: Stemming,148-4,meskers,VUMC,,183067319,,...,True,,False,,False,,False,,False,[]


In [29]:
# Show every row that belongs to one sentence id.
noncov.loc[noncov.sen_id == '183067319_133']
# Optional extra filter (disabled): keep only rows that carry a label.
# .query("label.notna() and label != '_'")

Unnamed: 0,sen_tok,char,token,label,relation,annotator,institution,MDN,NotitieID,NotitieCSN,...,STM,STM_lvl,INS,INS_lvl,FAC,FAC_lvl,BER,BER_lvl,disregard,other
141859,133-1,5102-5104,De,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141860,133-2,5105-5113,stemming,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141861,133-3,5114-5123,imponeert,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141862,133-4,5124-5136,neerslachtig,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141863,133-5,5137-5138,",",_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141864,133-6,5139-5142,met,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141865,133-7,5143-5146,een,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141866,133-8,5147-5151,vlak,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141867,133-9,5152-5158,affect,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,
141868,133-10,5159-5160,.,_,_,edwin,VUMC,,183067319,,...,,,,,,,,,,


In [20]:
# select_labels = (noncov.label != '_') & noncov.label.notna()
# noncov.loc[select_labels]

# Columns to display: token information followed by the parsed label columns.
cols = [
    'sen_tok', 'token', 'label', 'relation', 'sen_id', 'tok',
    'STM', 'STM_lvl',
    'INS', 'INS_lvl',
    'FAC', 'FAC_lvl',
    'BER', 'BER_lvl',
    'disregard', 'other',
]
# Rows of the two selected sentence ids, restricted to those columns.
noncov.loc[
    noncov.sen_id.isin(['201738071_49', '201738071_53']),
    cols,
]

Unnamed: 0,sen_tok,token,label,relation,sen_id,tok,STM,STM_lvl,INS,INS_lvl,FAC,FAC_lvl,BER,BER_lvl,disregard,other
1589643,49-1,Bemerkt,_,_,201738071_49,1,,,,,,,,,,
1589644,49-2,spiermassaverlies,_,_,201738071_49,2,,,,,,,,,,
1589645,49-3,(,_,_,201738071_49,3,,,,,,,,,,
1589646,49-4,niet,_,_,201738071_49,4,,,,,,,,,,
1589647,49-5,geobjectiveerd,_,_,201738071_49,5,,,,,,,,,,
1589648,49-6,",",_,_,201738071_49,6,,,,,,,,,,
1589649,49-7,verminderde,_,_,201738071_49,7,,,,,,,,,,
1589650,49-8,energie,_,_,201738071_49,8,,,,,,,,,,
1589651,49-9,",",_,_,201738071_49,9,,,,,,,,,,
1589652,49-10,grote,.B455: Inspanningstolerantie[4]|INS 2[5],49-10[5_4],201738071_49,10,False,,True,2.0,False,,False,,False,[]


## COVID (2020)

In [3]:
# Directory from which the COVID pickles are read; the path is relative to
# the notebook's working directory.
covpath = Path('../../../Covid_data_11nov/from_inception_tsv')

In [40]:
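# One-off step, kept commented out for provenance: parse the raw annotations
# and cache the result as the '_parsed' pickle.
# cov = pd.read_pickle(covpath / 'annotated_df_CovidBatch_pilot.pkl').pipe(preprocessing).pipe(parse_df, tagset)
# cov.to_pickle(covpath / 'annotated_df_CovidBatch_pilot_parsed.pkl')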

In [4]:
# Load the cached, already-parsed COVID annotations.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
cov = pd.read_pickle(covpath / 'annotated_df_CovidBatch_pilot_parsed.pkl')

In [7]:
# Dimensions (rows, columns) of the parsed COVID frame.
cov.shape

(589329, 24)

In [6]:
# Show every row of a single note; the id is compared as a string.
cov.loc[cov.NotitieID == '416426863']

Unnamed: 0,sen_tok,char,token,label,relation,annotator,institution,MDN,NotitieID,NotitieCSN,...,STM,STM_lvl,INS,INS_lvl,FAC,FAC_lvl,BER,BER_lvl,disregard,other
448687,1-1,0-9,Klinische,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448688,1-2,10-18,gegevens,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448689,1-3,19-20,:,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448690,2-1,21-27,nieuwe,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448691,2-2,28-37,maagsonde,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448692,2-3,38-47,geplaatst,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448693,2-4,48-49,",",_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448694,2-5,50-57,positie,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448695,2-6,58-65,onzeker,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,
448696,2-7,66-79,Vraagstelling,_,,meskers,AMC,1830570,416426863,422658373,...,,,,,,,,,,


### Make tables

In [9]:
# Stack both datasets into one frame, tagging each row with its source.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
total = pd.concat(
    [
        noncov.assign(source='non-covid 2017'),
        cov.assign(source='covid 2020'),
    ],
    ignore_index=True,
)
# Drop the per-source frames; only `total` is used from here on.
del noncov
del cov

In [15]:
caption = ""
label = ""

# Number of unique notes per source (plus an overall row), joined with the
# number of unique notes that have at least one row marked as disregard.
# Both pivots use the same margins name so that the overall rows line up in
# the index-based join; with different names ('Total' vs 'total') the overall
# row would get a missing n_disregard.
total.pivot_table(
    index='source',
    values='NotitieID',
    aggfunc='nunique',
    margins=True,
    margins_name='total',
).rename(columns={'NotitieID': 'n_notes'}
).join(
        total.query("disregard == True").pivot_table(
        index='source',
        values='NotitieID',
        aggfunc='nunique',
        margins=True,
        margins_name='total',
    ).rename(columns={'NotitieID': 'n_disregard'})
).pipe(show_latex, caption=caption, label=label)

In [25]:
# mean/median/min/max number of sentences per note, by source and overall
caption = ""
label = ""

# Number of unique sentence ids for every (source, note) pair.
s = total.groupby(['source', 'NotitieID']).sen_id.nunique()
by_source = s.groupby(level=0)
# pd.concat replaces DataFrame.append (removed in pandas 2.0). The overall
# statistics are turned into a one-row frame labelled 'total', and the index
# name 'source' is kept so the combined table has a named index.
pd.concat([
    by_source.agg(['mean', 'median', 'min', 'max']).astype(int),
    s.agg(['mean', 'median', 'min', 'max']).astype(int).rename('total'
    ).to_frame().T.rename_axis(index='source'),
]).pipe(show_latex, caption=caption, label=label)

Unnamed: 0_level_0,mean,median,min,max
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
covid 2020,56,31,1,330
non-covid 2017,64,42,1,364
total,61,38,1,364


In [35]:
# Summary statistics (mean/median/min/max) of the number of notes per
# annotator, computed separately for each source.
caption = ""
label = ""

# Number of unique notes for every (source, annotator) pair.
s = total.groupby(['source', 'annotator'])['NotitieID'].nunique()
(
    s.groupby(level='source')
    .agg(['mean', 'median', 'min', 'max'])
    .astype(int)
    .pipe(show_latex, caption=caption, label=label)
)

Unnamed: 0_level_0,mean,median,min,max
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
covid 2020,211,200,171,307
non-covid 2017,444,434,242,686


In [11]:
# Column names of the combined frame (includes the added 'source' column).
total.columns

Index(['sen_tok', 'char', 'token', 'label', 'relation', 'annotator',
       'institution', 'MDN', 'NotitieID', 'NotitieCSN', 'batch',
       'legacy_rawfile', 'sen_id', 'tok', 'STM', 'STM_lvl', 'INS', 'INS_lvl',
       'FAC', 'FAC_lvl', 'BER', 'BER_lvl', 'disregard', 'other', 'source'],
      dtype='object')

In [86]:
# label analysis | n_labels (total, by domain)
caption = ""
label = ""

# Sum the four domain columns per source; `margins` adds an overall row named
# 'total', and `assign` adds a 'total' column with the row-wise sum.
n_labels = total.pivot_table(
    index='source',
    values=['STM', 'INS', 'FAC', 'BER'],
    aggfunc='sum',
    margins=True,
    margins_name='total',
).assign(total=lambda df: df.sum(axis=1))

# Share of each domain within its row: divide every row by its last column
# (the 'total' column added above), expressed as a rounded percentage.
p_labels = (n_labels.div(n_labels.iloc[:, -1], axis=0) * 100).round()

# Put counts ('n') and percentages ('%') side by side. `add_colname` is a
# project helper; judging by the rendered table it adds a second column level.
# Columns are sorted by domain ascending and, within a domain, in descending
# order of the second level, which puts 'n' before '%'.
n_labels.pipe(add_colname, 'n').join(
    p_labels.pipe(add_colname, '%')
).astype('Int64'
).sort_index(axis=1, level=[0,1], ascending=[True, False]
).pipe(show_latex, caption=caption, label=label)

Unnamed: 0_level_0,BER,BER,FAC,FAC,INS,INS,STM,STM,total,total
Unnamed: 0_level_1,n,%,n,%,n,%,n,%,n,%
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
covid 2020,33,1,815,26,1495,48,773,25,3116,100
non-covid 2017,907,9,2346,24,3014,31,3516,36,9783,100
total,940,7,3161,25,4509,35,4289,33,12899,100


In [76]:
# label analysis | for each domain, distribution of levels
caption = ""
label = ""

# All columns whose name contains '_lvl'.
level_cols = [col for col in total.columns if '_lvl' in col]
# Cast the level columns to nullable integers before grouping by source.
by_source = total.astype({col:'Int64' for col in level_cols}).groupby('source')
# One value_counts Series per level column, indexed by (source, level value).
counts_by_level = [by_source[col].value_counts() for col in level_cols]
# Combine them column-wise, name the index levels, and shorten each column
# name to its first three characters (e.g. 'STM_lvl' -> 'STM').
level_counts = pd.concat(counts_by_level, axis=1
).rename_axis(['source', 'level']
).rename(lambda col: col[:3], axis=1)

# Per-source totals of each column, used as the denominator for percentages.
level_totals = level_counts.groupby(level=0).sum()
# NOTE(review): this division relies on pandas aligning the (source, level)
# index with the source-only index of `level_totals` — confirm on the pandas
# version in use.
level_percs = (level_counts / level_totals * 100).round()

# Put counts ('n') and percentages ('%') side by side, sorted by domain
# ascending and with 'n' before '%' within each domain.
level_counts.pipe(add_colname, 'n').join(
    level_percs.pipe(add_colname, '%')
).sort_index(axis=1, level=[0,1], ascending=[True, False]
).astype('Int64'
).pipe(show_latex, caption=caption, label=label)

Unnamed: 0_level_0,Unnamed: 1_level_0,BER,BER,FAC,FAC,INS,INS,STM,STM
Unnamed: 0_level_1,Unnamed: 1_level_1,n,%,n,%,n,%,n,%
source,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
covid 2020,0,9.0,39.0,590,50,562,35,277.0,30.0
covid 2020,1,,,93,8,689,42,367.0,40.0
covid 2020,2,,,115,10,219,14,114.0,12.0
covid 2020,3,,,173,15,111,7,109.0,12.0
covid 2020,4,14.0,61.0,197,17,35,2,55.0,6.0
covid 2020,5,,,5,0,6,0,,
non-covid 2017,0,316.0,26.0,355,10,187,6,882.0,22.0
non-covid 2017,1,46.0,4.0,181,5,741,22,2073.0,52.0
non-covid 2017,2,199.0,17.0,319,9,729,22,472.0,12.0
non-covid 2017,3,301.0,25.0,489,13,711,21,427.0,11.0
