In [1]:
import pandas as pd
from functools import partial
from pathlib import Path

import sys
sys.path.insert(0, '../..')
from src.utils.latex import show_latex, TABLES

# Load data

In [2]:
# Root of the project data directory, relative to this notebook.
path = Path('../../data')

# NOTE: pickles execute arbitrary code on load — only read files from a trusted source.
cov_2020 = pd.read_pickle(
    path.joinpath('2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
)
kwd_cov_2020 = pd.read_pickle(
    path.joinpath('keyword_results/cov_2020_kwd_v1.pkl')
)
# Read the identifier columns as strings so they are not parsed as numbers.
annotated = pd.read_csv(
    path.joinpath('annotated_notes_ids.csv'),
    dtype={'MDN': str, 'NotitieID': str},
)

In [3]:
# One row per note key, enriched with the keyword results and the annotation info.
note_keys = ['institution', 'MDN', 'NotitieID']

# De-duplicate on the keys plus the note text, then keep only the key columns.
# NOTE(review): notes sharing the same keys but differing in `all_text` survive
# this step and stay duplicated on the merge keys — confirm this cannot happen.
unique_notes = cov_2020.drop_duplicates(subset=note_keys + ['all_text'], keep='first')[note_keys]

# Left joins keep every note, including those absent from either lookup table.
# NOTE(review): `iloc[:, :-1]` drops the last column of `annotated` by position —
# confirm which column that is.
df = (
    unique_notes
    .merge(kwd_cov_2020, on=note_keys, how='left')
    .merge(annotated.iloc[:, :-1], on=note_keys, how='left')
)

# Stats

In [4]:
# Domain codes; each one is also the name of a keyword-results column in `df`.
domains = 'ENR ATT STM ADM INS MBW FAC BER'.split()
# Derived column names: boolean "any match" flag and match count per domain.
matched_domains = ['matched_' + code for code in domains]
count_domains = ['n_' + code for code in domains]

def op_count(df, domain):
    """Number of matches for `domain` keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `domain` column whose cells hold the collection of
        keyword matches, or a missing value when there were none.
    domain : str
        Name of the column to count.

    Returns
    -------
    pandas.Series
        Per row, the length of the cell; 0 for cells that have no length
        (NaN, None, pd.NA, ...).
    """
    def n_matches(cell):
        # Missing markers have no length. Testing for `__len__` (instead of the
        # `x == x` NaN trick) also covers None, pd.NA and array-valued cells.
        return len(cell) if hasattr(cell, '__len__') else 0

    return df[domain].apply(n_matches)

def op_bool(df, domain):
    """Are there any matches for `domain` keywords (boolean).

    Missing cells are replaced by 0 before the truthiness test, so both a
    missing value and an empty collection of matches give False.
    """
    matches = df[domain].fillna(0)
    return matches.apply(bool)

# Column-name -> callable mappings, meant to be unpacked into `DataFrame.assign`,
# which calls each value with the frame. `partial` binds the domain when the dict
# is built, so every callable keeps its own domain.
ops_count = {
    'n_' + code: partial(op_count, domain=code)
    for code in domains
}
ops_bool = {
    'matched_' + code: partial(op_bool, domain=code)
    for code in domains
}

In [5]:
# Per-note keyword statistics:
#   n_<domain>        number of keyword matches per domain
#   matched_<domain>  whether the domain has at least one match
#   kwd_match         whether any domain matched
#   batch             annotation batch, 'not annotated' when missing
#   n_domains         number of domains with at least one match
results = (
    df
    .assign(**ops_count, **ops_bool)
    .assign(
        # Reduce the boolean matched_* flags rather than the raw keyword columns:
        # those are object-dtype (collections / missing values), so `.any` on them
        # depends on element truthiness and missing-value skipping.
        kwd_match=lambda d: d[matched_domains].any(axis=1),
        batch=lambda d: d['batch'].fillna('not annotated'),
        n_domains=lambda d: d[matched_domains].sum(axis=1),
    )
)

### Overview

In [6]:
caption = "Num. notes with/without keyword matches in the COVID data"
label = "kwd_covid_overview"

# Count NotitieID values per (batch, institution), split by `kwd_match`;
# `margins` adds the 'Totals' row and column.
overview = results.pivot_table(
    values='NotitieID',
    index=['batch', 'institution'],
    columns=['kwd_match'],
    aggfunc='count',
    margins=True,
    margins_name='Totals',
)
show_latex(overview, caption, label)

Unnamed: 0_level_0,kwd_match,False,True,Totals
batch,institution,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
not annotated,amc,4765,17014,21779
not annotated,vumc,4837,17101,21938
pilot,amc,34,919,953
pilot,vumc,34,727,761
Totals,,9670,35761,45431


### Stats by domain (non-annotated notes only)

In [8]:
# Reusable `DataFrame.query` filters for the tables in this section.
# Rows whose `batch` equals 'not annotated' (the fill value used for a missing batch).
q_no_ann = "batch == 'not annotated'"
# Rows with at least one matched domain.
q_match = "n_domains > 0"

In [9]:
caption = "Domains overview: num. notes with at least one keyword match"
label = "kwd_covid_matched_domains"

# Summing the boolean matched_* flags per institution gives the number of notes
# with at least one match in each domain (non-annotated notes only).
matched_per_domain = results.query(q_no_ann).pivot_table(
    values=matched_domains,
    index=['institution'],
    aggfunc='sum',
    margins=True,
    margins_name='Totals',
)
# Domains as rows, most frequently matched first.
show_latex(
    matched_per_domain.T.sort_values('Totals', ascending=False),
    caption,
    label,
)

institution,amc,vumc,Totals
matched_ADM,12172,11558,23730
matched_MBW,9371,9389,18760
matched_BER,5717,5032,10749
matched_FAC,4526,4567,9093
matched_ENR,3990,4019,8009
matched_INS,2904,3630,6534
matched_STM,2813,3067,5880
matched_ATT,2559,2498,5057


In [10]:
caption = "Mean/median/max num. keyword matches per domain"
label = "kwd_covid_kwd_per_dom"

# Per domain: statistics of the match count over the non-annotated notes that
# have at least one match in that domain.
not_annotated = results.query(q_no_ann)
dfs = [
    not_annotated.query(f"n_{dom} > 0")[f"n_{dom}"].agg(['mean', 'median', 'max'])
    for dom in domains
]
kwd_stats = (
    pd.concat(dfs, keys=domains)
    .unstack(1)
    .sort_values('mean', ascending=False)
    .round(2)
)

# `max` of integer counts is always integral. A median can be a half-integer
# (even number of notes), so only cast it when no value would be truncated.
int_cols = {'max': int}
if kwd_stats['median'].mod(1).eq(0).all():
    int_cols['median'] = int

kwd_stats.astype(int_cols).pipe(show_latex, caption, label)

Unnamed: 0,mean,median,max
ADM,3.32,2,39
MBW,3.1,2,134
BER,2.06,1,22
INS,1.98,1,23
FAC,1.87,1,27
ENR,1.84,1,13
STM,1.56,1,31
ATT,1.39,1,13


In [11]:
caption = "Number of matched domains per note"
# Underscore-only label, following the convention of the other table labels.
label = "kwd_covid_n_matched_domains"

# Non-annotated notes with at least one matched domain: count NotitieID values
# per number of matched domains (rows) and institution (columns).
n_matched = results.query(q_no_ann).query(q_match).pivot_table(
    values='NotitieID',
    index=['n_domains'],
    columns=['institution'],
    aggfunc='count',
    margins=True,
    margins_name='Totals',
)
show_latex(n_matched, caption, label)

institution,amc,vumc,Totals
n_domains,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5107,5133,10240
2,4418,4528,8946
3,3323,3461,6784
4,2127,2047,4174
5,1148,1084,2232
6,501,484,985
7,234,236,470
8,156,128,284
Totals,17014,17101,34115


In [12]:
# Write every table collected in `TABLES` (imported from src.utils.latex) to
# ./tables/<prefix>_<idx>.tex, numbered by position in the list.
prefix = 'kwd_covid'
tables_dir = Path('./tables')
# Create the output directory on a fresh checkout instead of failing on open().
tables_dir.mkdir(parents=True, exist_ok=True)
for idx, table in enumerate(TABLES):
    (tables_dir / f'{prefix}_{idx}.tex').write_text(table, encoding='utf8')