In [1]:
import pandas as pd
from pathlib import Path

import sys
sys.path.insert(0, '../..')
from scripts.utils.latex import show_latex, TABLES

# Load data

In [2]:
# Root folder holding all input data, relative to this notebook's location.
datapath = Path('../../data')

# NOTE: unpickling can execute arbitrary code; only load pickles produced by trusted, local pipelines.
notes = pd.read_pickle(datapath / '2020_raw/processed.pkl')
cov = pd.read_pickle(datapath / '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
# The original cell referenced an undefined name `path`, which fails on a fresh kernel.
# NOTE(review): assumes the CSV lives directly under `datapath` -- confirm the location.
# ID columns are read as strings so they are never parsed as numbers.
annotated = pd.read_csv(datapath / 'annotated_notes_ids.csv', dtype={'MDN': str, 'NotitieID': str})

In [3]:
# Column names applied to every Diagnoses file; because `names` is passed
# explicitly, the first row of each file is read as data rather than as a header.
cols = [
    'MDN', 'CSN', 'typecontact', 'DBC-id',
    'specialisme', 'episodenaam', 'DBC_diagnose', 'ICD10_diagnose',
]
settings = {'sep': ';', 'names': cols, 'encoding': 'utf-8'}


def extract_name(f):
    """Return the lower-cased second space-separated token of the path `f`.

    NOTE(review): presumably the institution part of the file name -- confirm
    against the actual `Diagnoses*.csv` naming scheme.
    """
    return str(f).split(' ')[1].lower()


# One frame per Diagnoses file, keyed by the token extracted from its path.
dfs = {
    extract_name(f): pd.read_csv(f, **settings)
    for f in datapath.glob('Diagnoses*.csv')
}

# Stack the per-file frames; the dict key becomes the `institution` column.
diag = (
    pd.concat(dfs.values(), keys=dfs.keys())
    .astype({'MDN': str})
    .reset_index(level=0)
    .rename(columns={'level_0': 'institution'})
)

# All data

In [4]:
caption = "Number of unique patients in `Diagnoses` and `Notities` files"
label = "2020_n_pat_diag_note"

# Unique patient IDs (MDN) per institution, for each of the two sources.
n_pat_notes = notes.groupby('institution')['MDN'].nunique().rename('patients in `Notities`')
n_pat_diag = diag.groupby('institution')['MDN'].nunique().rename('patients in `Diagnoses`')

# Rows = source, columns = institution; `total` is the sum over the institution columns.
(
    pd.concat([n_pat_notes, n_pat_diag], axis=1)
    .T
    .assign(total=lambda df: df.sum(axis=1))
    .rename_axis('n_unique')
    .pipe(show_latex, caption, label)
)

institution,amc,vumc,total
n_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
patients in `Notities`,56662,43658,100320
patients in `Diagnoses`,45154,35654,80808


In [5]:
caption = "All data: num patients, num notes, num diagnoses"
label = "2020_all_overview"

# Keep only notes whose patient ID (MDN) also occurs in the Diagnoses data.
# Computed once and reused below instead of re-running the same filter per row.
# NOTE(review): the MDN match does not take `institution` into account -- confirm this is intended.
notes_with_diag = notes.query("MDN.isin(@diag.MDN)")

# NOTE(review): `total` adds up per-institution unique counts, so a value present
# in several institutions is counted once per institution -- confirm this is intended.
(
    pd.concat([
        notes_with_diag.groupby('institution').MDN.nunique().rename('patients'),
        notes_with_diag.groupby('institution').NotitieID.nunique().rename('notes'),
        diag.groupby('institution').ICD10_diagnose.nunique().rename('diagnoses'),
        ], axis=1
    )
    .T
    .assign(total=lambda df: df.sum(axis=1))
    .rename_axis('n_unique')
    .pipe(show_latex, caption, label)
)

institution,amc,vumc,total
n_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
patients,45427,36072,81499
notes,1287903,987870,2275773
diagnoses,3272,2783,6055


# COVID

### COVID-19, virus geïdentificeerd \[U07.1]

In [6]:
caption = "COVID data overview ('COVID-19, virus geïdentificeerd [U07.1]')"
label = "2020_covid_overview"

# Annotated notes restricted to the year 2020.
selection = annotated.query("year==2020")

# Unique patients / notes per institution for the annotated subset and for all
# COVID notes, reshaped to rows = (institution, patients|notes) and
# columns = all / annotated; `rest` is what has not been annotated.
piv = (
    pd.concat([
        selection.groupby('institution')[['MDN', 'NotitieID']].nunique(),
        cov.groupby('institution')[['MDN', 'NotitieID']].nunique(),
    ], keys=['annotated', 'all'])
    .rename(columns={'MDN': 'patients', 'NotitieID': 'notes'})
    .unstack(0)
    .stack(0)
    .assign(rest=lambda df: df['all'] - df.annotated)
)

# Extra block of rows holding the sum over both institutions.
total = pd.concat([piv.xs('amc') + piv.xs('vumc')], keys=['total'])

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the replacement.
pd.concat([piv, total]).pipe(show_latex, caption, label)

Unnamed: 0_level_0,Unnamed: 1_level_0,all,annotated,rest
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
amc,notes,22220,941,21279
amc,patients,261,31,230
vumc,notes,22292,746,21546
vumc,patients,233,29,204
total,notes,44512,1687,42825
total,patients,494,60,434


In [7]:
# Write every table collected in `TABLES` to ./tables/<prefix>_<idx>.tex.
prefix = '2020'
tables_dir = Path('./tables')
# Create the output folder up front so a fresh checkout does not fail with FileNotFoundError.
tables_dir.mkdir(parents=True, exist_ok=True)
for idx, table in enumerate(TABLES):
    with open(tables_dir / f'{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
        f.write(table)