In [1]:
import sys
sys.path.insert(0, '../..')
from src.utils.latex import show_latex, TABLES

In [2]:
import pandas as pd
from pathlib import Path
from string import Template
from textwrap import indent

# Load data

In [3]:
datapath = Path('../../../Covid_data_11nov/raw')

### LOAD NOTES ###
print(f"Loading all 'Notities' files from {datapath}...")

cols = ['MDN', 'NotitieID', 'NotitieCSN', 'Typenotitie', 'Notitiedatum', 'Notitietekst1', 'Notitietekst2', 'Notitietekst3']
amc = pd.concat(pd.read_csv(f, sep=';', names=cols, encoding='utf-8-sig') for f in datapath.glob('Notities AMC*.csv'))
vumc = pd.concat(pd.read_csv(f, sep=';', names=cols, encoding='utf-8-sig') for f in datapath.glob('Notities VUMC*.csv'))

print(f"DataFrames loaded: {amc.shape=}, {vumc.shape=}")


### LOAD DIAGNOSES ###
print(f"Loading all 'Diagnoses' files from {datapath}...")

cols = ['MDN', 'CSN', 'typecontact', 'DBC-id', 'specialisme', 'episodenaam', 'DBC_diagnose', 'ICD10_diagnose']
f = datapath / 'Diagnoses AMC 2020 sept.csv'
diag_amc = pd.read_csv(f, sep=';', names=cols, encoding = 'utf-8')
f = datapath / 'Diagnoses VUMC 2020 sept.csv'
diag_vumc = pd.read_csv(f, sep=';', names=cols, encoding = 'utf-8')

print(f"DataFrames loaded: {diag_amc.shape=}, {diag_vumc.shape=}")

Loading all 'Notities' files from ../../../Covid_data_11nov/raw...
DataFrames loaded: amc.shape=(1515300, 8), vumc.shape=(1132728, 8)
Loading all 'Diagnoses' files from ../../../Covid_data_11nov/raw...
DataFrames loaded: diag_amc.shape=(77788, 8), diag_vumc.shape=(60757, 8)


In [4]:
annotated = pd.read_csv('../../../Covid_data_11nov/from_inception_tsv/annotated_notes_ids.csv')

# All data

In [5]:
caption = "Number of unique patients in `Diagnoses` and `Notities` files"
label = "2020_n_pat_diag_note"

def find_nunique(hospital, df, diag_df):
    return pd.Series([
        diag_df.MDN.nunique(),
        df.MDN.nunique(),
    ], index=['patients in `Diagnoses`', 'patients in `Notities`'],
    name=hospital)

pd.concat([
    find_nunique('AMC', amc, diag_amc),
    find_nunique('VUMC', vumc, diag_vumc),
], axis=1).rename_axis('n_unique').assign(total=lambda df: df.sum(axis=1)).pipe(show_latex, caption, label)

Unnamed: 0_level_0,AMC,VUMC,total
n_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
patients in `Diagnoses`,45154,35654,80808
patients in `Notities`,56662,43658,100320


In [6]:
caption = "All data: num patients, num notes, num diagnoses"
label = "2020_all_overview"


def find_nunique(hospital, df, diag_df):
    return pd.Series([
        diag_df.MDN.nunique(),
        df.query("MDN.isin(@diag_df.MDN)").NotitieID.nunique(),
        diag_df.ICD10_diagnose.nunique(),
    ], index=['patients', 'notes', 'diagnoses'],
    name=hospital)

pd.concat([
    find_nunique('AMC', amc, diag_amc),
    find_nunique('VUMC', vumc, diag_vumc),
], axis=1).rename_axis('n_unique').assign(total=lambda df: df.sum(axis=1)).pipe(show_latex, caption, label)

Unnamed: 0_level_0,AMC,VUMC,total
n_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
patients,45154,35654,80808
notes,1268517,963970,2232487
diagnoses,3272,2783,6055


In [7]:
caption = "Top 10 most frequent diagnoses"
label = "2020_top10_diag"
column_format = r"{>{\footnotesize}m{1.5cm}|>{\footnotesize}m{9cm}|>{\footnotesize}m{1.5cm}}"

def topten_diagnoses(hospital, diag_df):
    diag_df = diag_df.drop_duplicates(subset=['MDN', 'ICD10_diagnose'])
    return diag_df.groupby('ICD10_diagnose').size().sort_values(ascending=False).head(10)

pd.concat([
    topten_diagnoses('AMC', diag_amc),
    topten_diagnoses('VUMC', diag_vumc),
], keys=['AMC', 'VUMC']).rename('n').rename_axis(['', 'topten_diagnoses']).to_frame(
).pipe(show_latex, caption, label, column_format=column_format)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
Unnamed: 0_level_1,topten_diagnoses,Unnamed: 2_level_1
AMC,"Voorlichting en advies, niet gespecificeerd [Z71.9]",1781
AMC,Overige en niet gespecificeerde buikpijn [R10.4],822
AMC,Overige gespecificeerde medische zorg [Z51.8],821
AMC,Observatie in verband met verdenking op andere gespecificeerde ziekten en aandoeningen [Z03.8],779
AMC,"Hulpverlening in verband met voortplanting, niet gespecificeerd [Z31.9]",706
AMC,"Controle van normale zwangerschap, niet gespecificeerd [Z34.9]",612
AMC,"Angina pectoris, niet gespecificeerd [I20.9]",548
AMC,"Pijn op borst, niet gespecificeerd [R07.4]",505
AMC,Niet gespecificeerd letsel van hoofd [S09.9],503
AMC,"Observatie in verband met verdenking op ziekte of aandoening, niet gespecificeerd [Z03.9]",464


# COVID

### COVID-19, virus geïdentificeerd \[U07.1]

In [8]:
def add_colname(df,lbl):
    """
    TBD
    """
    return pd.concat([df], keys=[lbl], axis=1).swaplevel(axis=1)

In [9]:
caption = "COVID data overview ('COVID-19, virus geïdentificeerd [U07.1]')"
label = "2020_covid_overview"

def find_nunique(hospital, df, diag_df):
    diag_df = diag_df.query("ICD10_diagnose == 'COVID-19, virus geïdentificeerd [U07.1]'")
    return pd.Series([
        diag_df.MDN.nunique(),
        df.query("MDN.isin(@diag_df.MDN)").NotitieID.nunique(),
    ], index=['patients', 'notes'],
    name=hospital)

add_label = lambda df,lbl: pd.concat([df], keys=[lbl], axis=1)

all_covid = pd.concat([
    find_nunique('AMC', amc, diag_amc),
    find_nunique('VUMC', vumc, diag_vumc),
], axis=1).rename_axis('n_unique').assign(total=lambda df: df.sum(axis=1))

ann_covid = annotated.pivot_table(
    index='institution',
    values=['MDN', 'NotitieID'],
    aggfunc='nunique',
).T.rename({'MDN': 'patients', 'NotitieID': 'notes'}
).rename(columns=str.upper
).rename_axis('n_unique'
).rename_axis(None, axis=1
).assign(total=lambda df: df.sum(axis=1))

rst_covid = all_covid - ann_covid

pd.concat([
    all_covid.pipe(add_label, 'all'),
    ann_covid.pipe(add_label, 'annot'),
    rst_covid.pipe(add_label, 'rest'),
], axis=1).swaplevel(axis=1
).reindex(['AMC', 'VUMC', 'total'], axis=1, level=0
).T.pipe(show_latex, caption, label)

Unnamed: 0,n_unique,patients,notes
AMC,all,255,21181
AMC,annot,31,941
AMC,rest,224,20240
VUMC,all,227,21435
VUMC,annot,29,746
VUMC,rest,198,20689
total,all,482,42616
total,annot,60,1687
total,rest,422,40929


In [10]:
prefix = '2020'
for idx, table in enumerate(TABLES):
    with open(f'./tables/{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
        f.write(table)