In [1]:
import pandas as pd
from pathlib import Path
from string import Template
from textwrap import indent

# Load data

In [2]:
# Directory holding the raw notes export and the two 2017 part-files in it.
datapath = Path('/data/notes/vumc/all_data')
raw = [datapath / fname for fname in ('notities_2017_deel1.csv', 'notities_2017_deel2.csv')]
cols = [
    'MDN', 'NotitieID', 'Typenotitie', 'Notitiedatum',
    'Notitietekst1', 'Notitietekst2', 'Notitietekst3',
]

# Column names are passed via `names=`, so every row of the files is read as
# data; 'utf-8-sig' drops a leading BOM if one is present. The parts are
# stacked and fully identical rows are removed, with a fresh 0..n-1 index.
df = (
    pd.concat([pd.read_csv(path, sep=';', names=cols, encoding='utf-8-sig') for path in raw])
    .drop_duplicates(ignore_index=True)
)
df.shape

(1869079, 7)

In [3]:
# Table of annotated notes (the file name suggests one row per annotated note
# ID -- TODO confirm). Its `NotitieID` column is used further down to select
# the annotated subset of `df`.
annotated = pd.read_csv('../../../Non_covid_data_15oct/from_inception_tsv/annotated_notes_ids.csv')

### Template and function for saving tables to LaTeX

In [6]:
TABLES = list()

def show_latex(df, caption, label, column_format=None):
    template = Template('\n'.join([
        r"\begin{table}[]",
        r"    \centering",
        r"$tabular",
        r"    \caption{$caption}",
        r"    \label{tab:$label}",
        r"\end{table}",
    ]))
    alignment = {'int64': 'r', 'float64': 'r'}
    if column_format is None:
        col_formats = [alignment.get(str(i), 'l') for i in df.dtypes.values]
        idx_formats = ['l'] * df.index.nlevels
        column_format = ''.join(idx_formats + col_formats)
    df = df.applymap(lambda x: f'{x:,}')
    tab = ' ' * 4
    tabular = indent(df.to_latex(column_format=column_format), tab).rstrip('\n')
    table = template.substitute(tabular=tabular, caption=caption, label=label)
    TABLES.append(table)
    return df

# Stats

In [11]:
# Caption and label handed to `show_latex` for the overview table built at the
# end of this cell.
caption = "All data: num patients, num notes"
label = "2017_all_overview"


def find_nunique(df):
    """Count the distinct `MDN` and `NotitieID` values in `df`.

    Returns a Series with the entries 'patients' (unique `MDN`) and
    'notes' (unique `NotitieID`), in that order.
    """
    counts = {
        'patients': df['MDN'].nunique(),
        'notes': df['NotitieID'].nunique(),
    }
    return pd.Series(counts)

def add_label(df, lbl):
    """Put `df` under the column label `lbl`.

    Concatenates the single object along the columns with `lbl` as its key:
    a Series becomes a one-column DataFrame named `lbl`, while a DataFrame
    gets `lbl` as an additional outer column level.
    """
    return pd.concat([df], keys=[lbl], axis=1)

# Named `total` rather than `all` so the builtin `all()` is not shadowed for
# the remainder of the kernel session.
total = find_nunique(df)
# Same counts, restricted to the notes whose ID occurs in `annotated.NotitieID`.
annot = find_nunique(df.query("NotitieID.isin(@annotated.NotitieID)"))
# NOTE(review): for 'patients' this difference counts patients without any
# annotated note, which is not necessarily the number of patients occurring in
# the non-annotated notes -- confirm this is the intended meaning of 'rest'.
rst = total - annot

# One column per subset; the formatted frame returned by `show_latex` is the
# cell's displayed output, and the LaTeX rendering is collected in `TABLES`.
pd.concat([
    total.pipe(add_label, 'all'),
    annot.pipe(add_label, 'annot'),
    rst.pipe(add_label, 'rest'),
], axis=1).pipe(show_latex, caption, label)

Unnamed: 0,all,annot,rest
patients,79970,2456,77514
notes,1869079,3377,1865702


In [12]:
# Write every collected table to ./tables/<prefix>_<position>.tex.
prefix = '2017'
outdir = Path('./tables')
# Create the output directory on first use; without this, opening the first
# file fails with FileNotFoundError on a fresh checkout.
outdir.mkdir(parents=True, exist_ok=True)
for idx, table in enumerate(TABLES):
    with open(outdir / f'{prefix}_{idx}.tex', 'w', encoding='utf8') as f:
        f.write(table)