# COVID-19 data

- Notes of patients with the diagnosis `COVID-19, virus geïdentificeerd [U07.1]`.
- The data is from 2020 and Q1 of 2021.
- The data is from the two locations of the Amsterdam UMC: `amc` and `vumc`.

In [1]:
import pandas as pd

import sys
sys.path.insert(0, '..')
from utils.latex import show_latex
from utils.config import PATHS

# Load data

In [2]:
# Resolve the configured 'data' directory; every input file below is located under it.
datapath = PATHS.getpath('data')

# The notes come as two pickles: one for 2020 Q1-Q3 and one for 2020 Q4 through 2021 Q1
# (periods as stated in the file names).
cov1 = pd.read_pickle(datapath / '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
cov2 = pd.read_pickle(datapath / '2020-Q4_2021-Q1_raw/ICD_U07.1/notes_[U07.1]_2020_q4_2021_q1.pkl')

# Stack both periods under a fresh 0..n-1 index, then discard rows that repeat the same
# (MDN, NotitieID, all_text) combination; the first occurrence of each is kept.
df = pd.concat([cov1, cov2], ignore_index=True)
df = df.drop_duplicates(subset=['MDN', 'NotitieID', 'all_text'])

# Mark annotated

In [3]:
def isin_multicol(
    df1 : pd.DataFrame,
    df2 : pd.DataFrame,
    *args
) -> pd.Series:
    """Flag the rows of `df1` whose key-column combination also occurs in `df2`.

    Parameters
    ----------
    df1 : pd.DataFrame
        Frame whose rows are tested.
    df2 : pd.DataFrame
        Frame supplying the key combinations to look for.
    *args
        Labels of the columns that together form the key. Every label must
        be a column of both `df1` and `df2`.

    Returns
    -------
    pd.Series
        Boolean Series with the same index as `df1`; True where the row's
        values in the key columns match at least one row of `df2`.

    Raises
    ------
    ValueError
        If no key columns are given.
    KeyError
        If a key column is missing from either frame (raised by `set_index`).
    """
    cols = list(args)
    if not cols:
        raise ValueError('isin_multicol requires at least one column name')

    # Turning the key columns into an index lets pandas compare whole
    # combinations at once (a MultiIndex when there are several columns).
    left_keys = df1.set_index(cols).index
    right_keys = df2.set_index(cols).index

    # Index.isin returns a bare NumPy array; wrap it so the result is the
    # Series the signature declares and stays aligned with df1's row labels.
    return pd.Series(left_keys.isin(right_keys), index=df1.index)

In [4]:
# Read both identifier columns as strings rather than letting pandas infer a dtype.
id_dtypes = {'MDN': str, 'NotitieID': str}
annotated = pd.read_csv(datapath / 'annotated_notes_ids.csv', dtype=id_dtypes)

# A note counts as annotated when its (institution, MDN, NotitieID) combination
# appears in the annotated-ids file.
df['annotated'] = isin_multicol(df, annotated, 'institution', 'MDN', 'NotitieID')

# Stats

In [5]:
# Per institution and annotation flag: the number of distinct MDN values and the
# number of NotitieID entries, with a 'Total' row and column added by `margins`.
counts = df.pivot_table(
    values=['NotitieID', 'MDN'],
    index=['institution'],
    columns=['annotated'],
    aggfunc={'MDN': 'nunique', 'NotitieID': 'count'},
    margins=True,
    margins_name='Total',
)

# Skip the first two columns before rendering; judging from the rendered output,
# what remains is the MDN total plus the three NotitieID columns.
show_latex(counts.iloc[:, 2:], caption='na', label='na')

Unnamed: 0_level_0,MDN,NotitieID,NotitieID,NotitieID
annotated,Total,False,True,Total
institution,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
amc,719,60678,1314,61992
vumc,577,47722,1067,48789
Total,1290,108400,2381,110781


# Save

In [6]:
# Persist everything except the `all_text` column as a pickle in the working directory.
without_text = df.drop(columns='all_text')
without_text.to_pickle('covid_data.pkl')

In [7]:
# Write the same columns (everything except `all_text`) as a tab-separated file,
# leaving the row index out of the output.
without_text = df.drop(columns='all_text')
without_text.to_csv('covid_data.tsv', sep='\t', index=False)