# COVID-19 data

- Notes of patients with the diagnosis `COVID-19, virus geïdentificeerd \[U07.1]`.
- The data is from 2020 and Q1 of 2021.
- The data is from the two locations of the Amsterdam UMC: `amc` and `vumc`.

In [1]:
import spacy
import pandas as pd

import sys
sys.path.insert(0, '../..')
from pathlib import Path
from utils.config import PATHS
from utils.data_process import concat_annotated, fix_week_14, anonymize

# Load data

In [2]:
# Resolve the root folder of the raw data from the project configuration.
datapath = PATHS.getpath('data')

# Read both pickled exports of U07.1 notes in one pass:
# cov1 comes from the 2020 Q1-Q3 file, cov2 from the 2020 Q4 / 2021 Q1 file.
cov1, cov2 = (
    pd.read_pickle(datapath / relative_path)
    for relative_path in (
        '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl',
        '2020-Q4_2021-Q1_raw/ICD_U07.1/notes_[U07.1]_2020_q4_2021_q1.pkl',
    )
)

# Stack the two exports under a fresh integer index, then keep only the first
# row of every (MDN, NotitieID, all_text) combination.
df = pd.concat([cov1, cov2], ignore_index=True)
df = df.drop_duplicates(subset=['MDN', 'NotitieID', 'all_text'])

# Mark annotated

In [3]:
def isin_multicol(
    df1 : pd.DataFrame,
    df2 : pd.DataFrame,
    *args : str
) -> pd.Series:
    """Flag the rows of `df1` whose values in the given columns also occur in `df2`.

    The columns are compared jointly: a row of `df1` matches only when its
    full combination of values is present in some row of `df2`.

    Parameters
    ----------
    df1 : pd.DataFrame
        Frame whose rows are tested.
    df2 : pd.DataFrame
        Frame that provides the combinations to look up.
    *args : str
        Names of the columns to compare; each must exist in both frames.

    Returns
    -------
    pd.Series
        Boolean Series aligned with `df1.index`; True where the row's
        combination of values is found in `df2`.
    """
    cols = list(args)
    lookup = df2.set_index(cols).index
    mask = df1.set_index(cols).index.isin(lookup)
    # Index.isin returns a bare NumPy array; wrap it so the result honours the
    # declared return type and carries df1's row labels.
    return pd.Series(mask, index=df1.index)

In [4]:
# Identifiers of the annotated notes; both ID columns are read as strings.
annotated = pd.read_csv(
    datapath / 'annotated_notes_ids.csv',
    dtype={'MDN': str, 'NotitieID': str},
)

# A note counts as annotated when its (institution, MDN, NotitieID)
# combination is listed in `annotated`.
df['annotated'] = isin_multicol(df, annotated, 'institution', 'MDN', 'NotitieID')

# Process annotations and add gold labels

In [5]:
# From here on `datapath` points at the 'data_expr_sept' location, which is
# passed to the annotation loader below.
datapath = PATHS.getpath('data_expr_sept')

# Domain codes; their order is also the order used to index the per-sentence
# domain predictions later in the notebook.
domains=['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM']
# One level column per domain, e.g. 'ADM_lvl'.
levels = [f"{domain}_lvl" for domain in domains]
# Note-level identifier columns that are carried over next to the labels.
info_cols = ['institution', 'MDN']
# Additional flag columns whose NaNs are filled with False in standardize_nans.
other = ['target', 'background', 'plus']

def add_boolean_selectors(df):
    """Return a copy of `df` extended with three group-level boolean flags.

    - `background_sent`: True for every row of a sentence (`sen_id`) in which
      any row has a truthy `background`.
    - `target_sent`: same, based on `target`.
    - `disregard_note`: True for every row of a note (`NotitieID`) in which
      any row has a truthy `disregard`.
    """
    per_sentence = df.groupby('sen_id')
    per_note = df.groupby('NotitieID')
    return df.assign(
        background_sent=per_sentence['background'].transform('any'),
        target_sent=per_sentence['target'].transform('any'),
        disregard_note=per_note['disregard'].transform('any'),
    )

def standardize_nans(df):
    """Replace missing values in the annotation columns with fixed defaults.

    The domain and `other` flag columns get False, `label` and `relation`
    get '_', and `token` gets the empty string. `df` is modified in place and
    also returned, so the function can be used in a `.pipe` chain.
    """
    multi_column_fills = (
        (domains + other, False),
        (['label', 'relation'], '_'),
    )
    for columns, filler in multi_column_fills:
        df[columns] = df[columns].fillna(filler)
    df['token'] = df['token'].fillna('')
    return df

def load_and_preprocess_data(datapath):
    """Load the annotated data from `datapath` and run the preprocessing steps.

    Steps, in order: `concat_annotated` (loading), `fix_week_14`,
    `add_boolean_selectors` and `standardize_nans`. The first two are
    imported from `utils.data_process`; their behaviour is defined there.
    """
    annotations = concat_annotated(datapath)
    annotations = fix_week_14(annotations)
    annotations = add_boolean_selectors(annotations)
    return standardize_nans(annotations)

def create_note_level_labels(df, info_cols, levels):
    """Aggregate row-level annotations to one row per note.

    For every `NotitieID`, the first non-null value of each column in
    `info_cols` is kept and each column in `levels` is averaged (NaNs are
    skipped by the mean). `NotitieID` is returned as a regular column,
    followed by `info_cols` and then `levels`.
    """
    per_note = df.groupby('NotitieID')
    note_frames = [
        per_note[info_cols].first(),
        per_note[levels].mean(),
    ]
    return pd.concat(note_frames, axis=1).reset_index()

# Load the annotated data and aggregate it to one row of gold labels per note.
annot = load_and_preprocess_data(datapath)
gold = create_note_level_labels(annot, info_cols, levels)

# Left merge: every note in df is kept; notes without a match in `gold` get
# NaN in the level columns.
df = df.merge(gold, how='left', on=['NotitieID', 'MDN', 'institution'])

# Prepare for ML pipeline

Notes without gold labels are labeled by the ML models. Since this is done on a separate server, the actual functions are not in this notebook; here, only the inputs for the ML models are prepared and their outputs are processed.

In [None]:
# Load the spaCy pipeline 'nl_core_news_lg'; it is used below for
# anonymization and for sentence splitting.
nlp = spacy.load('nl_core_news_lg')

## Anonymize notes

In [None]:
# From here on `datapath` points at the 'covid_data' location.
datapath = PATHS.getpath('covid_data')

# Only the text of the notes that are not marked as annotated goes to the ML models.
to_ml = df.query("not annotated").all_text
# anonymize() is called per note with the spaCy pipeline; only the first
# element of its return value is kept.
notes = to_ml.apply(lambda i: anonymize(i, nlp)[0])
# notes.to_pickle(datapath / 'intermediate_files/anonym_notes.pkl')

## Split to sentences

In [None]:
def to_sentence(i):
    """Split one note into a list of sentence strings using the spaCy pipeline."""
    return [str(sentence) for sentence in nlp(i).sents]

# One row per sentence in a 'text' column; reset_index moves the row label of
# the originating note into a regular column.
sents = notes.apply(to_sentence).explode().rename('text').reset_index()
# sents.to_pickle(datapath / 'intermediate_files/sents.pkl')

In [48]:
# Every sentence is assigned to a batch of at most 200,000 rows (row index
# floor-divided by 200000); the batches are sent to the domain classifier.

sents = sents.assign(chunk=lambda frame: frame.index // 200000)

for chunk in sents.chunk.unique():
    # NOTE(review): batch 0 is never written here — presumably it was exported
    # earlier; confirm before re-running.
    if chunk != 0:
        boolean = sents.chunk == chunk
        # The label slice keeps the columns up to and including 'text'.
        sents.loc[boolean, :'text'].to_pickle(f"sents_part{chunk}.pkl")

## Process domain predictions for the level classifiers

In [6]:
# this file is the concatenation of the labeled batches

dom_preds = pd.read_pickle(datapath / 'intermediate_files/sents_with_dom_preds.pkl')
# name of the column in dom_preds that holds the domain predictions
pred_col = 'pred_domains_eb_ap_mod1'

In [9]:
# for each domain, sentences that were labeled as this domain are sent to the level classifier

for i, dom in enumerate(domains):
    # Each prediction is indexed by position; position i is taken to belong to
    # domains[i] (presumably one flag per domain — confirm against the model output).
    boolean = dom_preds[pred_col].apply(lambda x: bool(x[i]))
    results = dom_preds[boolean]
    if results.empty:
        # Nothing to send to the level classifier for this domain.
        print(f"no sentences predicted for domain {dom}; skipping")
        continue
    print(f"{dom}: {len(results)=}")
    # One pickle per domain, written to the current working directory.
    results.to_pickle(f"lvl_preds_{dom}.pkl")

ADM: len(results)=107421
ATT: len(results)=1047
BER: len(results)=2212
ENR: len(results)=10671
ETN: len(results)=31783
FAC: len(results)=21729
INS: len(results)=9794
MBW: len(results)=5292
STM: len(results)=20165


# Process level predictions

In [25]:
# Start from a copy of the sentence-level domain predictions and add one
# '<DOM>_lvl' column for every 'lvl_preds_*.pkl' file in the current working directory.
dom_lvl_preds = dom_preds.copy()
for f in Path().glob('lvl_preds_*.pkl'):
    # The last three characters of the file stem are used as the domain code.
    col = f"{f.stem[-3:]}_lvl"
    # Left merge on the index: rows that are absent from this file get NaN in `col`.
    dom_lvl_preds = dom_lvl_preds.merge(
        pd.read_pickle(f)[col],
        how='left',
        left_index=True,
        right_index=True,
    )

# Aggregate to note-level

In [31]:
# Average every level column over the rows that share the same 'index' value.
# NOTE(review): 'index' is presumably the row label of the originating note in
# df (created when the sentences were exploded) — confirm.
predictions = dom_lvl_preds.groupby('index')[levels].mean()

# Merge back to the original df

In [37]:
# Attach the note-level predictions to the notes that are not annotated, put
# the annotated notes back underneath, and restore the original row order.
# The label slice :'annotated' keeps only the columns up to and including
# 'annotated', so the level columns come from `predictions` for these rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    df.loc[~df.annotated, :'annotated'].merge(
        predictions,
        how='left',
        left_index=True,
        right_index=True,
    ),
    df.loc[df.annotated],
]).sort_index()

# Save

In [49]:
# df.drop(columns='all_text').to_pickle(datapath / 'covid_data.pkl')
# df.drop(columns='all_text').to_csv(datapath / 'covid_data.tsv', sep='\t', index=False)

# df.to_pickle(datapath / 'covid_data_with_text.pkl')
# df.to_csv(datapath / 'covid_data_with_text.tsv', sep='\t', index=False)