# Merging annotated Covid data with raw notes

This notebook reads the extracted covid annotations (i.e. the output of `annotation_extraction_cov.ipynb`). It then matches these annotations up with the raw patient records from 2020, using the patient and note ID numbers. This **provides the date-time stamp for each note, which is essential for time-series analysis**.


Be warned, this notebook shows all of the "working out" steps. There were multiple versions of the original patient records with different structures and no column names. A lot of this notebook is just me trying to figure out which columns are which and match them up with the annotated sentences. A sneaky shift-index-by-one issue made this tricky, but it worked in the end.

In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import statsmodels
import torch
from matplotlib import pyplot as plt
from tqdm import tqdm as tqdm

# Make graphics nice: draw matplotlib figures inline in the notebook and
# request the 'retina' figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults: start from seaborn's default settings, then switch
# to the "ticks" style and the 'paper' context.
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

In [None]:
# Read the annotated covid sentences from the tab-separated training file
# and report the (rows, columns) shape of the resulting table.
df_annot = pd.read_csv(
    '~/gianluca_data/data/covid_traindata.tsv',
    delimiter='\t',
)
print(df_annot.shape)

In [None]:
# Look at the first four src_file strings to see how they are structured.
df_annot["src_file"].head(4).values

In [None]:
def parse_source(row):
    """Split a row's ``src_file`` string into its individual fields.

    ``row.src_file`` must consist of exactly eight parts joined by ``"--"``:
    centre, source id, patient id, note id, a further numeric id of unknown
    meaning, date, batch and a trailing suffix (which is discarded).

    Returns
    -------
    tuple
        ``(centre, src_id, patient_id, note_id, mystery_id, date, batch)``.
        The four id fields are converted to ``int``; the source id is
        additionally reduced by one.

    Raises
    ------
    ValueError
        If the string does not split into eight parts, or an id field is not
        an integer literal.
    """
    parts = row.src_file.split("--")
    centre, raw_src, raw_patient, raw_note, raw_mystery, date, batch, _suffix = parts
    # NOTE(review): the -1 presumably compensates for the shift-index-by-one
    # issue mentioned in the notebook introduction -- confirm.
    src_id = int(raw_src) - 1
    return (
        centre,
        src_id,
        int(raw_patient),
        int(raw_note),
        int(raw_mystery),
        date,
        batch,
    )
    
# Run parse_source over every annotation row, turn the resulting tuples into
# a frame with one named column per parsed field, and attach those columns
# to the annotation table side by side.
df_par = pd.DataFrame(
    df_annot.apply(parse_source, axis=1).to_list(),
    columns=[
        'centre',
        'src_id',
        'patient_id',
        'note_id',
        'mystery_id',
        'date',
        'batch',
    ],
)
df_annot = pd.concat([df_annot, df_par], axis=1)
df_annot.sample(5)

In [None]:
# Remove the raw src_file string column from the annotation table.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps
# this cell idempotent: running it a second time is a no-op rather than a
# KeyError for the already-removed column.
df_annot = df_annot.drop(columns='src_file', errors='ignore')

In [None]:
# Preview the first rows of the annotation table.
df_annot.head()

In [None]:
# Load up the raw covid notes

# Directory holding the raw 2020 note exports.
covid_dir = '//data/bestanden 2020/'

# Full path of every quarterly export (AMC and VUMC, Q1-Q3), built by
# prefixing each file name with the directory above.
covid_files = [
    covid_dir + file_name
    for file_name in (
        'Notities AMC 2020 Q3.csv',
        'Notities VUMC 2020 Q1.csv',
        'Notities VUMC 2020 Q2.csv',
        'Notities AMC 2020 Q1.csv',
        'Notities VUMC 2020 Q3.csv',
        'Notities AMC 2020 Q2.csv',
    )
]

def load_notes(fpath):
    """Read one raw notes export into a DataFrame (best effort).

    The file is parsed as semicolon-separated text with the column names
    supplied here rather than taken from the file. The two trailing
    ``other1``/``other2`` columns are discarded and a ``source_file`` column
    holding the file's base name is added.

    Parameters
    ----------
    fpath : str or pathlib.Path
        Location of the export to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded notes with columns ``mdn``, ``note_id``, ``note_csn``,
        ``type``, ``date``, ``note`` and ``source_file``; ``None`` if anything
        went wrong, in which case the failure is printed instead of raised.
    """
    col_names = ['mdn', 'note_id', 'note_csn', 'type', 'date', 'note', 'other1', 'other2']
    try:
        df = pd.read_csv(fpath, sep=';', names=col_names, index_col=False)
        df = df.drop(columns=['other1', 'other2'])
        # Path(...).name yields the final path component for both str and
        # Path inputs, independent of the separator style.
        df['source_file'] = Path(fpath).name
        return df
    except Exception as e:
        # Deliberately best effort: one unreadable export must not abort the
        # whole load, so report it and signal the failure with None.
        print(f"Failed: {fpath}\t{e}")
        return None

# Load every raw export in turn (with a progress bar), keeping the results
# in file order.
dfs = [load_notes(fpath) for fpath in tqdm(covid_files)]

In [None]:
# Stack the per-file frames row-wise into a single table of raw notes,
# then show its shape and the first few rows.
df_notes = pd.concat(dfs, axis=0)
print(df_notes.shape)
df_notes.head()

In [None]:
# Summarise the raw notes table: column dtypes and non-null counts.
df_notes.info()

In [None]:
# Work out which raw-note ID column corresponds to which annotation ID
# column: for every pairing, print how many distinct values they share.
for notes_col in ['mdn', 'note_id', 'note_csn']:
    for annot_col in ['src_id', 'patient_id', 'note_id', 'mystery_id']:
        shared_ids = np.intersect1d(
            df_notes[notes_col].unique(),
            df_annot[annot_col].unique(),
        )
        print(notes_col, annot_col, shared_ids.size)

In [None]:
# Number of distinct patient, note and mystery IDs among the annotations.
tuple(df_annot[id_col].nunique() for id_col in ('patient_id', 'note_id', 'mystery_id'))

In [None]:
# Summarise the annotation table: column dtypes and non-null counts.
df_annot.info()

In [None]:
# Outer-join the annotations with the raw notes to check how well they
# correspond. Rows are paired on note ID, patient ID (called `mdn` in the
# raw notes) and date; unmatched rows from either side are kept.
df_merged = pd.merge(
    df_annot,
    df_notes,
    how='outer',
    left_on=['note_id', 'patient_id', 'date'],
    right_on=['note_id', 'mdn', 'date'],
)
# Save the merged table as a tab-separated file without the index column.
df_merged.to_csv('~/gianluca_data/data/covid_traindata_matched.tsv', sep='\t', index=False)
df_merged.shape

In [None]:
# Print the number of distinct values in every column of the merged table.
for col_name in df_merged:
    n_distinct = df_merged[col_name].nunique()
    print(col_name, n_distinct, sep='\t\t')

In [None]:
# sanity check (output redacted for patient privacy)
# Select the merged rows for one specific patient/note pair, then print the
# annotated sentences next to the raw note text of the first matching row.
is_target = (df_merged.patient_id == 1831037) & (df_merged.note_id == 422549521)
target_rows = df_merged[is_target]
print(target_rows.sentence.values)
print(target_rows.head(1).note.values)