# Patient selection

This notebook searches through the **Covid** patients from the raw medical notes and selects those with desirable statistical properties for timeseries modelling. This was necessary because (as `analysis_annot_covid.ipynb` shows) the sentence-level annotations resulted in incomplete timeseries across patients.

This selection informed which patients would be subsequently expert-annotated in a novel, continuous method that we developed. Towards the end of this notebook, half of the selected Covid patients have their records converted to Excel-format files with some empty columns for each ICF domain. These spreadsheets were then given to experts for annotation.

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import statsmodels
import torch
from matplotlib import pyplot as plt
from tqdm import tqdm as tqdm

# Make graphics nice: render figures inline in the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults: seaborn's default theme, then the "ticks" style and the "paper" context
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

In [2]:
def select_ids(df_diagnoses, search_5=None, search_7=None):
    """Collect the patient ids whose diagnosis rows match any of the given queries.

    Parameters
    ----------
    df_diagnoses : pd.DataFrame
        Diagnoses table with integer column labels. Column 0 is taken as the
        patient id; columns 5 and 7 are the columns that can be searched.
    search_5 : iterable of str, optional
        Values compared for exact equality against column 5.
    search_7 : iterable of str, optional
        Values compared for exact equality against column 7.

    Returns
    -------
    set
        Patient ids (values of column 0) of every row matching at least one
        query. Empty when no queries are given or nothing matches.
    """
    MDN_ids = set()
    # `search_5` used to be accepted but silently ignored; both query lists are
    # now applied to the column they are named after.
    for column, queries in ((5, search_5), (7, search_7)):
        if queries is None:
            continue
        for query in queries:
            matches = df_diagnoses.loc[df_diagnoses[column] == query, 0]
            MDN_ids.update(matches)

    return MDN_ids

In [3]:
# Compile all notes from 2020 across hospitals. Filter for patients with a confirmed covid diagnosis.
# For each hospital: read the diagnoses file and the three quarterly notes files, keep the notes of
# patients whose diagnosis matches `search_7`, print some counts and write the selection to a TSV.

for hospital in ['VUMC', 'AMC']:
    print(f"Hospital: {hospital} ...")
    diagnoses_filepath = f"//data/bestanden 2020/Diagnoses {hospital} 2020 sept.csv"
    notities_filepaths = [
        f"//data/bestanden 2020/Notities {hospital} 2020 Q{q}.csv" for q in [1, 2, 3]
    ]

    # Column holding the patient id (the files are read without a header row,
    # so columns are labelled by position)
    pat_id_column = 0

    # Read in files as pd.DataFrame types
    print("Reading diagnoses...")
    df_diagnoses = pd.read_csv(diagnoses_filepath, sep=';', header=None, encoding='utf-8')
    # Malformed lines are skipped. `on_bad_lines='skip'` replaces `error_bad_lines=False`,
    # which was deprecated in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
    notities_dfs = [
        pd.read_csv(fpath, sep=';', header=None, encoding='utf-8-sig', engine='python', on_bad_lines='skip')
        for fpath in tqdm(notities_filepaths, desc='Quarters')
    ]
    print("\nConcatenating...")
    df_notities = pd.concat(notities_dfs)

    print(df_notities.shape)

    # Search queries
    #search_5 = []# ['acute respiratoire aandoening door SARS-CoV-2', 'infectie met SARS-CoV-2',
                # 'dyspnoe bij infectie met SARS-CoV-2']
    search_7 = ["COVID-19, virus geïdentificeerd [U07.1]"]

    # MDN_ids is patient id
    MDN_ids = select_ids(df_diagnoses, search_7=search_7)

    # Create df with selected MDN ids
    df_selection = df_notities.loc[df_notities[pat_id_column].isin(MDN_ids)]

    # Selected patients that actually have notes; built once because the notes table is large
    ids_with_notes = MDN_ids & set(df_notities[pat_id_column])
    # Guard the average against an empty selection instead of raising ZeroDivisionError
    mean_docs = df_selection.shape[0] / len(ids_with_notes) if ids_with_notes else float('nan')

    # Print statements for counts
    print(search_7)
    print("Aantal patient ids in search", len(MDN_ids))
    print("Patient ids die ook in notities staan", len(ids_with_notes))
    print("Aantal notities van die patienten", df_selection.shape[0])
    print("Gemiddeld aantal documenten per patient", mean_docs)

    # Write to csv (tab-separated; the row index is written as the first column)
    output_filepath = f'../data/covid_notes_{hospital}.tsv'
    df_selection.to_csv(output_filepath, sep='\t')

Hospital: VUMC ...
Reading diagnoses...


Quarters: 100%|██████████| 3/3 [01:41<00:00, 33.82s/it]



Concatenating...
(1132728, 8)
['COVID-19, virus geïdentificeerd [U07.1]']
Aantal patient ids in search 227
Patient ids die ook in notities staan 227
Aantal notities van die patienten 21612
Gemiddeld aantal documenten per patient 95.20704845814979
Hospital: AMC ...
Reading diagnoses...


Quarters: 100%|██████████| 3/3 [02:19<00:00, 46.54s/it]



Concatenating...
(1515300, 8)
['COVID-19, virus geïdentificeerd [U07.1]']
Aantal patient ids in search 255
Patient ids die ook in notities staan 255
Aantal notities van die patienten 21422
Gemiddeld aantal documenten per patient 84.00784313725491


In [4]:
# Column names applied to both exported note tables.
# 'indexer' is presumably the row index that was written out together with the TSV — verify.
cols = ['indexer', 'patient_id', 'note_id', 'note_csn', 'type', 'date', 'note', 'other1', 'other2']

# Load the Covid notes of each centre, then give both tables the same column names
df_amc, df_vumc = (
    pd.read_csv(path, sep='\t')
    for path in ('../data/covid_notes_AMC.tsv', '../data/covid_notes_VUMC.tsv')
)
for frame in (df_amc, df_vumc):
    frame.columns = cols

In [None]:
# Show the last rows of the AMC notes table as a quick sanity check of the load
df_amc.tail()

In [None]:
# Show the first rows of the VUMC notes table as a quick sanity check of the load
df_vumc.head()

In [7]:
# Per-centre overview: number of patients, number of distinct note ids,
# and the distribution of the number of notes per patient.
# NOTE: the loop variables `centre` and `df` stay bound to the last pair after the loop.
for centre, df in [('AMC', df_amc), ('VUMC', df_vumc)]:
    notes_per_patient = df.groupby('patient_id')['note_id'].count()
    print(centre)
    print('Unique patients:', df['patient_id'].nunique())
    print('Unique notes:', df['note_id'].nunique())
    print('Notes per patient:')
    print(notes_per_patient.describe())
    print()

AMC
Unique patients: 255
Unique notes: 21181
Notes per patient:
count    255.000000
mean      84.007843
std      101.897354
min        1.000000
25%       22.500000
50%       49.000000
75%      100.000000
max      607.000000
Name: note_id, dtype: float64

VUMC
Unique patients: 227
Unique notes: 21435
Notes per patient:
count    227.000000
mean      95.207048
std      110.342774
min        3.000000
25%       30.000000
50%       57.000000
75%      122.000000
max      806.000000
Name: note_id, dtype: float64



In [None]:
# Unify the dataframes and remove empty columns
# Tag every row with its centre of origin before stacking the two tables
for frame, label in ((df_vumc, 'VUMC'), (df_amc, 'AMC')):
    frame['centre'] = label

# Stack VUMC on top of AMC, order the rows by the 'date' column and drop the two unused columns
df = (
    pd.concat([df_vumc, df_amc])
    .sort_values('date')
    .drop(columns=['other1', 'other2'])
)
df

# Analysis and Patient Selection

In [9]:
# Most frequent note types in the combined table
df['type'].value_counts().head()

Zorgplan/VPK rapportage    15441
Voortgangsverslag          13012
Brief                       2774
Familiegesprek              1846
Consulten                   1678
Name: type, dtype: int64

In [10]:
def date_range(dates):
    """Return the time between the first and the last entry of ``dates``.

    No sorting is done here, so the result is the full time span only when
    ``dates`` is already in chronological order.
    """
    ordered = list(dates)
    first, last = ordered[0], ordered[-1]
    return pd.Timedelta(last - first)

def date_spread(dates):
    """Return the mean time difference between consecutive entries of ``dates``.

    Differences are taken in the given order (no sorting is done here).

    Returns ``pd.NaT`` when there are fewer than two dates, i.e. when there is
    no gap to average. Previously this case went through ``np.mean([])``, which
    emitted a RuntimeWarning and returned a float ``nan``.
    """
    dates = list(dates)
    if len(dates) < 2:
        return pd.NaT
    diffs = [pd.Timedelta(t2 - t1) for t1, t2 in zip(dates, dates[1:])]
    # Arithmetic mean of the gaps: sum of Timedeltas divided by their count
    return sum(diffs, pd.Timedelta(0)) / len(diffs)

def is_amc(strs):
    """Return 1 when the first entry of ``strs`` equals 'AMC', otherwise 0.

    Only the first entry is inspected.
    """
    values = list(strs)
    return int(values[0] == 'AMC')

# Make a dataframe for generating our patient-selection features.
# One row per patient: number of distinct note ids, the date_range / date_spread of the notes,
# the first and last date value, and the is_amc centre flag.
# NOTE: 'first', 'last' and the two custom date aggregators follow the row order of `df` within each patient.
_df = (
    df.assign(date=pd.to_datetime(df['date']))
    .groupby(['patient_id'])
    .agg({
        'note_id': 'nunique',
        'date': [date_range, date_spread, 'first', 'last'],
        'centre': is_amc,
    })
)
_df.describe()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0_level_0,note_id,date,date,centre
Unnamed: 0_level_1,nunique,date_range,date_spread,is_amc
count,478.0,478,477,478.0
mean,88.711297,117 days 07:40:55.230125524,3 days 07:59:05.067745149,0.529289
std,105.645864,80 days 05:19:37.953628645,6 days 04:22:44.429950911,0.499664
min,1.0,0 days 00:00:00,0 days 00:00:00,0.0
25%,27.0,33 days 00:00:00,0 days 13:02:36.521739130,0.0
50%,51.5,130 days 00:00:00,1 days 15:50:24,1.0
75%,112.0,173 days 18:00:00,3 days 17:29:29.491525423,1.0
max,800.0,267 days 00:00:00,87 days 00:00:00,1.0


In [11]:
# Average first and average last date across patients, from the per-patient feature table
_df[('date', 'first')].mean(), _df[('date', 'last')].mean()

(Timestamp('2020-03-30 13:09:17.322175744'),
 Timestamp('2020-07-25 20:50:12.552301056'))

In [12]:
# Keep patients whose features fall inside every one of the ranges below (bounds inclusive).
# The note-count bounds 27 and 112 match the 25% / 75% quartiles printed by `_df.describe()`.
note_count_ok = _df[('note_id', 'nunique')].between(27, 112)
time_span_ok = _df[('date', 'date_range')].between('30 days', '180 days')
mean_gap_ok = _df[('date', 'date_spread')].between('1 days', '4 days')
first_date_ok = _df[('date', 'first')].between('2020-03-15', '2020-08-01')

selection = _df[note_count_ok & time_span_ok & mean_gap_ok & first_date_ok]
selected_patients = selection.reset_index()['patient_id'].unique()
print("Selected patients:", len(selected_patients))
selection.describe()

Selected patients: 84


Unnamed: 0_level_0,note_id,date,date,centre
Unnamed: 0_level_1,nunique,date_range,date_spread,is_amc
count,84.0,84,84,84.0
mean,58.488095,127 days 10:34:17.142857142,2 days 09:03:24.405685688,0.535714
std,20.994832,32 days 16:18:10.891781388,0 days 20:06:19.878978775,0.501718
min,28.0,32 days 00:00:00,1 days 01:27:16.363636363,0.0
25%,42.75,109 days 06:00:00,1 days 16:24:01.139240506,0.0
50%,54.5,135 days 12:00:00,2 days 08:11:41.451990632,1.0
75%,73.0,153 days 06:00:00,3 days 01:10:00,1.0
max,112.0,178 days 00:00:00,3 days 23:15:00,1.0


In [13]:
# Filter overall dataset by the specified patients
is_selected = df['patient_id'].isin(selected_patients)
df_subjects = df.loc[is_selected]
print(df_subjects['patient_id'].nunique())

# Candidate patient ids, in order of first appearance in the filtered notes
candidate_ids = list(df_subjects['patient_id'].unique())
print(candidate_ids)

84
[1825193, 1824900, 4883738, 1825349, 1683660, 9172964, 1826096, 1825973, 1826295, 6308072, 1826470, 1826708, 1035156, 8297685, 3286406, 4099178, 1776065, 4174967, 1827868, 1516667, 1827925, 1828106, 1828353, 1827855, 1828039, 6403079, 7474489, 7391983, 1828584, 8227934, 6779382, 3007670, 1828811, 1829083, 8012257, 1228841, 1829326, 1829557, 7704030, 1830231, 1830250, 1830607, 772676, 2974474, 9058241, 1830568, 8293683, 7052538, 1830514, 3174190, 9426094, 1628243, 1831152, 3949143, 8581219, 7002288, 1833136, 4715790, 1733802, 7594581, 1834688, 1834711, 1834253, 957223, 2727819, 1701305, 4190540, 1836190, 1245032, 7283210, 2977287, 7651109, 4667364, 1810046, 6260057, 7843710, 8555667, 8920715, 3850656, 8776560, 1845127, 7256295, 1847708, 1848618]


In [14]:
# Remove patients that were in the training dataset for NLP models

used_patients = [1828641, 1185845, 1831037, 4326354, 7984077, 2149576, 5019042, 8381579, 8089852, 2305497, 1835614, 1832701, 1202301, 2307176, 3728030, 5848191, 3259094, 1829964, 1829556, 1833261, 3205083, 1830570, 1830183, 1695264, 1049919, 762325, 5704617, 1832313, 6235501, 1830682, 4685903, 7205475, 4696494, 2850556, 1829957, 1834390, 1836576, 1827821, 1550849, 4452515, 9817236, 2919660, 1812801, 9816148, 2900519, 1877479, 5356404, 9220100, 1829928, 4002536, 9256011, 1835625, 1188552, 8770138, 1830212, 9556081, 4321077, 1610429, 1834150, 8866717]

# Filter in a single pass with a set lookup. Unlike repeated list.remove(), this drops every
# occurrence of a used id and builds a new list rather than mutating the existing one in place.
used_patient_set = set(used_patients)
candidate_ids = [pid for pid in candidate_ids if pid not in used_patient_set]

print(len(candidate_ids))
print(candidate_ids)

84
[1825193, 1824900, 4883738, 1825349, 1683660, 9172964, 1826096, 1825973, 1826295, 6308072, 1826470, 1826708, 1035156, 8297685, 3286406, 4099178, 1776065, 4174967, 1827868, 1516667, 1827925, 1828106, 1828353, 1827855, 1828039, 6403079, 7474489, 7391983, 1828584, 8227934, 6779382, 3007670, 1828811, 1829083, 8012257, 1228841, 1829326, 1829557, 7704030, 1830231, 1830250, 1830607, 772676, 2974474, 9058241, 1830568, 8293683, 7052538, 1830514, 3174190, 9426094, 1628243, 1831152, 3949143, 8581219, 7002288, 1833136, 4715790, 1733802, 7594581, 1834688, 1834711, 1834253, 957223, 2727819, 1701305, 4190540, 1836190, 1245032, 7283210, 2977287, 7651109, 4667364, 1810046, 6260057, 7843710, 8555667, 8920715, 3850656, 8776560, 1845127, 7256295, 1847708, 1848618]


In [15]:
# Let's split that into two sets (one for annotated test set, one for train set)
from sklearn.model_selection import train_test_split

# Even 50/50 split of the candidate patients; the fixed random_state makes it repeatable
train_ids, test_ids = train_test_split(candidate_ids, test_size=0.5, random_state=42)

for heading, ids in (('Train\n', train_ids), ('\n\nTest\n', test_ids)):
    print(heading, ids)

len(train_ids), len(test_ids)

Train
 [1834253, 1825349, 4174967, 7704030, 1826295, 4190540, 1826096, 1829326, 7843710, 1834711, 9426094, 8293683, 1847708, 4099178, 7391983, 1830607, 7474489, 1830514, 1828039, 2974474, 1845127, 8920715, 4715790, 1828811, 8776560, 7594581, 957223, 4667364, 1829557, 8227934, 1824900, 1831152, 1828106, 4883738, 1827855, 3850656, 6260057, 1827925, 1834688, 7651109, 3286406, 1628243]


Test
 [1810046, 1825193, 1733802, 1828353, 1035156, 2977287, 1826470, 1827868, 1683660, 1701305, 1830250, 7052538, 1228841, 7256295, 7002288, 8581219, 1829083, 8555667, 772676, 3174190, 1826708, 6779382, 1830231, 1828584, 6308072, 1245032, 9172964, 1830568, 3007670, 8012257, 9058241, 1848618, 1776065, 2727819, 1825973, 1836190, 1833136, 1516667, 7283210, 6403079, 3949143, 8297685]


(42, 42)

In [17]:
# Write all notes of the patients in `train_ids` to a tab-separated file, without the row index
in_train = df['patient_id'].isin(train_ids)
df.loc[in_train].to_csv('../data/timeseries_covid_train.tsv', sep='\t', index=False)

# Exporting the notes for selected patients to Excel files

In [38]:
EXCEL_DIR = '../sheets'
# Make sure the output directory exists before writing any sheets into it
Path(EXCEL_DIR).mkdir(parents=True, exist_ok=True)

# Write one annotation spreadsheet per patient in `test_ids`.
# Loop-local names (`sheet`, `note_text`, ...) are deliberately distinct from the notebook-level
# `_df` and `cols`, so running this cell no longer overwrites the frames built in earlier cells.
for pid in tqdm(test_ids):
    # Select the patient's data, keep only the exported columns and sort by the 'date' column
    sheet = (
        df.loc[df.patient_id == pid, ['centre', 'patient_id', 'note_id', 'note_csn', 'date', 'type', 'note']]
        .sort_values('date')
        .reset_index(drop=True)
    )
    # Take the note text out so it can be re-appended as the last column
    note_text = sheet.pop('note')
    # Add empty columns for annotating, one per domain code
    for domain in ['STM', 'INS', 'BER', 'FAC']:
        sheet[f"{domain}_level"] = np.nan
    sheet['note'] = note_text

    # Get the medical centre (taken from the patient's first row)
    medical_centre = str(sheet['centre'].values[0])

    # Export to excel file
    # NOTE(review): the sheet has 11 columns, so freeze_panes=(1, 11) appears to freeze the
    # 'note' column as well as the header row — confirm this is intended.
    sheet.to_excel(f"{EXCEL_DIR}/annot_{pid}_{medical_centre}.xlsx", index=False, freeze_panes=(1, 11))

100%|██████████| 42/42 [00:03<00:00, 13.71it/s]


# Testing the EXCEL export

In [None]:
# Build the annotation sheet for a single patient, the same way as in the export loop
TEST_ID = 3449606
_df = (
    df.loc[df.patient_id == TEST_ID, ['centre', 'patient_id', 'note_id', 'note_csn', 'date', 'type', 'note']]
    .sort_values('date')
    .reset_index(drop=True)
)
# Empty annotation columns, one per domain code
for c in ['STM', 'INS', 'BER', 'FAC']:
    _df[f"{c}_level"] = np.nan
# Put the note text in the last column
cols = [name for name in _df.columns if name != 'note']
_df = _df[cols + ['note']]
_df

In [46]:
# Write the current `_df` to an Excel file (row index omitted) to try out the export
_df.to_excel('../sheets/test_annotatable.xlsx', index=False)

In [None]:
# Read the test Excel file back in and display it to inspect the round trip
df_annot = pd.read_excel('../sheets/test_annotatable.xlsx')
df_annot

In [54]:
# Make a list of all the different types of notes
# Union of the distinct type values of both centres, converted to strings
# (so a missing type shows up as the string 'nan')
distinct_types = set(df_vumc.type.unique()) | set(df_amc.type.unique())
types = [str(t) for t in distinct_types]
print(*sorted(types), sep='\n')

Addendum notitie
Anesthesie pre-op. evaluatie
Anesthesie verrichtingsnotities
Begeleidingsadviezen en afspraken
Brief
Consulten
Consulten (niet-arts)
Crisisplan
Dagprogramma (tijdens opname)
Familiegesprek
Groepstherapie
Informatiekluis
Intake psychiatrie
Lactatie-notitie
MDO voorbereiding
MDO-verslag
Notitie voor beoordeling & plan
OBS Beleid
OK notitie
Observatielijst
Ontslag - Instructies bij problemen/complicaties
Ontslag - Instructies wond/drains/lijnen etc
Ontslag - Medische instructies
Ontslaginstr. - activiteit
Ontslaginstr. - afspraken
Ontslaginstr. - apotheek
Ontslaginstr. - dieet
Ontslaginstr. - overige orders
Ontslagsamenvattingen
Operatie verslag
Patiëntinstructies
Postop. evaluatie anesthesie
Research notitie
SEH arts notitie
SEH verslag
SEH-triagenotities
Signaleringsplan psychiatrie
Telefonisch contact
VK-bevallingsverslag
VPK verloop opname
Verpleegkundig verslag
Verpleegkundige notitie op SEH
Verrichtingen
Voortgangsverslag
Zorgoverdracht
Zorgplan/VPK rapportage
nan
