<a href="https://colab.research.google.com/github/deepw02/comp3931-sc20dpw/blob/prepare_train/pm_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process Mining/Machine Learning to Predict Diagnosis ICD codes


### Authentication & Connecting to MIMIC-III

In [1]:
from google.colab import auth
from google.cloud import bigquery
from google.colab import drive

import pandas as pd
import numpy as np
import sys

# Authenticate the Colab session; this must happen before the BigQuery client is created.
auth.authenticate_user()
# BigQuery client bound to the given project; every client.query(...) call below goes through it.
client = bigquery.Client(project='physionet-data-369222')
# Mount Google Drive at the relative path 'drive', which the CSV load/save cells rely on.
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


### Loading Dataframes from Google Drive

In [2]:
def _load_from_drive(filename):
  """Read one CSV from the project folder on the mounted Google Drive."""
  return pd.read_csv(f'drive/My Drive/comp3931-sc20dpw/{filename}')

# Reload the dataframes from the Drive folder that the "Saving Dataframes" cell copies to.
# NOTE(review): the query cells further down reassign these same names when they are run.
cardiovascular = _load_from_drive('cardiovascular.csv')
filtered_cardiovascular = _load_from_drive('filtered_cardiovascular.csv')
admissions = _load_from_drive('admissions.csv')
discharges = _load_from_drive('discharges.csv')
procedures = _load_from_drive('procedures.csv')
freq_procedures = _load_from_drive('freq_procedures.csv')
labtests = _load_from_drive('labtests.csv')
freq_labtests = _load_from_drive('freq_labtests.csv')
train_features = _load_from_drive('train_features.csv')

### Function Definitions

In [3]:
# Function to filter out records where age > 100 during admission

def filter_age(df):
  """Keep only the rows of `df` whose admission age is at most 100 years.

  Age is computed in BigQuery as DATE_DIFF(admittime, dob, year) over the
  admissions and patients tables joined on subject_id.

  Args:
    df: DataFrame with a `hadm_id` column.

  Returns:
    The rows of `df` whose hadm_id was returned by the query. An input with
    no (non-null) hadm_id yields an empty frame with the same columns and no
    query is sent.
  """

  hadm_ids = df.hadm_id.dropna().astype(int).unique().tolist()

  # Nothing to look up: skip the round trip instead of sending an empty ID list.
  if not hadm_ids:
    return df.iloc[0:0]

  # The IDs are passed as an array query parameter rather than spliced into the
  # SQL text: str(tuple) renders "()" for no IDs and "(x,)" for a single ID.
  filter_age_query = '''
    SELECT
      hadm_id
    FROM
      physionet-data.mimiciii_clinical.admissions,
      physionet-data.mimiciii_clinical.patients
    WHERE
      admissions.subject_id = patients.subject_id AND
      hadm_id IN UNNEST(@hadm_ids) AND
      DATE_DIFF(admittime, dob, year) <= 100
  '''

  job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ArrayQueryParameter('hadm_ids', 'INT64', hadm_ids)]
  )

  result = client.query(filter_age_query, job_config=job_config).to_dataframe()

  return df[df.hadm_id.isin(result.hadm_id)]

In [4]:
# Function to filter out records that don't include any procedures

def filter_procedures(df):
  """Keep only the rows of `df` whose admission appears in procedureevents_mv.

  Args:
    df: DataFrame with a `hadm_id` column.

  Returns:
    The rows of `df` whose hadm_id was returned by the query. An input with
    no (non-null) hadm_id yields an empty frame with the same columns and no
    query is sent.
  """

  hadm_ids = df.hadm_id.dropna().astype(int).unique().tolist()

  # Nothing to look up: skip the round trip instead of sending an empty ID list.
  if not hadm_ids:
    return df.iloc[0:0]

  # The IDs are passed as an array query parameter rather than spliced into the
  # SQL text: str(tuple) renders "()" for no IDs and "(x,)" for a single ID.
  filter_procedures_query = '''
    SELECT DISTINCT
      hadm_id
    FROM
      physionet-data.mimiciii_clinical.procedureevents_mv
    WHERE
      hadm_id IN UNNEST(@hadm_ids)
  '''

  job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ArrayQueryParameter('hadm_ids', 'INT64', hadm_ids)]
  )

  result = client.query(filter_procedures_query, job_config=job_config).to_dataframe()

  return df[df.hadm_id.isin(result.hadm_id)]

### Event Log Creation

In [5]:
# Extracting subject_id, hadm_id & ICD-9 codes of cardiovascular patients
#
# Only the primary diagnosis (seq_num = 1) is considered, and the ICD-9 range
# 390-459 is selected by comparing the first three characters of the code.
# BETWEEN performs a plain string comparison, so a '%' in the bounds is a
# literal character rather than a wildcard: bounds of '390%' / '459%' drop the
# bare code '390' and every code of the form '459x'.

cardiovascular_query = ('''
  SELECT
    hadm_id,
    subject_id,
    icd9_code
  FROM
    physionet-data.mimiciii_clinical.diagnoses_icd
  WHERE
    SUBSTR(icd9_code, 1, 3) BETWEEN '390' AND '459' AND
    seq_num = 1
''')

cardiovascular = client.query(cardiovascular_query).to_dataframe()

# Convert the selected codes from strings to numbers.
cardiovascular['icd9_code'] = pd.to_numeric(cardiovascular['icd9_code'])

In [6]:
# Filter cardiovascular patient records

# Remove records with abnormal ages
filtered_cardiovascular = filter_age(cardiovascular)

# Remove records without any procedures
filtered_cardiovascular = filter_procedures(filtered_cardiovascular)

# Reset dataframe index.
# drop=True discards the old row labels; without it they are kept as an extra
# 'index' column that ends up in the saved CSV.
filtered_cardiovascular = filtered_cardiovascular.reset_index(drop=True)

In [7]:
# Admission IDs of the filtered cohort, in dataframe order.
# Per the original note on this cell, this list contains no repetitions.
hadm_all = tuple(filtered_cardiovascular['hadm_id'].tolist())

# TODO: decide whether subject IDs are needed as well. Per the original note
# they contain repetitions, hence the set() in the disabled line below.
# subject_all = tuple(set(filtered_cardiovascular.subject_id.tolist()))

# Not implemented: hadm_random / subject_random (same open question about
# whether subject IDs are needed).

# hadm_unique is the name the query cells below read.
hadm_unique = hadm_all
# subject_unique = subject_all

In [8]:
# Creating admissions event log
#
# Each returned row is an "Admission" event stamped with admittime, together
# with the admission type, the age at admission in years and the gender.
# The cohort IDs are sent as an array query parameter instead of being written
# into the SQL text, so the query stays valid for any number of IDs.

admissions_query = '''
  SELECT
    hadm_id,
    "Admission" as event,
    admittime as timestamp,
    admission_type,
    DATE_DIFF(admittime, dob, year) as age,
    gender
  FROM
    physionet-data.mimiciii_clinical.admissions,
    physionet-data.mimiciii_clinical.patients
  WHERE
    hadm_id IN UNNEST(@hadm_ids) AND
    admissions.subject_id = patients.subject_id
'''

admissions_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique])
  ]
)

admissions = client.query(admissions_query, job_config=admissions_job_config).to_dataframe()

In [9]:
# Creating discharge/death event log
#
# Each returned row is a "Death" event when deathtime is set and a "Discharge"
# event otherwise; both are stamped with dischtime. expire_flag is 1 for the
# former and 0 for the latter.
# The cohort IDs are sent as an array query parameter instead of being written
# into the SQL text, so the query stays valid for any number of IDs.

discharges_query = '''
    SELECT
      hadm_id,
      IF(deathtime IS NOT NULL, "Death", "Discharge") as event,
      dischtime as timestamp,
      IF(deathtime IS NOT NULL, 1, 0) as expire_flag
    FROM
      physionet-data.mimiciii_clinical.admissions
    WHERE
      hadm_id IN UNNEST(@hadm_ids)
'''

discharges_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique])
  ]
)

discharges = client.query(discharges_query, job_config=discharges_job_config).to_dataframe()

In [10]:
# Query to find n most frequent procedures conducted on cardiovascular patients
#
# Labels are lower-cased with spaces replaced by underscores so they can be
# used in column names later on.
# The cohort IDs are sent as an array query parameter instead of being written
# into the SQL text, and ties in the count are broken by itemid so that the
# selected top-n is the same on every run.

num_procedures = 3

freq_procedures_query = (f'''
  SELECT
    p.itemid,
    COUNT(*) as count_procedures,
    LOWER(REPLACE(label, ' ', '_')) as label
  FROM
    physionet-data.mimiciii_clinical.procedureevents_mv as p,
    physionet-data.mimiciii_clinical.d_items as d
  WHERE
    hadm_id IN UNNEST(@hadm_ids) AND
    d.itemid = p.itemid
  GROUP BY
    p.itemid,
    label
  ORDER BY
    count_procedures DESC,
    p.itemid
  LIMIT
    {num_procedures}
''')

freq_procedures_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique])
  ]
)

freq_procedures = client.query(freq_procedures_query, job_config=freq_procedures_job_config).to_dataframe()

In [11]:
# Creating procedures event log
#
# Each returned row is a "Procedure" event stamped with starttime, restricted
# to the cohort admissions and to the item IDs listed in freq_procedures.
# Both ID lists are sent as array query parameters: written into the SQL text
# as str(tuple), a single item ID (num_procedures = 1) would render as "(x,)".

procedures_query = '''
    SELECT
      hadm_id,
      "Procedure" as event,
      starttime as timestamp,
      LOWER(REPLACE(label, ' ', '_')) as label
    FROM
      physionet-data.mimiciii_clinical.procedureevents_mv as p,
      physionet-data.mimiciii_clinical.d_items as d
    WHERE
      hadm_id IN UNNEST(@hadm_ids) AND
      p.itemid IN UNNEST(@item_ids) AND
      d.itemid = p.itemid
'''

procedures_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique]),
    bigquery.ArrayQueryParameter('item_ids', 'INT64', [int(i) for i in freq_procedures.itemid.tolist()]),
  ]
)

procedures = client.query(procedures_query, job_config=procedures_job_config).to_dataframe()

In [12]:
# Query to find n most frequent lab tests conducted on cardiovascular patients
#
# Labels are lower-cased with spaces replaced by underscores so they can be
# used in column names later on.
# The cohort IDs are sent as an array query parameter instead of being written
# into the SQL text, and ties in the count are broken by itemid so that the
# selected top-n is the same on every run.

num_labtests = 2

freq_labtests_query = (f'''
  SELECT
    lab.itemid,
    COUNT(*) as count_labtests,
    LOWER(REPLACE(label, ' ', '_')) as label
  FROM
    physionet-data.mimiciii_clinical.labevents as lab,
    physionet-data.mimiciii_clinical.d_labitems as d
  WHERE
    hadm_id IN UNNEST(@hadm_ids) AND
    lab.itemid = d.itemid
  GROUP BY
    lab.itemid,
    label
  ORDER BY
    count_labtests DESC,
    lab.itemid
  LIMIT
    {num_labtests}
''')

freq_labtests_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique])
  ]
)

freq_labtests = client.query(freq_labtests_query, job_config=freq_labtests_job_config).to_dataframe()

In [13]:
# Create lab tests event log
#
# Each returned row is a "Labtest" event stamped with charttime, restricted to
# the cohort admissions, to the item IDs listed in freq_labtests and to rows
# with a non-null valuenum. The flag is collapsed to two values: "abnormal"
# when FLAG equals "abnormal", and "normal" for anything else (including NULL).
# Both ID lists are sent as array query parameters: written into the SQL text
# as str(tuple), a single item ID (num_labtests = 1) would render as "(x,)".

labtests_query = '''
  SELECT
    hadm_id,
    "Labtest" as event,
    charttime as timestamp,
    LOWER(REPLACE(label, ' ', '_')) as label,
    IF(FLAG = "abnormal", "abnormal", "normal") as flag
  FROM
    physionet-data.mimiciii_clinical.labevents as lab,
    physionet-data.mimiciii_clinical.d_labitems as d
  WHERE
    hadm_id IN UNNEST(@hadm_ids) AND
    lab.itemid = d.itemid AND
    lab.itemid IN UNNEST(@item_ids) AND
    valuenum IS NOT NULL
'''

labtests_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ArrayQueryParameter('hadm_ids', 'INT64', [int(h) for h in hadm_unique]),
    bigquery.ArrayQueryParameter('item_ids', 'INT64', [int(i) for i in freq_labtests.itemid.tolist()]),
  ]
)

labtests = client.query(labtests_query, job_config=labtests_job_config).to_dataframe()

### Prepare Data to Train Model

In [14]:
# Checking if admissions & discharges table have hadm_id in the same order

def corr_check():
  """Return True when admissions and discharges list the same hadm_ids row for row.

  Reads the module-level `admissions` and `discharges` dataframes. The result
  is False when the frames differ in length or when any position holds a
  different hadm_id.
  """

  if len(admissions) != len(discharges):
    return False

  # Compare the two columns in one pass instead of materialising a row Series
  # per index with iloc.
  return admissions['hadm_id'].tolist() == discharges['hadm_id'].tolist()

corr = corr_check()

if not corr:
  # Stop with an explanation rather than a silent exit.
  sys.exit('admissions and discharges do not list the same hadm_ids in the same order')

In [15]:
# Adding information from admissions & discharges
# Both frames carry a 'timestamp' column, so the merge suffixes them with
# _x (admissions) and _y (discharges); they are renamed right after.

admission_cols = admissions[['hadm_id', 'timestamp', 'admission_type', 'age', 'gender']]
discharge_cols = discharges[['hadm_id', 'timestamp', 'expire_flag']]

train_features = (
  admission_cols
  .merge(discharge_cols, on=['hadm_id'], how='inner')
  .rename(columns={"timestamp_x": "admittime", "timestamp_y": "dischtime"}, errors="raise")
)

In [16]:
# Snapshot of the hadm_id column of train_features as a plain list, in row order;
# the count columns added below are built in this order.

hadm_ordered = train_features['hadm_id'].tolist()

In [17]:
# Adding information from procedures
#
# For every frequent procedure label, add a column count_<label> holding the
# number of rows in `procedures` with that label for each admission.

for label in freq_procedures.label.tolist():

  # Count the rows per admission in one pass over the matching rows, instead of
  # re-scanning the whole procedures frame once per admission.
  per_admission = procedures.loc[procedures['label'] == label, 'hadm_id'].value_counts()

  # Align the counts with the row order of train_features; admissions without
  # a matching row get 0.
  train_features[f'count_{label}'] = per_admission.reindex(hadm_ordered, fill_value=0).to_numpy(dtype='int64')

In [18]:
# Adding information from labtests
#
# For every frequent lab test label and each flag value, add a column
# count_<flag>_<label> holding the number of rows in `labtests` with that
# label and flag for each admission.

for label in freq_labtests.label.tolist():

  # Rows for this lab test, selected once and reused for both flag values.
  label_rows = labtests.loc[labtests['label'] == label]

  for flag in ['abnormal', 'normal']:

    # Count the rows per admission in one pass over the matching rows, instead
    # of re-scanning the whole labtests frame once per admission.
    per_admission = label_rows.loc[label_rows['flag'] == flag, 'hadm_id'].value_counts()

    # Align the counts with the row order of train_features; admissions
    # without a matching row get 0.
    train_features[f'count_{flag}_{label}'] = per_admission.reindex(hadm_ordered, fill_value=0).to_numpy(dtype='int64')

In [21]:
# CELL FOR OUTPUTS
# Render the train_features table to inspect the assembled feature columns.

display(train_features)

Unnamed: 0,hadm_id,admittime,admission_type,age,gender,dischtime,expire_flag,count_chest_x-ray,count_20_gauge,count_ekg,count_abnormal_potassium,count_normal_potassium,count_abnormal_hematocrit,count_normal_hematocrit
0,155231,2109-12-11 22:27:00,EMERGENCY,71,F,2109-12-28 14:13:00,0,0,0,0,0,26,19,0
1,166990,2180-03-30 12:27:00,EMERGENCY,57,M,2180-03-31 12:00:00,1,3,1,1,0,7,4,0
2,105708,2126-09-17 07:15:00,ELECTIVE,66,M,2126-09-23 18:24:00,0,4,0,2,0,8,12,0
3,185488,2164-05-30 21:17:00,EMERGENCY,65,M,2164-06-08 15:45:00,0,2,2,1,0,10,6,3
4,190094,2190-07-03 15:11:00,EMERGENCY,72,F,2190-07-26 16:30:00,0,4,4,0,3,28,19,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6663,165886,2194-04-15 22:40:00,EMERGENCY,84,F,2194-04-25 17:38:00,0,0,1,1,2,10,12,7
6664,148552,2194-05-07 19:18:00,EMERGENCY,84,F,2194-05-13 13:02:00,0,0,2,1,1,7,7,1
6665,178246,2134-10-29 07:15:00,ELECTIVE,39,M,2134-11-03 19:00:00,0,2,1,1,0,5,6,0
6666,156806,2123-09-08 06:45:00,EMERGENCY,66,F,2123-09-24 17:22:00,0,2,3,1,3,23,18,0


### Saving Dataframes to Google Drive

In [20]:
# Saving dataframes to Google Drive

cardiovascular.to_csv('cardiovascular.csv', index=False)
!cp cardiovascular.csv 'drive/My Drive/comp3931-sc20dpw'

filtered_cardiovascular.to_csv('filtered_cardiovascular.csv', index=False)
!cp filtered_cardiovascular.csv 'drive/My Drive/comp3931-sc20dpw'

admissions.to_csv('admissions.csv', index=False)
!cp admissions.csv 'drive/My Drive/comp3931-sc20dpw'

discharges.to_csv('discharges.csv', index=False)
!cp discharges.csv 'drive/My Drive/comp3931-sc20dpw'

procedures.to_csv('procedures.csv', index=False)
!cp procedures.csv 'drive/My Drive/comp3931-sc20dpw'

freq_procedures.to_csv('freq_procedures.csv', index=False)
!cp freq_procedures.csv 'drive/My Drive/comp3931-sc20dpw'

labtests.to_csv('labtests.csv', index=False)
!cp labtests.csv 'drive/My Drive/comp3931-sc20dpw'

freq_labtests.to_csv('freq_labtests.csv', index=False)
!cp freq_labtests.csv 'drive/My Drive/comp3931-sc20dpw'

train_features.to_csv('train_features.csv', index=False)
!cp train_features.csv 'drive/My Drive/comp3931-sc20dpw'