# Pivot top-100 diagnosis codes and 19 disease groups

Besides the HADM_ID and SUBJECT_ID fields, the output contains:
<ul>
<li> 100 binary columns, one for each of the top-100 ICD9 codes. </li>
<li> OTHER_ICD9_CODES (named `OTHER_CODES` in the pivoted output): the number of distinct ICD9 codes associated with the admission that are not among the top-100 codes (so this is a count column, not a binary one). </li>
<li> 19 grouped ICD9 fields: for example, a 1 in column '001_139' means the admission has at least one ICD9 code in the range 001-139, which covers 'infectious and parasitic diseases'. </li>
<li> OTHER_ICD9_GROUP (shown as the `null` column in the schema printed at the end): any ICD9 code that does not fall under one of the disease-area groups above. </li>
</ul>

## 1. Pre-process data tables

In [None]:
import re
import os
import ast
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import collect_list
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [2]:
# Start (or reuse) a Spark session for this notebook. `spark` and
# `sqlContext` are the handles the later cells use to read files and run SQL.
app_name = "ClickThrough"
master = "local[*]"  # run Spark locally, using all available cores
session_builder = SparkSession.builder.appName(app_name).master(master)
spark = session_builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

Edit the S3 bucket name and path below to match your configuration.

In [3]:
# Location of the input CSV files and of the pivoted output.
# The three parts are concatenated verbatim, with no separator added between
# the bucket name and the path.
S3_PREFIX = 's3://'
S3_BUCKET = '<Enter S3 bucket name>'
S3_PATH = '<Enter S3 path>'
S3_URL = ''.join([S3_PREFIX, S3_BUCKET, S3_PATH])

In [4]:
# Load every source table from S3 and make it available in two ways:
#   * as a Spark SQL temp view named after the file (e.g. DIAGNOSES_ICD)
#   * as a notebook-level DataFrame variable named <NAME>_DF
filename_LIST = ['D_ICD_DIAGNOSES.csv','DIAGNOSES_ICD.csv','NOTEEVENTS.csv']
for filename_LIST_ITEM in filename_LIST:
    # Split on the LAST dot only, so a name containing extra dots still
    # yields a (table name, format) pair.
    filename, fileformat = filename_LIST_ITEM.rsplit('.', 1)
    table_DF = (
        sqlContext.read.format(fileformat)
        .option("header", "true")
        # Quoted fields may span several lines and escape embedded quotes
        # with a double quote (presumably needed for the NOTEEVENTS text).
        .option("multiline", True)
        .option("escape", '"')
        .load(S3_URL + '/' + filename_LIST_ITEM)
    )
    table_DF.createOrReplaceTempView(filename)
    # Bind <NAME>_DF at notebook level without building and exec'ing source
    # code strings, which broke on any quote character in the URL or name.
    globals()[filename + '_DF'] = table_DF

### 1.1 Build HADM level diagnosis flag tables

In [6]:
# Numeric ICD9 groups as (first 3-digit prefix, last 3-digit prefix,
# description). This is the single source of truth for BOTH derived columns
# (ICD_GROUP and ICD_GROUP_CODE), so the two CASE ladders cannot drift apart.
ICD9_NUMERIC_GROUPS = [
    (1, 139, 'infectious and parasitic diseases'),
    (140, 239, 'neoplasms'),
    (240, 279, 'endocrine, nutritional and metabolic diseases, and immunity disorders'),
    (280, 289, 'diseases of the blood and blood-forming organs'),
    (290, 319, 'mental disorders'),
    (320, 389, 'diseases of the nervous system and sense organs'),
    (390, 459, 'diseases of the circulatory system'),
    (460, 519, 'diseases of the respiratory system'),
    (520, 579, 'diseases of the digestive system'),
    (580, 629, 'diseases of the genitourinary system'),
    (630, 679, 'complications of pregnancy, childbirth, and the puerperium'),
    (680, 709, 'diseases of the skin and subcutaneous tissue'),
    (710, 739, 'diseases of the musculoskeletal system and connective tissue'),
    (740, 759, 'congenital anomalies'),
    (760, 779, 'certain conditions originating in the perinatal period'),
    (780, 799, 'symptoms, signs, and ill-defined conditions'),
    (800, 999, 'injury and poisoning'),
]
# Codes starting with 'E' or 'V' share one group.
ICD9_E_V_GROUP_NAME = 'external causes of injury and supplemental classification'
ICD9_E_V_GROUP_CODE = 'E_V'


def _icd9_case_sql(e_v_value, range_values):
    """Return a SQL CASE expression mapping A.ICD9_CODE to a group value.

    e_v_value    -- value produced for codes whose first character is E or V
    range_values -- one value per entry of ICD9_NUMERIC_GROUPS, in order

    The expression has no ELSE branch, so a code matching no branch
    evaluates to NULL.
    """
    branches = [
        "case when substr(A.ICD9_CODE,1,1) in ('E','V') then '{}'".format(e_v_value)
    ]
    for (low, high, _), value in zip(ICD9_NUMERIC_GROUPS, range_values):
        # NOTE(review): BETWEEN compares a string prefix with integer
        # literals and so relies on Spark's implicit string-to-number cast;
        # confirm this holds for the Spark version / ANSI setting in use.
        branches.append(
            "when substr(A.ICD9_CODE,1,3) between {:03d} and {:03d} then '{}'".format(
                low, high, value
            )
        )
    branches.append("end")
    return "\n".join(branches)


icd_group_sql = _icd9_case_sql(
    ICD9_E_V_GROUP_NAME,
    [name for _, _, name in ICD9_NUMERIC_GROUPS],
)
icd_group_code_sql = _icd9_case_sql(
    ICD9_E_V_GROUP_CODE,
    ['{:03d}_{:03d}'.format(low, high) for low, high, _ in ICD9_NUMERIC_GROUPS],
)

# Annotate every DIAGNOSES_ICD row with the code's titles (left join, so rows
# without a dictionary entry are kept) and with its group description
# (ICD_GROUP) and group code (ICD_GROUP_CODE, e.g. '001_139').
spark.sql("""
select A.*, B.SHORT_TITLE, B.LONG_TITLE,
{icd_group} as ICD_GROUP,
{icd_group_code} as ICD_GROUP_CODE
from DIAGNOSES_ICD A
left join D_ICD_DIAGNOSES B
on A.ICD9_CODE = B.ICD9_CODE
""".format(icd_group=icd_group_sql, icd_group_code=icd_group_code_sql)
).createOrReplaceTempView('DIAGNOSES_ICD_WITH_GROUPING')

In [12]:
# Rank ICD9 codes by the number of distinct admissions (HADM_ID) they appear
# in and keep the 100 most frequent as the TOP_100_ICD9 view.
#   * NULL codes are excluded: they can never match the equality join that
#     consumes this view, so they would only waste one of the 100 slots.
#   * ICD9_CODE is a secondary sort key so that ties at the cut-off are
#     resolved the same way on every run.
spark.sql("""
select ICD9_CODE, count(distinct HADM_ID) as FREQUENCY
from DIAGNOSES_ICD_WITH_GROUPING
where ICD9_CODE is not null
group by ICD9_CODE
order by FREQUENCY desc, ICD9_CODE
limit 100
""").createOrReplaceTempView('TOP_100_ICD9')

In [26]:
# One row per distinct (admission, ICD9 code), carrying:
#   ICD_GROUP_CODE -- the code's group (e.g. '001_139')
#   ICD9_CODE      -- the raw code
#   ICD9_CODE_100  -- the raw code if it is in TOP_100_ICD9, else 'OTHER_CODES'
# The raw ICD9_CODE is kept in the DISTINCT projection on purpose: without
# it, all non-top-100 codes of an admission that share a group collapse into
# a single 'OTHER_CODES' row, and a row count over 'OTHER_CODES' would count
# groups instead of distinct codes.
DIAGNOSIS_GROUPING_DF = spark.sql("""
select distinct A.HADM_ID, A.SUBJECT_ID, A.ICD_GROUP_CODE, A.ICD9_CODE,
case when B.ICD9_CODE is not null then A.ICD9_CODE else 'OTHER_CODES' end as ICD9_CODE_100
from DIAGNOSES_ICD_WITH_GROUPING A
left join TOP_100_ICD9 B on A.ICD9_CODE = B.ICD9_CODE
""")

### 1.2 Pivot up Diagnosis group and top-100 codes

In [28]:
# Group flags: one column per ICD_GROUP_CODE value, 1 if the admission has at
# least one code in that group, else 0. The rows are de-duplicated on
# (HADM_ID, SUBJECT_ID, ICD_GROUP_CODE) first; otherwise count() would return
# the number of rows in the group rather than a 0/1 flag.
DIAGNOSIS_ICD_GROUP_PIVOT_DF = (
    DIAGNOSIS_GROUPING_DF
    .select('HADM_ID', 'SUBJECT_ID', 'ICD_GROUP_CODE')
    .distinct()
    .groupBy('HADM_ID', 'SUBJECT_ID')
    .pivot('ICD_GROUP_CODE')
    .count()
    .fillna(0)
)

# Code columns: one column per ICD9_CODE_100 value (each top-100 code plus
# 'OTHER_CODES'), holding the number of DIAGNOSIS_GROUPING_DF rows the
# admission has for that value; missing combinations become 0.
DIAGNOSIS_ICD_PIVOT_DF = (
    DIAGNOSIS_GROUPING_DF
    .groupBy('HADM_ID', 'SUBJECT_ID')
    .pivot('ICD9_CODE_100')
    .count()
    .fillna(0)
)

DIAGNOSIS_ICD_GROUP_PIVOT_DF.createOrReplaceTempView('DIAGNOSIS_ICD_GROUP_PIVOT')
DIAGNOSIS_ICD_PIVOT_DF.createOrReplaceTempView('DIAGNOSIS_ICD_PIVOT')

### 1.3 Combine Pivoted up ICD9 and ICD group

In [29]:
# Combine the group columns and the per-code columns into one table by
# matching rows on HADM_ID and SUBJECT_ID (inner join).
join_keys = ['HADM_ID', 'SUBJECT_ID']
DIAGNOSIS_PIVOT_DF = DIAGNOSIS_ICD_GROUP_PIVOT_DF.join(
    DIAGNOSIS_ICD_PIVOT_DF,
    on=join_keys,
    how='inner',
)

In [35]:
# Write the combined pivot table to S3 as CSV with a header row. The data is
# first brought into a single partition, and any existing output at the
# target path is overwritten.
output_path = S3_URL + '/DIAGNOSIS_PIVOT.csv'
single_partition_DF = DIAGNOSIS_PIVOT_DF.repartition(1)
csv_writer = single_partition_DF.write.format('com.databricks.spark.csv').mode('overwrite')
csv_writer.save(output_path, header=True)

In [33]:
# Print the column names and types of the combined pivot table to check the
# result of the join.
DIAGNOSIS_PIVOT_DF.printSchema()

root
 |-- HADM_ID: string (nullable = true)
 |-- SUBJECT_ID: string (nullable = true)
 |-- null: long (nullable = true)
 |-- 001_139: long (nullable = true)
 |-- 140_239: long (nullable = true)
 |-- 240_279: long (nullable = true)
 |-- 280_289: long (nullable = true)
 |-- 290_319: long (nullable = true)
 |-- 320_389: long (nullable = true)
 |-- 390_459: long (nullable = true)
 |-- 460_519: long (nullable = true)
 |-- 520_579: long (nullable = true)
 |-- 580_629: long (nullable = true)
 |-- 630_679: long (nullable = true)
 |-- 680_709: long (nullable = true)
 |-- 710_739: long (nullable = true)
 |-- 740_759: long (nullable = true)
 |-- 760_779: long (nullable = true)
 |-- 780_799: long (nullable = true)
 |-- 800_999: long (nullable = true)
 |-- E_V: long (nullable = true)
 |-- 00845: long (nullable = true)
 |-- 0389: long (nullable = true)
 |-- 07054: long (nullable = true)
 |-- 2449: long (nullable = true)
 |-- 25000: long (nullable = true)
 |-- 2639: long (nullable = true)
 |-- 2720: 