In [1]:
import os, fnmatch
import pandas as pd


DATA_FILES_PATH='data/physionet.org.nosync/files/mimiciii/1.4/'
FILE_PATTERN="*.csv"

In [2]:
listOfFiles = os.listdir(DATA_FILES_PATH)
listOfFiles.sort()
listOfCSVs = [ filename for filename in listOfFiles if fnmatch.fnmatch(filename, FILE_PATTERN)]
print(listOfCSVs)

['ADMISSIONS.csv', 'CALLOUT.csv', 'CAREGIVERS.csv', 'CHARTEVENTS.csv', 'CPTEVENTS.csv', 'DATETIMEEVENTS.csv', 'DIAGNOSES_ICD.csv', 'DRGCODES.csv', 'D_CPT.csv', 'D_ICD_DIAGNOSES.csv', 'D_ICD_PROCEDURES.csv', 'D_ITEMS.csv', 'D_LABITEMS.csv', 'ICUSTAYS.csv', 'INPUTEVENTS_CV.csv', 'INPUTEVENTS_MV.csv', 'LABEVENTS.csv', 'MICROBIOLOGYEVENTS.csv', 'NOTEEVENTS.csv', 'OUTPUTEVENTS.csv', 'PATIENTS.csv', 'PRESCRIPTIONS.csv', 'PROCEDUREEVENTS_MV.csv', 'PROCEDURES_ICD.csv', 'SERVICES.csv', 'TRANSFERS.csv']


In [14]:
'ADMISSIONS.csv'.split('.')[0]

'ADMISSIONS'

In [None]:
for file in listOfCSVs:
    exec(file.split('.')[0]+'_DF=pd.read_csv("'+ DATA_FILES_PATH+file + '")')

In [3]:
#Convert to Parque files
from pyspark.sql import SparkSession

spark = SparkSession.builder \
  .master("local[*]") \
  .appName("Gaurav_EDA") \
  .getOrCreate()



In [4]:
for file in listOfCSVs:
    print('Working on file '+file)
    df = spark.read.csv(DATA_FILES_PATH+file, header = True)
    print("Read into df")
    parque_filename = "tmp/"+file.split('.')[0]
    print('Writing into parquet at '+parque_filename)
    df.write.mode('overwrite').parquet(parque_filename)

Working on file ADMISSIONS.csv
Read into df
Writing into parquet at tmp/ADMISSIONS
Working on file CALLOUT.csv
Read into df
Writing into parquet at tmp/CALLOUT
Working on file CAREGIVERS.csv
Read into df
Writing into parquet at tmp/CAREGIVERS
Working on file CHARTEVENTS.csv
Read into df
Writing into parquet at tmp/CHARTEVENTS
Working on file CPTEVENTS.csv
Read into df
Writing into parquet at tmp/CPTEVENTS
Working on file DATETIMEEVENTS.csv
Read into df
Writing into parquet at tmp/DATETIMEEVENTS
Working on file DIAGNOSES_ICD.csv
Read into df
Writing into parquet at tmp/DIAGNOSES_ICD
Working on file DRGCODES.csv
Read into df
Writing into parquet at tmp/DRGCODES
Working on file D_CPT.csv
Read into df
Writing into parquet at tmp/D_CPT
Working on file D_ICD_DIAGNOSES.csv
Read into df
Writing into parquet at tmp/D_ICD_DIAGNOSES
Working on file D_ICD_PROCEDURES.csv
Read into df
Writing into parquet at tmp/D_ICD_PROCEDURES
Working on file D_ITEMS.csv
Read into df
Writing into parquet at tmp/D_

In [10]:
def getDFFromParquet(filename):
    return spark.read.parquet('tmp/'+filename)

def createTempTableFromParquet(filename):
    df = getDFFromParquet(filename)
    df.createOrReplaceTempView(filename)

In [6]:
ADMISSIONS_DF = getDFFromParquet('ADMISSIONS')

In [13]:
createTempTableFromParquet('ADMISSIONS')

In [18]:
spark.sql("SELECT count(1) from ADMISSIONS").show()

+--------+
|count(1)|
+--------+
|   58976|
+--------+



In [17]:
ADMISSIONS_DF.first()


Row(ROW_ID='21', SUBJECT_ID='22', HADM_ID='165315', ADMITTIME='2196-04-09 12:26:00', DISCHTIME='2196-04-10 15:54:00', DEATHTIME=None, ADMISSION_TYPE='EMERGENCY', ADMISSION_LOCATION='EMERGENCY ROOM ADMIT', DISCHARGE_LOCATION='DISC-TRAN CANCER/CHLDRN H', INSURANCE='Private', LANGUAGE=None, RELIGION='UNOBTAINABLE', MARITAL_STATUS='MARRIED', ETHNICITY='WHITE', EDREGTIME='2196-04-09 10:06:00', EDOUTTIME='2196-04-09 13:24:00', DIAGNOSIS='BENZODIAZEPINE OVERDOSE', HOSPITAL_EXPIRE_FLAG='0', HAS_CHARTEVENTS_DATA='1')

In [12]:
CHARTEVENTS_DF.printSchema()

root
 |-- ROW_ID: string (nullable = true)
 |-- SUBJECT_ID: string (nullable = true)
 |-- HADM_ID: string (nullable = true)
 |-- ICUSTAY_ID: string (nullable = true)
 |-- ITEMID: string (nullable = true)
 |-- CHARTTIME: string (nullable = true)
 |-- STORETIME: string (nullable = true)
 |-- CGID: string (nullable = true)
 |-- VALUE: string (nullable = true)
 |-- VALUENUM: string (nullable = true)
 |-- VALUEUOM: string (nullable = true)
 |-- ERROR: string (nullable = true)
 |-- RESULTSTATUS: string (nullable = true)
 |-- STOPPED: string (nullable = true)



In [53]:
NOTEEVENTS_DF = getDFFromParquet('NOTEEVENTS')
createTempTableFromParquet('NOTEEVENTS')

In [32]:
NOTEEVENTS_DF.select('TEXT').distinct().count() #899174
spark.sql("SELECT isnull(TEXT),count(1) from NOTEEVENTS group by isnull(TEXT)").show(5)

+--------------+--------+
|(TEXT IS NULL)|count(1)|
+--------------+--------+
|          true|79682649|
|         false| 2116993|
+--------------+--------+



So we have 79M blank notes and only 2M filled. But even from these we have only 900K uniq notes. Rest 1.1M are duplicates. May be some standard notes.

In [40]:
spark.sql("SELECT TEXT,count(1) from NOTEEVENTS where text is not null group by TEXT order by count(1) desc").show(
    n=10,truncate=False)

+-------------------------+--------+
|TEXT                     |count(1)|
+-------------------------+--------+
|PATIENT/TEST INFORMATION:|45789   |
|Chief Complaint:         |39635   |
|TITLE:                   |31702   |
|Demographics             |28199   |
|Respiratory Care         |26460   |
|Neonatology Attending    |25621   |
|NPN                      |19414   |
|Sinus rhythm             |19307   |
|Nursing Progress Note    |16079   |
|Neonatology              |15265   |
+-------------------------+--------+
only showing top 10 rows



Looks like lot of small standard text

In [45]:
spark.sql("SELECT length(length(TEXT)),count(1) from NOTEEVENTS group by length(length(TEXT)) order by length(length(TEXT)) desc").show(
    truncate=False)

+------------------------------------+--------+
|length(CAST(length(TEXT) AS STRING))|count(1)|
+------------------------------------+--------+
|4                                   |1825    |
|3                                   |106118  |
|2                                   |1804053 |
|1                                   |204997  |
|null                                |79682649|
+------------------------------------+--------+



Lets see if repeatative notes yeild different ICD codes

In [52]:
D_ICD_DIAGNOSES_DF = getDFFromParquet('D_ICD_DIAGNOSES')
createTempTableFromParquet('D_ICD_DIAGNOSES')
DIAGNOSES_ICD_DF = getDFFromParquet('DIAGNOSES_ICD')
createTempTableFromParquet('DIAGNOSES_ICD')

In [93]:
spark.sql("""SELECT * from DIAGNOSES_ICD""").show(n=5,truncate=False)

+------+----------+-------+-------+---------+
|ROW_ID|SUBJECT_ID|HADM_ID|SEQ_NUM|ICD9_CODE|
+------+----------+-------+-------+---------+
|1297  |109       |172335 |1      |40301    |
|1298  |109       |172335 |2      |486      |
|1299  |109       |172335 |3      |58281    |
|1300  |109       |172335 |4      |5855     |
|1301  |109       |172335 |5      |4254     |
+------+----------+-------+-------+---------+
only showing top 5 rows



In [59]:
# check if there are some HADM for which ICD codes are missing
df1 = NOTEEVENTS_DF.select('SUBJECT_ID','HADM_ID').distinct()
df2 = DIAGNOSES_ICD_DF.select('SUBJECT_ID','HADM_ID').distinct()

In [61]:
print(df1.count(),df2.count())

5841450 58976


In [86]:
df1.exceptAll(df2).count() # These combination we need to investigate further to see why no ICD codes

5783089

In [65]:
df2.exceptAll(df1).count() # These combination we need exclude from our model as these are dead ends

615

In [69]:
#Lets investigates notes which are missing ICD codes => Looks like non numeric SUBJECT_ID/HADM_ID 
#=> should be excluded from further analysis
spark.sql("""SELECT SUBJECT_ID,HADM_ID from NOTEEVENTS
             where (SUBJECT_ID,HADM_ID) not in
                 (select SUBJECT_ID,HADM_ID
                 from DIAGNOSES_ICD)""").show(n=5,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|SUBJECT_ID                                                                                                                                                                                                                              |HADM_ID                                                                                                                                                                                                                           |
+---------------------------------------------------------------------------

In [85]:
from pyspark.sql import functions as F

NOTEEVENTS_DF.select(
  F.col("subject_id").cast("int").isNotNull().alias("is_subject_id_int"),
    F.col("HADM_ID").cast("int").isNotNull().alias("is_HADM_ID_int")
).crosstab("is_subject_id_int","is_HADM_ID_int" ).show()
#So effectively we have only 1852320 notes

+--------------------------------+--------+-------+
|is_subject_id_int_is_HADM_ID_int|   false|   true|
+--------------------------------+--------+-------+
|                           false|79702214|   5888|
|                            true|  239220|1852320|
+--------------------------------+--------+-------+



In [95]:
spark.sql("""SELECT ne.SUBJECT_ID,ne.HADM_ID,d.ICD9_CODE,TEXT, did.SHORT_TITLE, did.LONG_TITLE
             from NOTEEVENTS ne
             inner join DIAGNOSES_ICD d
             on ne.subject_id=d.subject_id
             and ne.HADM_ID=d.HADM_ID
             inner join D_ICD_DIAGNOSES did
             on did.icd9_code=d.icd9_code""").show(n=50,truncate=False)

+----------+-------+---------+---------------------------------------------------------------------------------------------------------+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|SUBJECT_ID|HADM_ID|ICD9_CODE|TEXT                                                                                                     |SHORT_TITLE             |LONG_TITLE                                                                                                                                                                                                              |
+----------+-------+---------+---------------------------------------------------------------------------------------------------------+------------------------+---------------------------------------------------------------------------------