In [1]:
#import necessary packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors

In [2]:
#load in neccesary CSV files
diagnosis_icd9 = pd.read_csv("/project/projectdirs/m1532/Projects_MVP/_datasets/mimiciii/DIAGNOSES_ICD.csv", usecols= ["SUBJECT_ID", "ICD9_CODE"])
diag_name = pd.read_csv("/project/projectdirs/m1532/Projects_MVP/_datasets/mimiciii/D_ICD_DIAGNOSES.csv", usecols= ["ICD9_CODE", "LONG_TITLE"])
code_description = pd.read_csv("/project/projectdirs/m1532/Projects_MVP/_datasets/mimiciii/DRGCODES.csv")
diagnosis_icd9.head()

Unnamed: 0,SUBJECT_ID,ICD9_CODE
0,109,40301
1,109,486
2,109,58281
3,109,5855
4,109,4254


In [3]:
#filter for only "disorders of despair" ICD9 codes
diag_dd_DF = diagnosis_icd9[diagnosis_icd9["ICD9_CODE"].str.startswith(("327", "338", "339", "346", "295", "296", "297", "298", "300", "301", "302", "307", "309", "310", "311", "313"), na = False)]
diag_dd_DF.head()

Unnamed: 0,SUBJECT_ID,ICD9_CODE
22,109,32723
24,109,33829
248,135,311
305,67,311
311,67,311


In [4]:
diag_dd_DF.count()

SUBJECT_ID    13787
ICD9_CODE     13787
dtype: int64

In [5]:
diag_dd_DF.groupby(['ICD9_CODE']).count()

Unnamed: 0_level_0,SUBJECT_ID
ICD9_CODE,Unnamed: 1_level_1
29502,1
29512,1
29520,6
29522,1
29524,1
...,...
34670,2
34680,24
34690,528
34691,1


In [6]:
#merge in separate dataframe to include the name/description of the ICD9 code 
diagDFdesc = pd.merge(diag_dd_DF, diag_name, on='ICD9_CODE')
diagDFdesc

Unnamed: 0,SUBJECT_ID,ICD9_CODE,LONG_TITLE
0,109,32723,Obstructive sleep apnea (adult)(pediatric)
1,85,32723,Obstructive sleep apnea (adult)(pediatric)
2,109,32723,Obstructive sleep apnea (adult)(pediatric)
3,109,32723,Obstructive sleep apnea (adult)(pediatric)
4,109,32723,Obstructive sleep apnea (adult)(pediatric)
...,...,...,...
13780,83578,32720,"Organic sleep apnea, unspecified"
13781,99085,34691,"Migraine, unspecified, with intractable migrai..."
13782,95841,33902,Chronic cluster headache
13783,96928,29621,"Major depressive affective disorder, single ep..."


In [7]:
#description is the human understandable summary of the meaning of the given DRG code
#DRG code is diagnosis related groups
code_descDF = code_description[['SUBJECT_ID','DESCRIPTION']]
code_descDF.head()

Unnamed: 0,SUBJECT_ID,DESCRIPTION
0,2491,"TRAUMATIC STUPOR & COMA, COMA <1 HR AGE >17 WI..."
1,24958,MAJOR CARDIOVASCULAR PROCEDURES WITH COMPLICAT...
2,18325,NEONATE WITH OTHER SIGNIFICANT PROBLEMS
3,17887,SPECIFIC CEREBROVASCULAR DISORDERS EXCEPT TRAN...
4,11113,NEONATE WITH OTHER SIGNIFICANT PROBLEMS


In [8]:
notes = pd.read_csv("/project/projectdirs/m1532/Projects_MVP/_datasets/mimiciii/NOTEEVENTS.csv", usecols = ['SUBJECT_ID','CATEGORY', 'TEXT'])
notes.head()

Unnamed: 0,SUBJECT_ID,CATEGORY,TEXT
0,22532,Discharge summary,Admission Date: [**2151-7-16**] Dischar...
1,13702,Discharge summary,Admission Date: [**2118-6-2**] Discharg...
2,13702,Discharge summary,Admission Date: [**2119-5-4**] D...
3,13702,Discharge summary,Admission Date: [**2124-7-21**] ...
4,26880,Discharge summary,Admission Date: [**2162-3-3**] D...


In [9]:
#merge dd dataframe with notes
icd9_dd_notes = pd.merge(diag_dd_DF, notes, on ='SUBJECT_ID')
icd9_dd_notes.head()

Unnamed: 0,SUBJECT_ID,ICD9_CODE,CATEGORY,TEXT
0,109,32723,Discharge summary,Admission Date: [**2142-5-15**] ...
1,109,32723,Discharge summary,Admission Date: [**2142-5-20**] ...
2,109,32723,Discharge summary,Admission Date: [**2142-6-18**] ...
3,109,32723,Discharge summary,Admission Date: [**2142-7-3**] D...
4,109,32723,Discharge summary,Admission Date: [**2142-7-7**] D...


In [10]:
#how many entries?
icd9_dd_notes.count()

SUBJECT_ID    1167881
ICD9_CODE     1167881
CATEGORY      1167881
TEXT          1167881
dtype: int64

In [11]:
#what is the largest number of entries for a patient?
#what is the least 
icd9_dd_notes.groupby(["SUBJECT_ID"]).count().sort_values(by=['TEXT'],ascending=False)

Unnamed: 0_level_0,ICD9_CODE,CATEGORY,TEXT
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
109,40928,40928,40928
11861,22450,22450,22450
73713,21049,21049,21049
29035,16626,16626,16626
5060,14877,14877,14877
...,...,...,...
68353,1,1,1
40477,1,1,1
86731,1,1,1
64238,1,1,1


In [12]:
#average number of notes per patient
#dont know how skewed this is (need to look at outliers)
icd9_dd_notes.groupby('SUBJECT_ID').count().mean()

ICD9_CODE    136.962707
CATEGORY     136.962707
TEXT         136.962707
dtype: float64

In [13]:
#how many characters?
icd9_dd_notes["TEXT"].str.len() 

0          10479
1          16179
2          11058
3          10613
4           9850
           ...  
1167876     1627
1167877     4672
1167878     1493
1167879     2975
1167880     1446
Name: TEXT, Length: 1167881, dtype: int64

In [14]:
#average number of characters per text
icd9_dd_notes["TEXT"].str.len().mean()

2498.6530519804674

In [15]:
# Load the regular expression library
import re

In [16]:
# Remove punctuation and anything that isnt a character or number 
#this process took approximately (start 1:31:16 - finish 1:33:10) 2 minutes
icd9_dd_notes['text_processed'] = icd9_dd_notes['TEXT'].map(lambda x: re.sub('[^A-Za-z0-9 ]+', '', x))

In [17]:
# Convert to lowercase
icd9_dd_notes['text_processed'] = icd9_dd_notes['text_processed'].map(lambda x: x.lower())

In [18]:
# Print out the first rows of papers to ensure re working properly
icd9_dd_notes['text_processed'].head()

0    admission date  2142515              discharge...
1    admission date  2142520              discharge...
2    admission date  2142618              discharge...
3    admission date  214273              discharge ...
4    admission date  214277              discharge ...
Name: text_processed, dtype: object

In [19]:
icd9_dd_notes.head()

Unnamed: 0,SUBJECT_ID,ICD9_CODE,CATEGORY,TEXT,text_processed
0,109,32723,Discharge summary,Admission Date: [**2142-5-15**] ...,admission date 2142515 discharge...
1,109,32723,Discharge summary,Admission Date: [**2142-5-20**] ...,admission date 2142520 discharge...
2,109,32723,Discharge summary,Admission Date: [**2142-6-18**] ...,admission date 2142618 discharge...
3,109,32723,Discharge summary,Admission Date: [**2142-7-3**] D...,admission date 214273 discharge ...
4,109,32723,Discharge summary,Admission Date: [**2142-7-7**] D...,admission date 214277 discharge ...


In [50]:
#adding additional index column for calling
index = tuple(range(0, 1167881, 1))
icd9_dd_notes["index"] = index
icd9_dd_notes

Unnamed: 0,SUBJECT_ID,ICD9_CODE,CATEGORY,TEXT,text_processed,index
0,109,32723,Discharge summary,Admission Date: [**2142-5-15**] ...,admission date 2142515 discharge...,0
1,109,32723,Discharge summary,Admission Date: [**2142-5-20**] ...,admission date 2142520 discharge...,1
2,109,32723,Discharge summary,Admission Date: [**2142-6-18**] ...,admission date 2142618 discharge...,2
3,109,32723,Discharge summary,Admission Date: [**2142-7-3**] D...,admission date 214273 discharge ...,3
4,109,32723,Discharge summary,Admission Date: [**2142-7-7**] D...,admission date 214277 discharge ...,4
...,...,...,...,...,...,...
1167876,97492,311,Nursing,TITLE:\n Pt is 31 y/o woman with history of ...,title pt is 31 yo woman with history of head...,1167876
1167877,97492,311,Physician,SICU\n HPI:\n 31 yo RH woman with a PMH de...,sicu hpi 31 yo rh woman with a pmh depress...,1167877
1167878,97492,311,Radiology,[**2158-8-1**] 10:43 AM\n MR HEAD W/O CONTRAST...,215881 1043 am mr head wo contrast ...,1167878
1167879,97492,311,Radiology,[**2158-7-31**] 8:52 PM\n CTA HEAD W&W/O C & R...,2158731 852 pm cta head wwo c recons cta neck...,1167879


In [51]:
data_text = icd9_dd_notes[['text_processed','index']]
data_text

Unnamed: 0,text_processed,index
0,admission date 2142515 discharge...,0
1,admission date 2142520 discharge...,1
2,admission date 2142618 discharge...,2
3,admission date 214273 discharge ...,3
4,admission date 214277 discharge ...,4
...,...,...
1167876,title pt is 31 yo woman with history of head...,1167876
1167877,sicu hpi 31 yo rh woman with a pmh depress...,1167877
1167878,215881 1043 am mr head wo contrast ...,1167878
1167879,2158731 852 pm cta head wwo c recons cta neck...,1167879


In [52]:
documents = data_text

In [45]:
#import necessary packages for further word processing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2020)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [48]:
#function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [53]:
#test the stemmer and lem on one example text
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['24', 'yo', 'woman', 'with', 'hx', 'of', 'sle', 'ersd', 'on', 'hd', 'who', 'presented', 'with', 'hypertensive', '', '', 'urgency', 'ha', 'and', 'abd', 'pain', 'now', 'transferred', 'to', 'the', 'unit', 'for', 'angioedema', '', '', 'hypertension', 'malignant', 'hypertensive', 'crisis', 'hypertensive', 'emergency', '', '', 'assessment', '', '', 'action', '', '', 'response', '', '', 'plan', '', '', 'pain', 'control', 'acute', 'pain', 'chronic', 'pain', '', '', 'assessment', '', '', 'action', '', '', 'response', '', '', 'plan']


 tokenized and lemmatized document: 
['woman', 'ersd', 'present', 'hypertens', 'urgenc', 'pain', 'transfer', 'unit', 'angioedema', 'hypertens', 'malign', 'hypertens', 'crisi', 'hypertens', 'emerg', 'assess', 'action', 'respons', 'plan', 'pain', 'control', 'acut', 'pain', 'chronic', 'pain', 'assess', 'action', 'respons', 'plan']


In [None]:
#remove any na that would cause problems in the lda model
documents = documents.dropna(subset=['text_processed'])

In [62]:
doc_group1 = documents.iloc[0:10]
doc_group1

Unnamed: 0,text_processed,index
0,admission date 2142515 discharge...,0
1,admission date 2142520 discharge...,1
2,admission date 2142618 discharge...,2
3,admission date 214273 discharge ...,3
4,admission date 214277 discharge ...,4
5,admission date 2142712 discharge...,5
6,admission date 2142813 discharge...,6
7,admission date 2142828 discharge...,7
8,admission date 2141911 discharge...,8
9,admission date 2141918 discharge...,9


In [63]:
#use preprocess function on group1
processed_docs = doc_group1['text_processed'].map(preprocess)
processed_docs

0    [admiss, date, discharg, date, date, birth, fs...
1    [admiss, date, discharg, date, date, birth, fs...
2    [admiss, date, discharg, date, date, birth, fs...
3    [admiss, date, discharg, date, date, birth, fs...
4    [admiss, date, discharg, date, date, birth, fs...
5    [admiss, date, discharg, date, date, birth, fs...
6    [admiss, date, discharg, date, date, birth, fs...
7    [admiss, date, discharg, date, date, birth, fs...
8    [admiss, date, discharg, date, date, birth, fs...
9    [admiss, date, discharg, date, date, birth, fs...
Name: text_processed, dtype: object

In [66]:
#check to see if the function preprocess is picking up on all words
processed_docs[2]

['admiss',
 'date',
 'discharg',
 'date',
 'date',
 'birth',
 'fservic',
 'percocet',
 'chief',
 'complaintacut',
 'onset',
 'dyspneamajor',
 'surgic',
 'invas',
 'present',
 'illnesspleas',
 'micu',
 'note',
 'detail',
 'brief',
 'woman',
 'esrd',
 'malign',
 'svcsyndrom',
 'posterior',
 'revers',
 'encephalopathi',
 'syndromepr',
 'prior',
 'intracerebr',
 'hemorrhag',
 'frequent',
 'admittedwith',
 'hypertens',
 'admit',
 'acuteonset',
 'dyspnea',
 'week',
 'dialysi',
 'give',
 'unabl',
 'toget',
 'transport',
 'despit',
 'stitl',
 'attempt',
 'refus',
 'come',
 'micu',
 'treat',
 'nitro',
 'labetolol',
 'gtt',
 'stop',
 'micu',
 'wasdialyz',
 'fluid',
 'remov',
 'giventranfus',
 'improv',
 'note',
 'transfus',
 'unit',
 'prbc',
 'recentegd',
 'gastric',
 'ulcer',
 'guaiac',
 'neg',
 'inunit',
 'hemolysi',
 'neg',
 'current',
 'ofnot',
 'hypotens',
 'note',
 'improvedrapidli',
 'arrivalro',
 'current',
 'complaint',
 'note',
 'home',
 'herabdomin',
 'pain',
 'baselin',
 'felt',
 'ep