In [1]:
#import necessary packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import re

In [2]:
#import necessary packages for further word processing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2020)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
full_proc = pd.read_csv("Full_Table_ICD9_Notes.csv", usecols=["text_processed", "index"])
full_proc

Unnamed: 0,text_processed,index
0,admission date 2142 5 15 discharge date 2142 5...,0
1,admission date 2142 5 20 discharge date 2142 6...,1
2,admission date 2142 6 18 discharge date 2142 6...,2
3,admission date 2142 7 3 discharge date 2142 7 ...,3
4,admission date 2142 7 7 discharge date 2142 7 ...,4
...,...,...
323055,last name lf first name3 lf 1046 j last name ...,323055
323056,2143 9 3 9 59 am chest pa lat clip clip numbe...,323056
323057,2144 2 25 1 49 pm ankle ap mortise lat left c...,323057
323058,2144 1 7 4 21 pm ankle ap mortise lat left cl...,323058


In [5]:
#select for all the history section in the notes
full_proc['history'] = full_proc["text_processed"].apply(lambda st: st[st.find("history of present illness")+len("history of present illness"):st.find("physical exam")])

In [6]:
full_proc.head()

Unnamed: 0,text_processed,index,history
0,admission date 2142 5 15 discharge date 2142 5...,0,24 year old female with sle esrd on hd hx mal...
1,admission date 2142 5 20 discharge date 2142 6...,1,ms known lastname is a 24 yo f with lupus sin...
2,admission date 2142 6 18 discharge date 2142 6...,2,please see micu note for full details in brie...
3,admission date 2142 7 3 discharge date 2142 7 ...,3,24f h o sle esrd on hd h o malignant htn svc ...
4,admission date 2142 7 7 discharge date 2142 7 ...,4,24f h o sle esrd on hd h o malignant htn svc ...


In [7]:
#remove the sub headers in the history section
full_proc['WO_Headers'] = full_proc["history"].map(lambda x: re.sub('family history', '', x))
full_proc['WO_Headers'] = full_proc["WO_Headers"].map(lambda x: re.sub('social history', '', x))
full_proc['WO_Headers'] = full_proc["WO_Headers"].map(lambda x: re.sub('past medical history', '', x))

In [8]:
full_proc.head()

Unnamed: 0,text_processed,index,history,WO_Headers
0,admission date 2142 5 15 discharge date 2142 5...,0,24 year old female with sle esrd on hd hx mal...,24 year old female with sle esrd on hd hx mal...
1,admission date 2142 5 20 discharge date 2142 6...,1,ms known lastname is a 24 yo f with lupus sin...,ms known lastname is a 24 yo f with lupus sin...
2,admission date 2142 6 18 discharge date 2142 6...,2,please see micu note for full details in brie...,please see micu note for full details in brie...
3,admission date 2142 7 3 discharge date 2142 7 ...,3,24f h o sle esrd on hd h o malignant htn svc ...,24f h o sle esrd on hd h o malignant htn svc ...
4,admission date 2142 7 7 discharge date 2142 7 ...,4,24f h o sle esrd on hd h o malignant htn svc ...,24f h o sle esrd on hd h o malignant htn svc ...


In [9]:
#full_proc.to_csv('History_Notes.csv')

In [10]:
doc_group1 = full_proc

In [11]:
#use preprocess function on the history section of the notes without headers included
processed_docs = doc_group1['WO_Headers'].map(preprocess)
processed_docs

0         [year, femal, esrd, malign, syndrom, posterior...
1         [know, lastnam, lupu, esrd, malign, histori, s...
2         [micu, note, detail, brief, woman, esrd, malig...
3         [esrd, malign, syndrom, pre, prior, frequent, ...
4         [esrd, malign, syndrom, pre, prior, frequent, ...
                                ...                        
323055    [chest, contrast, abdomen, contrast, clip, cli...
323056    [clip, clip, number, radiolog, reason, eval, a...
323057    [mortis, leav, clip, clip, number, radiolog, r...
323058    [mortis, leav, clip, clip, number, radiolog, r...
323059    [mortis, leav, clip, clip, number, radiolog, r...
Name: WO_Headers, Length: 323060, dtype: object

In [12]:
#create dictionary of words
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abdomin
1 abus
2 accid
3 admit
4 agre
5 anemia
6 antibodi
7 anticardiolipin
8 anticoagul
9 apnea
10 associ


In [13]:
#filter out words that appear in less than (15) documents
#only keep the first 10000
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [14]:
#go through each document and report words and occurrences using doc2box for token id and amount
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#bow_corpus

In [15]:
#determine the TF-IDF scores or weight of a word within a document
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(sorted(doc, key = lambda x: x[1]))
    break

[(90, 0.01157092694043111),
 (158, 0.01607398170559334),
 (106, 0.02060247123983802),
 (56, 0.02316409571790551),
 (42, 0.026360408974084),
 (28, 0.026927212974133984),
 (133, 0.028460091686710386),
 (145, 0.029323390164209217),
 (122, 0.029754160832268093),
 (125, 0.03016918649471875),
 (144, 0.03059138532250271),
 (75, 0.03212258276223757),
 (81, 0.03317950358330767),
 (83, 0.03320847455858883),
 (93, 0.03321173187495516),
 (136, 0.03473065065038114),
 (117, 0.034863221470665666),
 (151, 0.03495781924052276),
 (97, 0.03496688243642581),
 (123, 0.03531442508866595),
 (52, 0.035374119636408366),
 (29, 0.035589415175223425),
 (49, 0.03583056904221277),
 (0, 0.03607206394014949),
 (19, 0.03655270349527521),
 (3, 0.0365961282108587),
 (73, 0.03738290103716312),
 (156, 0.03814627212856471),
 (30, 0.03815099040258024),
 (16, 0.03842001056934495),
 (130, 0.0389760740018324),
 (127, 0.0399458951937839),
 (129, 0.04020607254881963),
 (26, 0.04021689891375987),
 (77, 0.04047428067949276),
 (155

In [16]:
#top ten weighted words
print(dictionary[100] ,"," + dictionary[140] ,"," + dictionary[6] ,"," + dictionary[86] ,"," + dictionary[34] ,"," + dictionary[152] ,"," + dictionary[70],"," + dictionary[72],"," + dictionary[94],"," + dictionary[104])

neg ,thrombot ,antibodi ,lupu ,dialysi ,urgenc ,hypertens ,inabl ,month ,onset


In [18]:
# Gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
#train LDA model using BOW, chunk size is 5000 documents, lda is updated after every chunk size, 2 full passes through the corpus for training, produce 10 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=5000,
                                           passes=2)

In [20]:
# Print the Keywords in the 10 topics using the BOW corpus
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.023*"plan" + 0.020*"respons" + 0.019*"assess" + 0.019*"action" + '
  '0.016*"continu" + 0.011*"monitor" + 0.010*"pain" + 0.009*"give" + '
  '0.009*"remain" + 0.008*"follow"'),
 (1,
  '0.019*"vein" + 0.019*"right" + 0.017*"procedur" + 0.016*"arteri" + '
  '0.015*"leav" + 0.013*"identifi" + 0.012*"reason" + 0.012*"clip" + '
  '0.012*"cathet" + 0.011*"patient"'),
 (2,
  '0.024*"contrast" + 0.019*"right" + 0.018*"leav" + 0.018*"hemorrhag" + '
  '0.016*"head" + 0.015*"clip" + 0.013*"reason" + 0.012*"mass" + 0.010*"year" '
  '+ 0.009*"acut"'),
 (3,
  '0.033*"medic" + 0.026*"hour" + 0.023*"total" + 0.022*"balanc" + '
  '0.018*"rhythm" + 0.017*"review" + 0.017*"allergi" + 0.016*"system" + '
  '0.015*"mmhg" + 0.014*"admiss"'),
 (4,
  '0.055*"trace" + 0.036*"previou" + 0.033*"chang" + 0.030*"wave" + '
  '0.026*"compar" + 0.019*"lead" + 0.018*"leav" + 0.016*"abnorm" + '
  '0.015*"ventricular" + 0.012*"infarct"'),
 (5,
  '0.046*"assess" + 0.045*"lung" + 0.042*"sound" + 0.034*"ventil" + '

In [22]:
#train LDA model using TFIDF corpus, chunk size is 5000 documents, lda is updated after every chunk size, 2 full passes through the corpus for training, produce 10 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=5000,
                                           passes=2)

In [23]:
# Print the Keywords in the 10 topics using the TFIDF corpus instead
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus_tfidf]

[(0,
  '0.007*"action" + 0.007*"respons" + 0.006*"patient" + 0.006*"transfer" + '
  '0.006*"pain" + 0.005*"assess" + 0.005*"plan" + 0.004*"deni" + 0.004*"home" '
  '+ 0.004*"daili"'),
 (1,
  '0.016*"liver" + 0.012*"ascit" + 0.012*"obstruct" + 0.011*"hepat" + '
  '0.011*"bowel" + 0.010*"abdomen" + 0.010*"abdomin" + 0.009*"gallbladd" + '
  '0.009*"portal" + 0.008*"kidney"'),
 (2,
  '0.018*"contrast" + 0.008*"fractur" + 0.007*"hemorrhag" + 0.007*"clip" + '
  '0.007*"right" + 0.006*"mass" + 0.006*"reason" + 0.005*"imag" + 0.005*"leav" '
  '+ 0.005*"pelvi"'),
 (3,
  '0.041*"sound" + 0.030*"comment" + 0.028*"ventil" + 0.025*"lung" + '
  '0.023*"breath" + 0.023*"cuff" + 0.022*"ideal" + 0.021*"assess" + '
  '0.020*"airway" + 0.019*"type"'),
 (4,
  '0.078*"trace" + 0.045*"wave" + 0.043*"previou" + 0.031*"compar" + '
  '0.026*"lead" + 0.021*"diagnost" + 0.020*"ventricular" + 0.019*"specif" + '
  '0.018*"abnorm" + 0.018*"chang"'),
 (5,
  '0.018*"vein" + 0.012*"spine" + 0.009*"femor" + 0.009*"comp