In [1]:
#import necessary packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import re

In [2]:
#import necessary packages for further word processing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): this wildcard import is the only place PorterStemmer (used by
# lemmatize_stemming) is brought into scope; an explicit import would be clearer.
from nltk.stem.porter import *
# numpy is already imported in the first cell; this repeat is harmless.
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(2020)
import nltk
# Download the WordNet data used by WordNetLemmatizer (no-op if already present).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then Porter-stem the result.

    Args:
        text: a single token (string).

    Returns:
        The output of ``PorterStemmer().stem`` applied to the WordNet
        verb lemma (``pos='v'``) of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    porter = PorterStemmer()
    return porter.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept
    only if it is longer than one character and is not in gensim's
    STOPWORDS set; each kept token is passed through ``lemmatize_stemming``.

    Args:
        text: the raw document string.

    Returns:
        A list of processed tokens, in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 1 and word not in stopwords
    ]

In [4]:
# Read only the "text_processed" and "index" columns of Full_Table_ICD9_Notes.csv.
full_proc = pd.read_csv("Full_Table_ICD9_Notes.csv", usecols=["text_processed", "index"])
# Bare expression: shows the frame as the cell output.
full_proc

Unnamed: 0,text_processed,index
0,admission date 2142 5 15 discharge date 2142 5...,0
1,admission date 2142 5 20 discharge date 2142 6...,1
2,admission date 2142 6 18 discharge date 2142 6...,2
3,admission date 2142 7 3 discharge date 2142 7 ...,3
4,admission date 2142 7 7 discharge date 2142 7 ...,4
...,...,...
323055,last name lf first name3 lf 1046 j last name ...,323055
323056,2143 9 3 9 59 am chest pa lat clip clip numbe...,323056
323057,2144 2 25 1 49 pm ankle ap mortise lat left c...,323057
323058,2144 1 7 4 21 pm ankle ap mortise lat left cl...,323058


In [8]:
#remove numbers: each run of digits that follows a whitespace character is
#replaced, together with that whitespace character, by a single space.
#A number at the very start of a note has no preceding whitespace and is kept.
#The pattern is a raw string because '\s' and '\d' are not valid escapes in an
#ordinary string literal and raise an invalid-escape-sequence warning.
full_proc['text_processed'] = full_proc['text_processed'].map(lambda x: re.sub(r'\s\d+', ' ', x))

In [6]:
# Plain assignment: documents is a second name for the same DataFrame, not a copy.
documents = full_proc

In [None]:
#use preprocess function on every note; each entry of processed_docs is the token list it returns
# NOTE(review): documents must still be the DataFrame here; a later cell
# rebinds the name to a Series, after which this column lookup no longer applies.
processed_docs = documents['text_processed'].map(preprocess)
# Bare expression: shows processed_docs as the cell output.
processed_docs

In [None]:
#processed_docs.to_csv('Full_Text_Processed.csv')

In [5]:
#load in already processed csv, so we dont have to wait for first preprocessing
# NOTE(review): every column of the file is read, but the visible cells only
# use 'text_processed' afterwards.
full_proc2 = pd.read_csv("Full_Text_Processed.csv")
# Bare expression: shows the frame as the cell output.
full_proc2

Unnamed: 0.1,Unnamed: 0,text_processed
0,0,"['admiss', 'date', 'discharg', 'date', 'date',..."
1,1,"['admiss', 'date', 'discharg', 'date', 'date',..."
2,2,"['admiss', 'date', 'discharg', 'date', 'date',..."
3,3,"['admiss', 'date', 'discharg', 'date', 'date',..."
4,4,"['admiss', 'date', 'discharg', 'date', 'date',..."
...,...,...
432684,432684,"['titl', 'woman', 'histori', 'headach', 'talk'..."
432685,432685,"['sicu', 'woman', 'depress', 'wellbutrin', 'he..."
432686,432686,"['head', 'contrast', 'clip', 'clip', 'number',..."
432687,432687,"['head', 'recon', 'neck', 'recon', 'clip', 'cl..."


In [7]:
def preprocess2(text):
    """Tokenize ``text`` and drop stopwords, without lemmatizing or stemming.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept
    only if it is longer than one character and is not in gensim's
    STOPWORDS set.

    Args:
        text: the document string to tokenize.

    Returns:
        A list of the kept tokens, in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 1 and word not in stopwords
    ]

In [8]:
#make the text an iterable of token lists again so it can be made into a dictionary
#Guarded so the cell can be re-run: after the first run full_proc2 is already
#the mapped Series, and looking up 'text_processed' on it again would fail.
if isinstance(full_proc2, pd.DataFrame):
    full_proc2 = full_proc2['text_processed'].map(preprocess2)

In [9]:
# Rebind documents to full_proc2 (the name was earlier assigned the raw DataFrame full_proc).
documents = full_proc2

In [10]:
#build the gensim dictionary from the tokenised documents,
#then print its first 11 entries, one "id token" pair per line
dictionary = gensim.corpora.Dictionary(documents)
count = 0  # stays 0 when the dictionary has no entries
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count >= 11:
        break

0 abdomen
1 abdomin
2 abl
3 abnorm
4 abus
5 accid
6 acut
7 admiss
8 admit
9 afebril
10 agre


In [11]:
#filter out words that appear in fewer than 15 documents (no_below)
#or in more than 50% of the documents (no_above),
#then keep at most the 100000 most frequent remaining words (keep_n)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [12]:
#go through each document and use doc2bow to report its words as (token id, number of occurrences) pairs
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

In [17]:
#bow_corpus

In [13]:
#determine the TF-IDF scores or weight of a word within the documents
from gensim import corpora, models
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Indexing the model with the corpus gives the TF-IDF-weighted corpus.
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
# Preview the first document only: its (token id, weight) pairs sorted by
# ascending weight; the break stops the loop after one document.
for doc in corpus_tfidf:
    pprint(sorted(doc, key = lambda x: x[1]))
    break

[(350, 0.00478736280236833),
 (62, 0.005361282392675229),
 (214, 0.005632634801900507),
 (240, 0.006145858625093947),
 (181, 0.006252642382669749),
 (211, 0.006284518089834968),
 (6, 0.006422084838045818),
 (86, 0.006470056298457444),
 (157, 0.007263152434868314),
 (234, 0.00741368945270182),
 (261, 0.00757306090884315),
 (74, 0.007913477090277366),
 (288, 0.007930082632852118),
 (102, 0.007933443903941344),
 (101, 0.00814007095389678),
 (47, 0.008308036455530658),
 (292, 0.008376721215670086),
 (118, 0.008418482889838937),
 (65, 0.008515421063424242),
 (176, 0.00853884968784224),
 (320, 0.008559424126695123),
 (112, 0.008580519072054555),
 (318, 0.008672202469334417),
 (31, 0.00870126631071611),
 (283, 0.008790533163563121),
 (97, 0.00882957316869233),
 (13, 0.008970281177750582),
 (131, 0.009273104540039658),
 (173, 0.009288969232983632),
 (163, 0.009373734995544869),
 (263, 0.009385223046052158),
 (233, 0.009412076977054812),
 (116, 0.009419960639604487),
 (59, 0.009455778850807653)

In [14]:
#top ten weighted words from full documents dictionary and BOW
#NOTE(review): the token ids are hard-coded, so they are only meaningful for
#the dictionary they were read from.
top_word_ids = (306, 345, 223, 321, 302, 251, 333, 260, 209, 87)
print(" ,".join(dictionary[word_id] for word_id in top_word_ids))

tablet ,weekli ,patch ,transderm ,sustain ,qwed ,urgenc ,releas ,nifedipin ,dialysi


In [28]:
#top ten weighted words from 50k documents
#NOTE(review): these hard-coded ids differ from the full-corpus cell for the
#same words (e.g. "tablet" is 307 here and 306 there), so they were read from
#a different dictionary; confirm which dictionary is loaded before relying on this.
top_word_ids_50k = (307, 345, 224, 303, 321, 261, 76, 252, 333, 93)
print(" ,".join(dictionary[word_id] for word_id in top_word_ids_50k))

tablet ,weekli ,patch ,sustain ,transderm ,releas ,daili ,qwed ,urgenc ,discharg


In [16]:
# Gensim extras so we can plot word weights
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#train LDA model using the BOW corpus:
#  num_topics=10    -> produce 10 topics
#  chunksize=5000   -> chunk size is 5000 documents
#  update_every=1   -> lda is updated after every chunk
#  passes=2         -> 2 full passes through the corpus for training
#  random_state=100 -> fixed seed for the model's random state
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=5000,
                                           passes=2)

In [19]:
# Print the Keywords in the 10 topics using the BOW corpus
pprint(lda_model.print_topics())
# Index the trained model with the BOW corpus (applies the model to the documents).
# doc_lda is not read again in the visible cells.
doc_lda = lda_model[bow_corpus]

[(0,
  '0.028*"plan" + 0.027*"assess" + 0.025*"respons" + 0.022*"action" + '
  '0.015*"continu" + 0.014*"pain" + 0.012*"monitor" + 0.008*"acut" + '
  '0.008*"remain" + 0.007*"cont"'),
 (1,
  '0.026*"contrast" + 0.012*"abdomen" + 0.012*"clip" + 0.011*"small" + '
  '0.010*"right" + 0.010*"pelvi" + 0.009*"reason" + 0.009*"leav" + '
  '0.009*"liver" + 0.009*"fluid"'),
 (2,
  '0.035*"right" + 0.034*"leav" + 0.033*"fractur" + 0.021*"vein" + '
  '0.018*"reason" + 0.017*"clip" + 0.013*"year" + 0.012*"report" + '
  '0.012*"arteri" + 0.011*"extrem"'),
 (3,
  '0.021*"assess" + 0.012*"hour" + 0.011*"respiratori" + 0.011*"comment" + '
  '0.010*"total" + 0.009*"puls" + 0.009*"sound" + 0.009*"balanc" + '
  '0.009*"code" + 0.008*"line"'),
 (4,
  '0.037*"valv" + 0.037*"normal" + 0.028*"leav" + 0.027*"aortic" + '
  '0.023*"ventricular" + 0.020*"mitral" + 0.018*"trace" + 0.016*"leaflet" + '
  '0.015*"right" + 0.014*"wall"'),
 (5,
  '0.021*"head" + 0.019*"contrast" + 0.018*"hemorrhag" + 0.018*"leav" + '
 

In [20]:
# Visualize the topics and words
# Turn on pyLDAvis notebook display, prepare the visualisation from the BOW
# model, corpus and dictionary, then show it as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis

In [21]:
#train LDA model using the TFIDF corpus, with the same settings as the BOW model:
#  num_topics=10    -> produce 10 topics
#  chunksize=5000   -> chunk size is 5000 documents
#  update_every=1   -> lda is updated after every chunk
#  passes=2         -> 2 full passes through the corpus for training
#  random_state=100 -> fixed seed for the model's random state
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=5000,
                                           passes=2)

In [22]:
# Print the Keywords in the 10 topics using the TFIDF corpus
pprint(lda_model2.print_topics())
# Index the trained model with the TF-IDF corpus (applies the model to the documents).
# doc_lda2 is not read again in the visible cells.
doc_lda2 = lda_model2[corpus_tfidf]

[(0,
  '0.019*"action" + 0.016*"respons" + 0.010*"plan" + 0.009*"assess" + '
  '0.007*"pain" + 0.006*"continu" + 0.006*"monitor" + 0.005*"failur" + '
  '0.005*"acut" + 0.005*"cont"'),
 (1,
  '0.020*"cvvh" + 0.017*"nutrit" + 0.016*"weight" + 0.015*"ioniz" + '
  '0.013*"diet" + 0.012*"protein" + 0.011*"arteri" + 0.011*"calcium" + '
  '0.011*"kcal" + 0.011*"recommend"'),
 (2,
  '0.019*"valv" + 0.012*"aortic" + 0.012*"contrast" + 0.011*"mitral" + '
  '0.011*"normal" + 0.010*"leaflet" + 0.008*"wall" + 0.008*"doppler" + '
  '0.007*"ventricular" + 0.007*"dilat"'),
 (3,
  '0.008*"puls" + 0.007*"balanc" + 0.007*"total" + 0.007*"hour" + 0.006*"code" '
  '+ 0.006*"lab" + 0.006*"assess" + 0.005*"prophylaxi" + 0.005*"respiratori" + '
  '0.005*"line"'),
 (4,
  '0.071*"trace" + 0.044*"wave" + 0.039*"previou" + 0.032*"sinu" + '
  '0.030*"compar" + 0.026*"rhythm" + 0.024*"lead" + 0.020*"specif" + '
  '0.020*"diagnost" + 0.020*"atrial"'),
 (5,
  '0.014*"contrast" + 0.013*"fractur" + 0.010*"hemorrhag" + 

In [2]:
# Visualize the topics and words based on TFIDF weight corpus
#pyLDAvis.enable_notebook()
#vis2 = pyLDAvis.gensim.prepare(lda_model2, corpus_tfidf, dictionary)
#vis2