In [2]:
#import necessary packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import re

In [3]:
#import necessary packages for further word processing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2020)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
full_proc = pd.read_csv("Full_Table_ICD9_Notes.csv", usecols=["text_processed", "index"])
full_proc

Unnamed: 0,text_processed,index
0,admission date 2142 5 15 discharge date 2142 5...,0
1,admission date 2142 5 20 discharge date 2142 6...,1
2,admission date 2142 6 18 discharge date 2142 6...,2
3,admission date 2142 7 3 discharge date 2142 7 ...,3
4,admission date 2142 7 7 discharge date 2142 7 ...,4
...,...,...
432684,title pt is 31 y o woman with history of heada...,432684
432685,sicu hpi 31 yo rh woman with a pmh depression ...,432685
432686,2158 8 1 10 43 am mr head w o contrast clip c...,432686
432687,2158 7 31 8 52 pm cta head w w o c recons cta...,432687


In [6]:
#select for all the history section in the notes
full_proc['history'] = full_proc["text_processed"].apply(lambda st: st[st.find("history of present illness")+len("history of present illness"):st.find("physical exam")])

In [7]:
full_proc.head()

Unnamed: 0,text_processed,index,history
0,admission date 2142 5 15 discharge date 2142 5...,0,24 year old female with sle esrd on hd hx mal...
1,admission date 2142 5 20 discharge date 2142 6...,1,ms known lastname is a 24 yo f with lupus sin...
2,admission date 2142 6 18 discharge date 2142 6...,2,please see micu note for full details in brie...
3,admission date 2142 7 3 discharge date 2142 7 ...,3,24f h o sle esrd on hd h o malignant htn svc ...
4,admission date 2142 7 7 discharge date 2142 7 ...,4,24f h o sle esrd on hd h o malignant htn svc ...


In [8]:
#remove the sub headers in the history section
full_proc['WO_Headers'] = full_proc["history"].map(lambda x: re.sub('family history', '', x))
full_proc['WO_Headers'] = full_proc["WO_Headers"].map(lambda x: re.sub('social history', '', x))
full_proc['WO_Headers'] = full_proc["WO_Headers"].map(lambda x: re.sub('past medical history', '', x))

In [9]:
full_proc.head()

Unnamed: 0,text_processed,index,history,WO_Headers
0,admission date 2142 5 15 discharge date 2142 5...,0,24 year old female with sle esrd on hd hx mal...,24 year old female with sle esrd on hd hx mal...
1,admission date 2142 5 20 discharge date 2142 6...,1,ms known lastname is a 24 yo f with lupus sin...,ms known lastname is a 24 yo f with lupus sin...
2,admission date 2142 6 18 discharge date 2142 6...,2,please see micu note for full details in brie...,please see micu note for full details in brie...
3,admission date 2142 7 3 discharge date 2142 7 ...,3,24f h o sle esrd on hd h o malignant htn svc ...,24f h o sle esrd on hd h o malignant htn svc ...
4,admission date 2142 7 7 discharge date 2142 7 ...,4,24f h o sle esrd on hd h o malignant htn svc ...,24f h o sle esrd on hd h o malignant htn svc ...


In [10]:
#full_proc.to_csv('History_Notes.csv')

In [11]:
#use preprocess function on the history section of the notes without headers included
processed_docs = full_proc['WO_Headers'].map(preprocess)
processed_docs

0         [year, femal, esrd, malign, syndrom, posterior...
1         [know, lastnam, lupu, esrd, malign, histori, s...
2         [micu, note, detail, brief, woman, esrd, malig...
3         [esrd, malign, syndrom, pre, prior, frequent, ...
4         [esrd, malign, syndrom, pre, prior, frequent, ...
                                ...                        
432684    [histori, headach, talk, phone, sister, yester...
432685    [depress, wellbutrin, headach, migrain, ocp, a...
432686    [contrast, clip, clip, number, radiolog, reaso...
432687    [recon, neck, recon, clip, clip, number, radio...
432688    [contrast, outsid, film, read, clip, clip, num...
Name: WO_Headers, Length: 432689, dtype: object

In [12]:
#create dictionary of words
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abdomin
1 abus
2 accid
3 admit
4 agre
5 anemia
6 antibodi
7 anticardiolipin
8 anticoagul
9 apnea
10 associ


In [13]:
#filter out words that appear in less than (15) documents
#only keep the first 10000
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [14]:
#go through each document and report words and occurrences using doc2box for token id and amount
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#bow_corpus

In [15]:
#determine the TF-IDF scores or weight of a word within a document
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.035465778570587664),
 (1, 0.05460991218792134),
 (2, 0.07800877051953248),
 (3, 0.03523687249486545),
 (4, 0.051074978571353805),
 (5, 0.05397328711449128),
 (6, 0.18934836318594755),
 (7, 0.11280492994920087),
 (8, 0.07372299064382262),
 (9, 0.059855741315649),
 (10, 0.05307489656417573),
 (11, 0.09678360008384047),
 (12, 0.1278174710138821),
 (13, 0.045752521857350376),
 (14, 0.07949406072692557),
 (15, 0.08800088973972603),
 (16, 0.03879030258693885),
 (17, 0.051604732460941286),
 (18, 0.06841032540259104),
 (19, 0.0366973145327756),
 (20, 0.09967026823540343),
 (21, 0.057746254649464916),
 (22, 0.07269433233145711),
 (23, 0.05852996050045313),
 (24, 0.04160203438926827),
 (25, 0.052977521113137936),
 (26, 0.03947951121981764),
 (27, 0.08451239749170157),
 (28, 0.025813326837803914),
 (29, 0.03531242178154765),
 (30, 0.03832735560326235),
 (31, 0.05735352496758997),
 (32, 0.125810212421795),
 (33, 0.06327983309610975),
 (34, 0.16599162676828463),
 (35, 0.09218079998136047),
 

In [None]:
#train lda model using only bow_corpus
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# train lda model using tf-idf word weights already established
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))