In [23]:
##Create processed text that only includes abbreviations and misspellings

In [1]:
#import necessary packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import re

In [2]:
#import necessary packages for further word processing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2020)
import nltk
nltk.download('words')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

[nltk_data] Downloading package words to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /global/homes/d/dsmorrow/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
#function to perform lemmatize and stem preprocessing steps on the data set.
#this lemmatize function only converts verbs
words = set(nltk.corpus.words.words())
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in words:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
#function to tag each word in the text by its part of speech
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [5]:
#lemmatize based on the tag from other function
#this is needed so we can succesfully remove all dictionary words regardless of inflection 
def lemmatize2(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(text)]

In [6]:
#remove all stop words and anything in the nltk dictionary
def preprocess2(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in words:
            result.append(token)
    return result

In [7]:
full_proc = pd.read_csv("Full_Table_ICD9_Notes.csv", usecols=["text_processed", "index"])
full_proc

Unnamed: 0,text_processed,index
0,admission date 2142 5 15 discharge date 2142 5...,0
1,admission date 2142 5 20 discharge date 2142 6...,1
2,admission date 2142 6 18 discharge date 2142 6...,2
3,admission date 2142 7 3 discharge date 2142 7 ...,3
4,admission date 2142 7 7 discharge date 2142 7 ...,4
...,...,...
323055,last name lf first name3 lf 1046 j last name ...,323055
323056,2143 9 3 9 59 am chest pa lat clip clip numbe...,323056
323057,2144 2 25 1 49 pm ankle ap mortise lat left c...,323057
323058,2144 1 7 4 21 pm ankle ap mortise lat left cl...,323058


In [None]:
documents = full_proc

In [8]:
#remove numbrs
full_proc['text_processed'] = full_proc['text_processed'].map(lambda x: re.sub('(\s\d+)', ' ', x))

In [9]:
proc_text = full_proc['text_processed']

In [10]:
#remove stop words
no_stop = proc_text.map(preprocess)

In [11]:
#convert to string so can be mapped again
DF_no_stop = no_stop.to_frame()
DF_no_stop['string'] = DF_no_stop.text_processed.apply(', '.join)

In [12]:
#lemmatize using tagging method
processed_doc3 = DF_no_stop['string'].map(lemmatize2)

In [19]:
processed_doc3.to_csv('Lemmatized_Text.csv')

In [16]:
#not sure what the extra commas are from
processed_doc3.head()

0    [allergi, ,, penicillin, ,, percocet, ,, atten...
1    [allergi, ,, penicillin, ,, percocet, ,, atten...
2    [allergi, ,, penicillin, ,, percocet, ,, atten...
3    [allergi, ,, penicillin, ,, percocet, ,, atten...
4    [allergi, ,, penicillin, ,, percocet, ,, atten...
Name: string, dtype: object

In [13]:
#convert map object to string to be mapped again
Final_Table = processed_doc3.to_frame()
Final_Table['ND_string2'] = Final_Table.string.apply(', '.join)

In [14]:
#remove stop words and english dictionary words from nltk
words = set(nltk.corpus.words.words())
processed_docs3_full = Final_Table['ND_string2'].map(preprocess2)

In [15]:
processed_docs3_full.head

<bound method NDFrame.head of 0         [allergi, percocet, lf, hemodialysi, sle, esrd...
1         [allergi, percocet, lf, lastnam, esrd, hd, htn...
2         [allergi, percocet, lf, micu, sle, esrd, hd, h...
3         [allergi, percocet, lf, hemodialysi, sle, esrd...
4         [allergi, percocet, namepattern, hemodialysi, ...
                                ...                        
323055    [lf, lf, cc, ct, ct, ct, intra, int, polytraum...
323056      [eval, ptx, polytrauma, ptx, eval, ptx, compar]
323057    [pm, ap, wk, wk, wk, orif, fractur, tibiotalar...
323058         [pm, ap, fx, fx, fx, orif, complic, preserv]
323059                                    [pm, ap, travers]
Name: ND_string2, Length: 323060, dtype: object>

In [18]:
processed_docs3_full.to_csv('No_English_Dictionary.csv')

In [17]:
#looks like we succesfully pulled out almost all dictionary words
processed_docs3_full.iloc[1]

['allergi',
 'percocet',
 'lf',
 'lastnam',
 'esrd',
 'hd',
 'htn',
 'svc',
 'pre',
 'admiss',
 'ed',
 'dilaudid',
 'mg',
 'zofran',
 'mg',
 'iv',
 'hyperkalemia',
 'kayexal',
 'bp',
 'ed',
 'sbp',
 'hydral',
 'aliskeren',
 'labetalol',
 'iv',
 'hydral',
 'labetalol',
 'iv',
 'nicardipin',
 'mg',
 'iv',
 'nicardipin',
 'gtt',
 'abd',
 'earlier',
 'abd',
 'abd',
 'earlier',
 'episod',
 'deni',
 'gi',
 'micu',
 'nicardipin',
 'bp',
 'wnl',
 'hd',
 'erythematosu',
 'mycophenol',
 'prednison',
 'ckd',
 'esrd',
 'diagnos',
 'pd',
 'hd',
 'dialyz',
 'wk',
 'seizur',
 'svc',
 'anticardiolipin',
 'antibodi',
 'igg',
 'igm',
 'hocm',
 'depoprovera',
 'requir',
 'coag',
 'staph',
 'hd',
 'microangiopathi',
 'cpap',
 'mssa',
 'hd',
 'cin',
 'appt',
 'schedul',
 'hemorrhag',
 'le',
 'deni',
 'etoh',
 'autoimmun',
 'cerebrovascular',
 'nad',
 'tv',
 'heent',
 'normocephal',
 'perrla',
 'eomi',
 'mmm',
 'op',
 'rrr',
 'rusb',
 'ctab',
 'biater',
 'nd',
 'luq',
 'extrem',
 'hd',
 'nontend',
 'noneryt

In [20]:
#need to convert back to string to determine word frequency
Full_WF = processed_docs3_full.to_frame()
Full_WF['final_string'] = Full_WF.ND_string2.apply(', '.join)

In [21]:
Full_WF.head()

Unnamed: 0,ND_string2,final_string
0,"[allergi, percocet, lf, hemodialysi, sle, esrd...","allergi, percocet, lf, hemodialysi, sle, esrd,..."
1,"[allergi, percocet, lf, lastnam, esrd, hd, htn...","allergi, percocet, lf, lastnam, esrd, hd, htn,..."
2,"[allergi, percocet, lf, micu, sle, esrd, hd, h...","allergi, percocet, lf, micu, sle, esrd, hd, hx..."
3,"[allergi, percocet, lf, hemodialysi, sle, esrd...","allergi, percocet, lf, hemodialysi, sle, esrd,..."
4,"[allergi, percocet, namepattern, hemodialysi, ...","allergi, percocet, namepattern, hemodialysi, s..."


In [22]:
final_proc_WF = Full_WF['final_string']

In [23]:
#go through all documents and count word frequency
wordfreq2 = {}
for sentence in final_proc_WF:
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if token not in wordfreq2.keys():
            wordfreq2[token] = 1
        else:
            wordfreq2[token] += 1

In [24]:
import operator
sorted_WF = dict(sorted(wordfreq2.items(), key=operator.itemgetter(1),reverse=True))
sorted_WF

{',': 15866513,
 'pt': 603770,
 'mg': 583925,
 'ml': 440099,
 'pm': 359837,
 'hr': 242824,
 'dl': 240690,
 'ct': 199840,
 'iv': 167206,
 'meq': 141210,
 'bp': 134692,
 'icu': 125629,
 'hct': 118278,
 'cont': 102145,
 'rr': 97922,
 'namepattern': 96134,
 'cc': 90701,
 'increas': 88413,
 'imag': 80631,
 'abg': 80441,
 'wbc': 80126,
 'prn': 79085,
 'gtt': 77249,
 'cm': 75718,
 'neuro': 74169,
 'abd': 72308,
 'toler': 70838,
 'ap': 69909,
 'spo': 69588,
 'gi': 69481,
 'mm': 65860,
 'compar': 65186,
 'dr': 65177,
 'allergi': 65053,
 'stitl': 63497,
 'inr': 59739,
 'eval': 59629,
 'lasix': 59244,
 'ul': 58655,
 'intub': 58399,
 'picc': 57902,
 'mcg': 56836,
 'cxr': 56565,
 'ng': 55705,
 'dvt': 55684,
 'ivf': 55330,
 'antibiot': 54936,
 'cmh': 53994,
 'plt': 53766,
 'ed': 53508,
 'improv': 53308,
 'sbp': 52738,
 'md': 51886,
 'tf': 51455,
 'hd': 50661,
 'mmhg': 50402,
 'fio': 50383,
 'extrem': 49616,
 'med': 49459,
 'continu': 48764,
 'foley': 48068,
 'trach': 47978,
 'requir': 47848,
 'deni'

In [25]:
sorted(wordfreq2.items(), key=lambda item: item[1])

[('shapria', 1),
 ('intensvi', 1),
 ('naticoagul', 1),
 ('antihypertns', 1),
 ('recrat', 1),
 ('admissionwho', 1),
 ('exploaratori', 1),
 ('remiaind', 1),
 ('therapetu', 1),
 ('ckda', 1),
 ('enremark', 1),
 ('dialysu', 1),
 ('unsucesful', 1),
 ('introutu', 1),
 ('specuilum', 1),
 ('glucuronid', 1),
 ('plaquanil', 1),
 ('communicaton', 1),
 ('appointmnt', 1),
 ('exrd', 1),
 ('tinitu', 1),
 ('swellingand', 1),
 ('mechanica', 1),
 ('hyrperaklemia', 1),
 ('asppear', 1),
 ('sludi', 1),
 ('nkf', 1),
 ('kdoqi', 1),
 ('ium', 1),
 ('neccassari', 1),
 ('labelol', 1),
 ('bisacoldi', 1),
 ('repetat', 1),
 ('probablkt', 1),
 ('kubto', 1),
 ('regiven', 1),
 ('lobatolol', 1),
 ('azetranam', 1),
 ('kayexelatewil', 1),
 ('nitriat', 1),
 ('cais', 1),
 ('labetallol', 1),
 ('backneg', 1),
 ('dialyis', 1),
 ('microthromb', 1),
 ('infectioiu', 1),
 ('midd', 1),
 ('anel', 1),
 ('bpi', 1),
 ('dialy', 1),
 ('cleay', 1),
 ('subsis', 1),
 ('vsvc', 1),
 ('avpoid', 1),
 ('themodialyz', 1),
 ('hocom', 1),
 ('thromb

In [None]:
#all code below is before changing to new lemmatizing function
#using for comparison of words

In [10]:
#use original preprocess function on text before removing stop words and changing lemamtize function
processed_docs = documents['text_processed'].map(preprocess)
processed_docs

0         [allergi, penicillin, percocet, attend, lf, he...
1         [allergi, penicillin, percocet, attend, lf, ms...
2         [allergi, penicillin, percocet, attend, lf, mi...
3         [allergi, penicillin, percocet, attend, lf, he...
4         [allergi, penicillin, percocet, attend, namepa...
                                ...                        
323055    [lf, lf, cc, ct, ct, ct, intra, int, admit, po...
323056    [eval, ptx, admit, polytrauma, place, ptx, eva...
323057    [pm, ap, wk, wk, wk, radiograph, orif, fractur...
323058    [pm, ap, fx, fx, fx, view, orif, find, plat, c...
323059          [pm, ap, find, screw, screw, travers, heal]
Name: text_processed, Length: 323060, dtype: object

In [17]:
#before different preprocessing output
#removes stop word and removes anything from the english dictionary
#still receiving alot of dictionary words
#need to adjust lemmatizer so that all the english words are forms found in the dictionary
processed_docs2 = documents['text_processed'].map(preprocess2)
processed_docs2

0         [allergies, penicillins, percocet, attending, ...
1         [allergies, penicillins, percocet, attending, ...
2         [allergies, penicillins, percocet, attending, ...
3         [allergies, penicillins, percocet, attending, ...
4         [allergies, penicillins, percocet, attending, ...
                                ...                        
323055    [lf, lf, cc, ct, ct, ct, intra, int, admitting...
323056    [eval, ptx, admitting, polytrauma, placed, ptx...
323057    [pm, ap, wk, wk, wk, radiographs, orif, fractu...
323058    [pm, ap, fx, fx, fx, views, orif, findings, pl...
323059    [pm, ap, findings, screws, screws, traverses, ...
Name: text_processed, Length: 323060, dtype: object

In [20]:
"penicillin" in words

True

In [21]:
"penicillins" in words

False

In [11]:
processed_docs.iloc[1]

['allergi',
 'penicillin',
 'percocet',
 'attend',
 'lf',
 'ms',
 'lastnam',
 'esrd',
 'hd',
 'htn',
 'svc',
 'pre',
 'admiss',
 'ed',
 'complain',
 'hour',
 'dilaudid',
 'mg',
 'zofran',
 'mg',
 'iv',
 'hyperkalemia',
 'kayexal',
 'bp',
 'ed',
 'sbp',
 'hydral',
 'aliskeren',
 'labetalol',
 'iv',
 'hydral',
 'labetalol',
 'iv',
 'nicardipin',
 'mg',
 'iv',
 'start',
 'nicardipin',
 'gtt',
 'complain',
 'abd',
 'start',
 'earlier',
 'abd',
 'feel',
 'abd',
 'have',
 'earlier',
 'have',
 'episod',
 'stool',
 'week',
 'deni',
 'chang',
 'medic',
 'gi',
 'micu',
 'wean',
 'nicardipin',
 'bp',
 'remain',
 'wnl',
 'hd',
 'erythematosu',
 'mycophenol',
 'maintain',
 'prednison',
 'ckd',
 'esrd',
 'diagnos',
 'pd',
 'hd',
 'agre',
 'dialyz',
 'wk',
 'seizur',
 'event',
 'svc',
 'anticardiolipin',
 'antibodi',
 'igg',
 'igm',
 'hocm',
 'month',
 'depoprovera',
 'requir',
 'coag',
 'staph',
 'hd',
 'microangiopathi',
 'cpap',
 'mssa',
 'hd',
 'cin',
 'appt',
 'schedul',
 'hemorrhag',
 'le',
 'd

In [None]:
#create dictionary of words and number of appearances 
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break