# After defined models test in doc 4, move to new models

## Reference
* (2017, May 23). Clinical Intervention Prediction and Understanding using Deep Networks. Retrieved December 3, 2018, from https://arxiv.org/abs/1705.08498
* [NLP with Python](https://github.com/susanli2016/NLP-with-Python) reference
* [Topic Modeling](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24) reference

## Before any prediction, let's see what topic modeling finds

Note: <b>Topic model</b> is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Given that we have multiple clinicial notes (unstructured data) under each patient's history, we will practise what topic modeling brings us, before looking into more sophisticated machine learnings based on time series.

In [60]:
# import libraries
import pandas as pd
import numpy as np
np.random.seed(2018)

# topic modeling in python
import gensim              
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# natural language toolkit
import nltk                
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.stem.porter import *
nltk.download('wordnet')

# TF-IDF
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ericx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# test Lemmatize
[print(WordNetLemmatizer().lemmatize(item, pos='v')) 
    for item in ['went', 'goes', 'medicine', 'medicines', 'service', 'services']]

go
go
medicine
medicine
service
service


[None, None, None, None, None, None]

In [45]:
# define a function to perform lemmatize and stem preprocessing steps for the data set
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [29]:
# load data for a practise
# using 1000 lines of data from noteevents.csv table
noteevents = pd.read_csv('data/NOTEEVENTS.csv', nrows=1000)  
documents = noteevents[['SUBJECT_ID','TEXT']].sort_values(by=['SUBJECT_ID'], ascending=True)
print('type(documents): ', type(documents))
print('len(documents):', len(documents))
documents.head()

type(documents):  <class 'pandas.core.frame.DataFrame'>
len(documents): 1000


Unnamed: 0,SUBJECT_ID,TEXT
547,64,Admission Date: [**2143-3-3**] Discharg...
113,142,Admission Date: [**2143-4-1**] Discharge ...
824,151,Admission Date: [**2145-5-6**] Discharg...
921,170,Admission Date: [**2189-9-4**] D...
638,246,Admission Date: [**2130-6-1**] D...


In [49]:
doc_sample = documents[documents['SUBJECT_ID']==64].values[0][1]
print('original document:--------------')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n Tokenized and lemmatized documents:--------------- ')
print(preprocess(doc_sample))

original document:--------------
['Admission', 'Date:', '', '[**2143-3-3**]', '', '', '', '', '', '', 'Discharge', 'Date:', '', '[**2143-3-18**]\n\nDate', 'of', 'Birth:', '', '', '[**2116-6-27**]', '', '', '', '', '', '', 'Sex:', '', 'F\n\nService:', '', 'ACOVE\n\nCHIEF', 'COMPLAINT:', '', 'Fever.\n\nHISTORY', 'OF', 'PRESENT', 'ILLNESS:', '', 'The', 'patient', 'is', 'a', '26-year-old\nwoman', 'with', 'a', 'history', 'of', 'IV', 'drug', 'abuse,', 'who', 'initially\npresented', 'to', 'an', 'outside', 'hospital', 'on', '[**2143-3-2**]', 'from', 'a\ndrug', 'and', 'detoxification', 'facility', 'with', 'a', 'chief', 'complaint', 'of\nheadache,', 'abdominal', 'pain,', 'and', 'fever.', '', 'At', 'the', 'outside\nhospital,', 'the', 'patient', 'was', 'found', 'to', 'be', 'febrile', 'to', '104.6\ndegrees', 'F,', 'and', 'she', 'subsequently', 'developed', 'hypotension', 'with', 'a\nsystolic', 'blood', 'pressure', 'in', 'the', '80s.\n\nDuring', 'this', 'initial', 'evaluation,', 'the', 'patient', 'w

In [50]:
processed_docs = documents['TEXT'].map(preprocess)
processed_docs[:10]

547    [admiss, date, discharg, date, date, birth, se...
113    [admiss, date, discharg, date, date, birth, se...
824    [admiss, date, discharg, date, servic, cardiot...
921    [admiss, date, discharg, date, date, birth, se...
638    [admiss, date, discharg, date, date, birth, se...
407    [admiss, date, discharg, date, date, birth, se...
742    [admiss, date, discharg, date, date, birth, se...
743    [admiss, date, discharg, date, date, birth, se...
805    [admiss, date, discharg, date, date, birth, se...
252    [admiss, date, discharg, date, servic, medicin...
Name: TEXT, dtype: object

In [51]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [54]:
dictionary

ItemsView(<gensim.corpora.dictionary.Dictionary object at 0x00000219B55CD9E8>)

In [55]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abdomen
1 abdomin
2 abnorm
3 abort
4 abscess
5 abus
6 acov
7 acut
8 addit
9 adequ
10 admiss


In [56]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [58]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[64]

[(3, 2),
 (4, 4),
 (10, 1),
 (18, 5),
 (29, 2),
 (32, 1),
 (40, 1),
 (45, 1),
 (50, 1),
 (51, 2),
 (54, 1),
 (55, 4),
 (57, 10),
 (60, 1),
 (61, 1),
 (63, 1),
 (77, 1),
 (84, 2),
 (85, 2),
 (89, 1),
 (90, 7),
 (91, 1),
 (92, 4),
 (95, 1),
 (99, 2),
 (101, 3),
 (102, 3),
 (103, 1),
 (104, 3),
 (105, 2),
 (112, 3),
 (120, 6),
 (122, 3),
 (126, 3),
 (129, 1),
 (131, 1),
 (134, 8),
 (137, 1),
 (141, 1),
 (143, 1),
 (144, 1),
 (147, 1),
 (159, 2),
 (161, 3),
 (163, 1),
 (165, 1),
 (168, 1),
 (170, 1),
 (175, 1),
 (180, 2),
 (187, 1),
 (188, 2),
 (192, 1),
 (193, 6),
 (194, 1),
 (197, 3),
 (201, 2),
 (204, 2),
 (206, 1),
 (208, 2),
 (211, 1),
 (214, 1),
 (215, 2),
 (216, 1),
 (221, 1),
 (224, 2),
 (228, 1),
 (229, 1),
 (231, 1),
 (244, 1),
 (245, 1),
 (251, 5),
 (258, 1),
 (260, 7),
 (261, 1),
 (264, 2),
 (267, 3),
 (274, 2),
 (277, 1),
 (278, 8),
 (281, 1),
 (282, 1),
 (283, 2),
 (287, 1),
 (292, 1),
 (296, 1),
 (299, 1),
 (307, 4),
 (309, 3),
 (310, 1),
 (311, 6),
 (312, 1),
 (314, 8),
 (3

In [59]:
bow_doc_64 = bow_corpus[64]

for i in range(len(bow_doc_64)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_64[i][0], 
                                                     dictionary[bow_doc_64[i][0]], 
                                                     bow_doc_64[i][1]))

Word 3 ("abus") appears 2 time.
Word 4 ("addit") appears 4 time.
Word 10 ("albumin") appears 1 time.
Word 18 ("antibiot") appears 5 time.
Word 29 ("atelectasi") appears 2 time.
Word 32 ("atyp") appears 1 time.
Word 40 ("band") appears 1 time.
Word 45 ("bicarbon") appears 1 time.
Word 50 ("calcium") appears 1 time.
Word 51 ("call") appears 2 time.
Word 54 ("cardiovascular") appears 1 time.
Word 55 ("cathet") appears 4 time.
Word 57 ("cell") appears 10 time.
Word 60 ("check") appears 1 time.
Word 61 ("chemistri") appears 1 time.
Word 63 ("chlorid") appears 1 time.
Word 77 ("complic") appears 1 time.
Word 84 ("consist") appears 2 time.
Word 85 ("consult") appears 2 time.
Word 89 ("control") appears 1 time.
Word 90 ("count") appears 7 time.
Word 91 ("creatinin") appears 1 time.
Word 92 ("cultur") appears 4 time.
Word 95 ("decis") appears 1 time.
Word 99 ("degre") appears 2 time.
Word 101 ("depart") appears 3 time.
Word 102 ("develop") appears 3 time.
Word 103 ("diagnos") appears 1 time.
Wo

In [61]:
# TF-IDF
corpus_tfidf = tfidf[bow_corpus]

In [62]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.07868455196498843),
 (1, 0.04670363015139512),
 (2, 0.049809526865249004),
 (3, 0.08589417644905668),
 (4, 0.010786961320840201),
 (5, 0.020827843349254818),
 (6, 0.07917393765878254),
 (7, 0.0227117686491569),
 (8, 0.026542750528308793),
 (9, 0.02882932686821098),
 (10, 0.016595072945082518),
 (11, 0.00957024324743943),
 (12, 0.13092457685631823),
 (13, 0.0544723058259119),
 (14, 0.04606813778078839),
 (15, 0.023697827803126917),
 (16, 0.11596470991522938),
 (17, 0.02588141725393758),
 (18, 0.04543219461625163),
 (19, 0.019577542825540598),
 (20, 0.02393842765167954),
 (21, 0.04141083235529787),
 (22, 0.024538306914657918),
 (23, 0.06987584591801738),
 (24, 0.025582380806360116),
 (25, 0.10672119073433999),
 (26, 0.03133163979141356),
 (27, 0.028009120501188083),
 (28, 0.03495542790494231),
 (29, 0.01445443958985727),
 (30, 0.03464952133129495),
 (31, 0.026140851597319284),
 (32, 0.040737881764606555),
 (33, 0.062231806416226346),
 (34, 0.015515432234677913),
 (35, 0.0217265720

In [None]:
# Running LDA using Bag of Word

In [63]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [64]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"releas" + 0.006*"capsul" + 0.005*"effus" + 0.005*"refil" + 0.005*"disp" + 0.004*"locat" + 0.004*"renal" + 0.003*"angap" + 0.003*"aortic" + 0.003*"delay"
Topic: 1 
Words: 0.005*"urin" + 0.005*"capsul" + 0.004*"releas" + 0.004*"failur" + 0.004*"intub" + 0.004*"tube" + 0.004*"base" + 0.004*"hypotens" + 0.004*"month" + 0.003*"type"
Topic: 2 
Words: 0.006*"disp" + 0.006*"ventricular" + 0.006*"bleed" + 0.005*"releas" + 0.005*"capsul" + 0.005*"refil" + 0.004*"urin" + 0.004*"moder" + 0.004*"delay" + 0.003*"elev"
Topic: 3 
Words: 0.005*"effus" + 0.005*"cultur" + 0.004*"ventricular" + 0.004*"capsul" + 0.004*"bleed" + 0.004*"atrial" + 0.004*"releas" + 0.003*"pneumonia" + 0.003*"upper" + 0.003*"refil"
Topic: 4 
Words: 0.009*"postop" + 0.009*"coronari" + 0.006*"examin" + 0.005*"hematocrit" + 0.005*"reveal" + 0.005*"surgeri" + 0.005*"rhythm" + 0.005*"graft" + 0.005*"room" + 0.004*"count"
Topic: 5 
Words: 0.006*"capsul" + 0.005*"coronari" + 0.004*"releas" + 0.004*"renal" + 0.0

In [65]:
# Running LDA using TF-IDF

In [66]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [67]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"ileostomi" + 0.002*"bleed" + 0.002*"postop" + 0.002*"atrial" + 0.002*"fibril" + 0.002*"oper" + 0.002*"hypotens" + 0.002*"graft" + 0.001*"coronari" + 0.001*"coumadin"
Topic: 1 Word: 0.002*"mouth" + 0.002*"fractur" + 0.002*"aspir" + 0.002*"tube" + 0.002*"renal" + 0.002*"expir" + 0.002*"capsul" + 0.002*"intub" + 0.002*"seizur" + 0.002*"pneumonia"
Topic: 2 Word: 0.003*"postop" + 0.003*"bleed" + 0.002*"oper" + 0.002*"mouth" + 0.002*"hematocrit" + 0.002*"percent" + 0.002*"coronari" + 0.002*"valv" + 0.002*"seizur" + 0.002*"vein"
Topic: 3 Word: 0.004*"disp" + 0.004*"refil" + 0.004*"valv" + 0.004*"aortic" + 0.003*"ventricular" + 0.003*"releas" + 0.003*"mitral" + 0.003*"capsul" + 0.003*"coronari" + 0.002*"coumadin"
Topic: 4 Word: 0.002*"postop" + 0.002*"aneurysm" + 0.002*"capsul" + 0.002*"graft" + 0.002*"angioplasti" + 0.002*"aortic" + 0.001*"coronari" + 0.001*"rehabilit" + 0.001*"femor" + 0.001*"renal"
Topic: 5 Word: 0.002*"postop" + 0.002*"valv" + 0.002*"tube" + 0.002*"ao

In [68]:
# Classification of the topics
# Performance evaluation by classifying sample document using LDA Bag of Words model

In [69]:
processed_docs[64]

['admiss',
 'date',
 'discharg',
 'date',
 'date',
 'birth',
 'servic',
 'medicin',
 'allergi',
 'know',
 'allergi',
 'advers',
 'drug',
 'reaction',
 'attend',
 'doctor',
 'chief',
 'complaint',
 'blurri',
 'vision',
 'major',
 'surgic',
 'invas',
 'procedur',
 'histori',
 'present',
 'ill',
 'year',
 'male',
 'histori',
 'hypertens',
 'beta',
 'blockad',
 'lasix',
 'hydralazin',
 'imdur',
 'metolazon',
 'complic',
 'chronic',
 'kidney',
 'diseas',
 'stage',
 'obes',
 'tobacco',
 'abus',
 'present',
 'today',
 'emerg',
 'room',
 'hospit',
 'clinic',
 'note',
 'sever',
 'bilater',
 'papilledema',
 'opthomolog',
 'suggest',
 'secondari',
 'elev',
 'intracrani',
 'pressur',
 'set',
 'malign',
 'hypertens',
 'pseudotumor',
 'cerebri',
 'symptomat',
 'blurri',
 'vision',
 'past',
 'month',
 'absenc',
 'headach',
 'central',
 'symptom',
 'note',
 'gait',
 'instabl',
 'blood',
 'pressur',
 'note',
 'take',
 'usual',
 'dose',
 'blood',
 'pressur',
 'med',
 'unknown',
 'improv',
 'systol',
 't

In [70]:
for index, score in sorted(lda_model[bow_corpus[64]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7687466740608215	 
Topic: 0.009*"postop" + 0.009*"coronari" + 0.006*"examin" + 0.005*"hematocrit" + 0.005*"reveal" + 0.005*"surgeri" + 0.005*"rhythm" + 0.005*"graft" + 0.005*"room" + 0.004*"count"

Score: 0.098329097032547	 
Topic: 0.007*"releas" + 0.006*"capsul" + 0.005*"effus" + 0.005*"refil" + 0.005*"disp" + 0.004*"locat" + 0.004*"renal" + 0.003*"angap" + 0.003*"aortic" + 0.003*"delay"

Score: 0.07759689539670944	 
Topic: 0.006*"renal" + 0.005*"bleed" + 0.005*"abdomin" + 0.004*"failur" + 0.004*"hematocrit" + 0.003*"cell" + 0.003*"examin" + 0.003*"treat" + 0.003*"diabet" + 0.003*"elev"

Score: 0.04212407022714615	 
Topic: 0.005*"seizur" + 0.004*"head" + 0.004*"urin" + 0.004*"pneumonia" + 0.004*"locat" + 0.003*"mental" + 0.003*"fractur" + 0.003*"month" + 0.003*"tube" + 0.003*"small"

Score: 0.012401793152093887	 
Topic: 0.005*"effus" + 0.005*"cultur" + 0.004*"ventricular" + 0.004*"capsul" + 0.004*"bleed" + 0.004*"atrial" + 0.004*"releas" + 0.003*"pneumonia" + 0.003*"upper" +

In [None]:
# Performance evaluation by classifying sample document using LDA TF-IDF model

In [72]:
for index, score in sorted(lda_model_tfidf[bow_corpus[64]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9216979742050171	 
Topic: 0.002*"mouth" + 0.002*"fractur" + 0.002*"aspir" + 0.002*"tube" + 0.002*"renal" + 0.002*"expir" + 0.002*"capsul" + 0.002*"intub" + 0.002*"seizur" + 0.002*"pneumonia"

Score: 0.07701976597309113	 
Topic: 0.003*"postop" + 0.003*"bleed" + 0.002*"oper" + 0.002*"mouth" + 0.002*"hematocrit" + 0.002*"percent" + 0.002*"coronari" + 0.002*"valv" + 0.002*"seizur" + 0.002*"vein"
