## Part 3: Topic Modelling on Clinical Notes
### Part a:

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string, re
import multiprocessing as mp

In [2]:
# Display the value of every top-level expression statement in a cell, not
# only the last one; later cells rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Read the gzip-compressed notes table, then keep only the rows whose
# `chartext` field is present.
dataset = pd.read_csv("~/csc2548_ml4h/adult_notes.gz", compression="gzip")
dataset = dataset[dataset["chartext"].notna()]

# The note-text column on its own; this is what gets tokenized later.
chartext = dataset["chartext"]

In [4]:
stopwordList = stopwords.words("english")

# Frozen copy of the stop words for O(1) membership tests; the list above is
# kept under its original name for any other code that reads it.
_STOPWORD_SET = frozenset(stopwordList)

# Runs of non-word characters and underscores. Written as a raw string so
# that `\W` reaches the regex engine instead of being an invalid str escape.
_NON_ALNUM_RE = re.compile(r"[\W_]+")


def tokenize(s):
    """Split a note into lower-case tokens with punctuation and stop words removed.

    Args:
        s: The text of one note.

    Returns:
        A list of tokens, in order of appearance. Each token has had every
        non-word character and underscore stripped; tokens that end up empty
        or that are English stop words are dropped.
    """
    stripped = (_NON_ALNUM_RE.sub("", tok) for tok in word_tokenize(s.lower()))
    return [tok for tok in stripped if tok and tok not in _STOPWORD_SET]

# Tokenize every note in parallel, one worker process per CPU core.
# The context manager tears the workers down even when a tokenize call raises;
# pool.map has already gathered every result before the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    corpus = pool.map(tokenize, chartext)

In [5]:
# Build the gensim token <-> integer-id mapping from the tokenized notes.
dictionary = Dictionary(corpus)

In [6]:
# Convert each tokenized note to its bag-of-words form in parallel, one worker
# process per CPU core. The context manager tears the workers down even when a
# conversion raises; pool.map has already gathered every result before the
# block exits.
with mp.Pool(mp.cpu_count()) as pool:
    bow_corpus = pool.map(dictionary.doc2bow, corpus)

# Only the bag-of-words corpus is used from here on; drop the raw text and
# the token lists.
del chartext
del corpus

In [7]:
def generate_lda(topics: int) -> "tuple[LdaMulticore, float]":
    """Train an LDA model, print its coherence score, and save it to disk.

    The model is fitted on the module-level ``bow_corpus`` with ``dictionary``
    as its id-to-word mapping, scored with the ``u_mass`` coherence measure,
    and saved under the file name ``lda_<topics>``.

    Args:
        topics: Number of topics to fit.

    Returns:
        A ``(model, score)`` pair: the trained ``LdaMulticore`` model and its
        ``u_mass`` coherence score.
    """
    lda = LdaMulticore(
        bow_corpus,
        num_topics=topics,
        id2word=dictionary,
        workers=mp.cpu_count(),
        passes=5,
        per_word_topics=False,
        chunksize=1000,
    )

    cm = CoherenceModel(
        model=lda,
        corpus=bow_corpus,
        dictionary=dictionary,
        coherence="u_mass",
    )
    score = cm.get_coherence()

    print(f"LDA: # of topics: {topics}, Coherence score: {score}\n")

    lda.save(f"lda_{topics}")

    return lda, score

# Only the 50-topic model is trained when this cell runs; the 20- and
# 100-topic runs are left commented out.
#lda_20, score_20 = generate_lda(20)
lda_50, score_50 = generate_lda(50)
#lda_100, score_100 = generate_lda(100)

LDA: # of topics: 50, Coherence score: -0.35583472798066895



LDA: # of topics: 20, Coherence score: -0.320345803714344

LDA: # of topics: 50, Coherence score: -0.3141471647049764

LDA: # of topics: 100, Coherence score: -0.3286962751461799

### Part b:

In [8]:
# Use the 50-topic model for the word-level inspection in this cell.
lda = lda_50

# Ask the model directly which topics it associates with the token "pulse".
# The token is translated to its integer id through the dictionary first.
lda.get_term_topics(dictionary.token2id["pulse"])

def examine_word(word: str, model: LdaMulticore):
    """Print the topics ``model`` assigns to ``word`` and each topic's top terms.

    The word is treated as a one-word document (built with the module-level
    ``dictionary``) and passed to ``model.get_document_topics`` with
    ``per_word_topics=True``.

    Args:
        word: The token to look up.
        model: The trained LDA model to query.

    Returns:
        None. Results are printed.
    """
    bow = dictionary.doc2bow([word])
    if not bow:
        # A word missing from the dictionary yields an empty bag-of-words, so
        # the per-word result list would be empty and indexing it would raise
        # IndexError. Report it instead of crashing.
        print(f"\nWord \"{word}\" is not in the dictionary")
        return

    # With per_word_topics=True the result is a 3-tuple; its second element
    # holds one (word_id, [topic_id, ...]) entry per word in the document.
    _, word_topics, _ = model.get_document_topics(bow, per_word_topics=True)
    topics = word_topics[0][1]

    print(f"\nWord \"{word}\" found in topics {topics}")
    for topic in topics:
        print(f"Topic {topic}:")
        print(model.show_topic(topic))
    
# Run the inspection for each term of interest, in this order.
for term in ("respiratory", "vomiting", "urine", "pulse"):
    examine_word(term, lda)

[]


Word "respiratory" found in topics []

Word "vomiting" found in topics []

Word "urine" found in topics []

Word "pulse" found in topics []


In [9]:
# Show the intermediate values for a single word: first its one-word
# bag-of-words, then the raw (document_topics, word_topics, word_phis) tuple
# that get_document_topics returns for it.
# NOTE(review): presumably added to diagnose the empty topic lists printed by
# examine_word -- confirm.
word = "respiratory"
dictionary.doc2bow([word])
lda_50.get_document_topics(dictionary.doc2bow([word]), per_word_topics=True)

[(658, 1)]

([(0, 0.02),
  (1, 0.02),
  (2, 0.02),
  (3, 0.02),
  (4, 0.02),
  (5, 0.02),
  (6, 0.02),
  (7, 0.02),
  (8, 0.02),
  (9, 0.02),
  (10, 0.02),
  (11, 0.02),
  (12, 0.02),
  (13, 0.02),
  (14, 0.02),
  (15, 0.02),
  (16, 0.02),
  (17, 0.02),
  (18, 0.02),
  (19, 0.02),
  (20, 0.02),
  (21, 0.02),
  (22, 0.02),
  (23, 0.02),
  (24, 0.02),
  (25, 0.02),
  (26, 0.02),
  (27, 0.02),
  (28, 0.02),
  (29, 0.02),
  (30, 0.02),
  (31, 0.02),
  (32, 0.02),
  (33, 0.02),
  (34, 0.02),
  (35, 0.02),
  (36, 0.02),
  (37, 0.02),
  (38, 0.02),
  (39, 0.02),
  (40, 0.02),
  (41, 0.02),
  (42, 0.02),
  (43, 0.02),
  (44, 0.02),
  (45, 0.02),
  (46, 0.02),
  (47, 0.02),
  (48, 0.02),
  (49, 0.02)],
 [(658, [])],
 [(658, [])])