## Part 3: Topic Modelling on Clinical Notes
### Part a:

In [137]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string, re
import multiprocessing as mp

In [22]:
from IPython.core.interactiveshell import InteractiveShell
# Have the notebook display the value of every top-level expression in a
# cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [48]:
# Read the gzip-compressed notes table, then keep only the rows whose
# `chartext` column is not null.
dataset = pd.read_csv("~/csc2548_ml4h/adult_notes.gz", compression='gzip')
dataset = dataset.loc[dataset.chartext.notnull()]

# Column of note text that the later cells tokenize.
chartext = dataset.chartext

In [49]:
stopwordList = stopwords.words("english")

# Frozen-set view of the stop words: membership tests are O(1) instead of a
# linear scan of the list for every token.
_STOPWORDS = frozenset(stopwordList)

# Matches runs of non-word characters and underscores. Written as a raw
# string so the backslash reaches the regex engine without relying on
# Python's deprecated handling of unrecognised string escapes, and compiled
# once instead of being looked up for every token.
_NON_WORD_RUN = re.compile(r"[\W_]+")


def tokenize(s):
    """Lower-case and tokenize one note, returning its cleaned tokens.

    Each token produced by ``word_tokenize`` has non-word characters and
    underscores stripped out; tokens that end up empty or that appear in
    the English stop-word list are dropped.

    Args:
        s: Text of a single note.

    Returns:
        List of cleaned token strings, in their original order.
    """
    cleaned = (_NON_WORD_RUN.sub('', token) for token in word_tokenize(s.lower()))
    return [token for token in cleaned if token and token not in _STOPWORDS]

# Tokenize all notes in parallel with one worker process per reported CPU.
# The ``with`` block shuts the pool down even if a worker raises, so a failed
# run does not leave worker processes behind; ``map`` has already collected
# every result by the time the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    corpus = pool.map(tokenize, chartext)

In [50]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# tokenized notes.
dictionary = Dictionary(corpus)

In [51]:
# Convert every tokenized note to its bag-of-words form in parallel.
# The ``with`` block shuts the pool down even if a worker raises; ``map`` has
# already collected every result by the time the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    bow_corpus = pool.map(dictionary.doc2bow, corpus)

In [None]:
def generate_lda(topics: int) -> "tuple[LdaMulticore, float]":
    """Train an LDA model with ``topics`` topics, score it, and save it.

    The model is fitted on the module-level ``bow_corpus`` and
    ``dictionary``, scored with the ``u_mass`` coherence measure, and
    written to the path returned by ``datapath("lda_<topics>")``. The
    coherence score is also printed.

    Args:
        topics: Number of topics to fit.

    Returns:
        ``(lda, score)``: the trained model and its coherence score.
    """
    lda = LdaMulticore(bow_corpus, num_topics=topics, id2word=dictionary,
                       workers=mp.cpu_count(), passes=1, per_word_topics=True)

    cm = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=dictionary, coherence="u_mass")
    score = cm.get_coherence()

    print(f"LDA: # of topics: {topics}, Coherence score: {score}\n")

    # NOTE(review): datapath() is imported from gensim.test.utils, so this
    # presumably resolves inside gensim's own test-data directory — confirm
    # that is where the models are meant to live.
    lda.save(datapath(f"lda_{topics}"))

    return lda, score

# Fit, score and save one model per topic count being compared (20, 50, 100).
lda_20, score_20 = generate_lda(20)
lda_50, score_50 = generate_lda(50)
lda_100, score_100 = generate_lda(100)

### Part b:

In [133]:
# Ask the model which topics the term "pulse" is associated with.
# NOTE(review): no visible cell binds a top-level `lda` (generate_lda's
# results are stored as lda_20 / lda_50 / lda_100) — confirm which model is
# meant here.
lda.get_term_topics(dictionary.token2id["pulse"])

def examine_word(word: str, model: LdaMulticore) -> None:
    """Print the topics ``model`` assigns to ``word`` and each topic's terms.

    The word is converted to a one-token bag-of-words with the module-level
    ``dictionary`` and passed to ``model.get_document_topics`` with
    ``per_word_topics=True``. For every topic listed for the word, the
    output of ``model.show_topic`` is printed.

    Args:
        word: Token to look up.
        model: Trained LDA model to query.
    """
    bow = dictionary.doc2bow([word])
    if not bow:
        # Out-of-vocabulary word: the per-word topic list would be empty and
        # indexing into it below would raise an opaque IndexError.
        print(f"\nWord \"{word}\" not found in dictionary")
        return

    # Element 1 of the result holds one (word_id, topic_ids) pair per word in
    # the document; the document here has exactly one word.
    word_topics = model.get_document_topics(bow, per_word_topics=True)[1]
    topics = word_topics[0][1]

    print(f"\nWord \"{word}\" found in topics {topics}")
    for topic in topics:
        print(f"Topic {topic}:")
        print(model.show_topic(topic))
    
# Print the topic assignments for several example terms, in this order.
for term in ("respiratory", "vomiting", "urine", "pulse"):
    examine_word(term, lda)

[]


Word "respiratory" found in topics [13]
Topic 13:
[('ml', 0.008362081), ('pt', 0.007417203), ('pm', 0.0066727293), ('name', 0.0059811734), ('plan', 0.0051126853), ('left', 0.0049168533), ('chest', 0.0046872776), ('mgdl', 0.004559539), ('right', 0.0043463064), ('last', 0.0042483276)]

Word "vomiting" found in topics [13]
Topic 13:
[('ml', 0.008362081), ('pt', 0.007417203), ('pm', 0.0066727293), ('name', 0.0059811734), ('plan', 0.0051126853), ('left', 0.0049168533), ('chest', 0.0046872776), ('mgdl', 0.004559539), ('right', 0.0043463064), ('last', 0.0042483276)]

Word "urine" found in topics [13]
Topic 13:
[('ml', 0.008362081), ('pt', 0.007417203), ('pm', 0.0066727293), ('name', 0.0059811734), ('plan', 0.0051126853), ('left', 0.0049168533), ('chest', 0.0046872776), ('mgdl', 0.004559539), ('right', 0.0043463064), ('last', 0.0042483276)]

Word "pulse" found in topics [12]
Topic 12:
[('pt', 0.009587039), ('ml', 0.0056111817), ('left', 0.0052784607), ('plan', 0.0046334383), ('last', 0.004530