## Part 4: Topic Modelling on Clinical Notes
### Part a:

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string, re
import multiprocessing as mp

In [2]:
# Notebook display setting: show the result of every top-level expression in
# a cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Load the gzip-compressed notes table, then discard every row whose
# `chartext` cell is missing.
dataset = pd.read_csv("~/csc2548_ml4h/adult_notes.gz", compression="gzip")
has_text = dataset["chartext"].notna()
dataset = dataset[has_text]

# Column that is fed to the tokeniser.
chartext = dataset["chartext"]

In [15]:
# Domain-specific stop words removed in addition to NLTK's English list.
# NOTE(review): `tokenize` strips punctuation from tokens, so the entries
# written with "-" or "." ("side-effect", "over-the-counter", "dr.") only
# take effect if the comparison accounts for that.
medicalStopwordList = [
    "patient",
    "patients",
    "disease",
    "diseases",
    "disorder",
    "symptom",
    "symptoms",
    "drug",
    "drugs",
    "problems",
    "problem",
    "prob",
    "probs",
    "med",
    "meds",
    "pill",
    "pills",
    "medicine",
    "medicines",
    "medication",
    "medications",
    "treatment",
    "treatments",
    "caps",
    "capsules",
    "capsule",
    "tablet",
    "tablets",
    "tabs",
    "doctor",
    "dr",
    "dr.",
    "doc",
    "physician",
    "physicians",
    "test",
    "tests",
    "testing",
    "specialist",
    "specialists",
    "side-effect",
    "side-effects",
    "pharmaceutical",
    "pharmaceuticals",
    "pharma",
    "diagnosis",
    "diagnose",
    "diagnosed",
    "exam",
    "challenge",
    "device",
    "condition",
    "conditions",
    "suffer",
    "suffering",
    "suffered",
    "feel",
    "feeling",
    "prescription",
    "prescribe",
    "prescribed",
    "over-the-counter",
    "otc",
]

# Short abbreviations to drop as well -- presumably measurement units as they
# look once punctuation has been stripped (e.g. "mg/dl" -> "mgdl"); verify
# against the notes.
units = [
    "ml",
    "mgdl",
    "pt",
    "pm",
    "meql",
]

# Full stop-word list: NLTK English words, then the medical terms, then units.
stopwordList = stopwords.words("english") + medicalStopwordList + units

# Compiled once at definition time; raw strings avoid the invalid "\W" / "\d"
# escape sequences that plain string literals would contain.
_NON_WORD_RE = re.compile(r"[\W_]+|^\d$")
_SINGLE_DIGIT_RE = re.compile(r"^\d$")

# Snapshot of the stop words for O(1) membership tests (the list scan was
# O(len(stopwordList)) per token).  Rebuild it if `stopwordList` is changed
# after this point.
_STOPWORD_SET = frozenset(stopwordList)


def _clean_token(token):
    """Delete non-word characters and underscores; a lone digit becomes ''."""
    # The second pass catches tokens that only turn into a single digit once
    # the punctuation is gone (e.g. "2**" -> "2" -> "").
    return _SINGLE_DIGIT_RE.sub("", _NON_WORD_RE.sub("", token))


def tokenize(s):
    """Split the note text *s* into cleaned, lower-cased, non-stop-word tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``.  Each
    token then has punctuation/underscores removed; tokens that end up empty
    or as a single digit are discarded.

    A token is treated as a stop word when either its raw form or its cleaned
    form is in ``stopwordList``.  Checking the raw form as well is what lets
    stop words that are written with punctuation (such as "side-effect" or
    "over-the-counter") match; comparing only the cleaned form can never hit
    them because the hyphens have already been stripped.

    Returns:
        list[str]: the surviving cleaned tokens, in document order.
    """
    tokens = []
    for raw in word_tokenize(s.lower()):
        if raw in _STOPWORD_SET:
            continue
        cleaned = _clean_token(raw)
        if cleaned and cleaned not in _STOPWORD_SET:
            tokens.append(cleaned)
    return tokens

# Tokenise all notes in parallel with one worker process per CPU core.
# The context manager shuts the pool down even when `map` raises, so a failed
# run does not leave worker processes behind; by the time the block exits
# normally `map` has already returned every result.
with mp.Pool(mp.cpu_count()) as pool:
    corpus = pool.map(tokenize, chartext)

In [5]:
#len(corpus[0])

In [6]:
#word_tokenize(chartext[0].lower())
#re.sub("[\W_]+|^\d$", '', "2**")

In [7]:
# Build the gensim vocabulary (token <-> integer id mapping) from the
# tokenised notes.
dictionary = Dictionary(corpus)

In [8]:
#dictionary.filter_extremes(no_below=0, no_above=0.75, keep_n=None)

In [9]:
# Convert every tokenised note into its bag-of-words representation
# (`dictionary.doc2bow`) in parallel, one worker process per CPU core.
# The context manager shuts the pool down even when `map` raises.
with mp.Pool(mp.cpu_count()) as pool:
    bow_corpus = pool.map(dictionary.doc2bow, corpus)
# Optional memory relief (left disabled): neither name is referenced by the
# active code in the cells shown after this one.
#del chartext
#del corpus

In [10]:
def generate_lda(topics: int, random_state=None) -> "tuple[LdaModel, float]":
    """Train an LDA model with *topics* topics and report its coherence.

    The model is fitted on the module-level ``bow_corpus`` using the
    module-level ``dictionary``, scored with the "u_mass" coherence measure,
    and saved to the file ``lda_<topics>`` in the working directory.  The
    score is also printed.

    Args:
        topics: number of latent topics to fit.
        random_state: optional seed forwarded to ``LdaModel``.  The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the topics and the coherence score reproducible across runs.

    Returns:
        A ``(model, score)`` pair: the trained ``LdaModel`` and its u_mass
        coherence score.
    """
    lda = LdaModel(bow_corpus, num_topics=topics, id2word=dictionary,
                   random_state=random_state)
    # Alternative kept for reference: LdaMulticore(bow_corpus, num_topics=topics,
    # id2word=dictionary, workers=mp.cpu_count(), passes=5,
    # per_word_topics=False, chunksize=1000).

    cm = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=dictionary, coherence="u_mass")
    score = cm.get_coherence()

    print(f"LDA: # of topics: {topics}, Coherence score: {score}\n")

    lda.save(f"lda_{topics}")

    return lda, score

# Fit and score one model per candidate topic count (20, 50, 100).  Each call
# also prints the coherence score and saves the model as "lda_<topics>".
lda_20, score_20 = generate_lda(20)
lda_50, score_50 = generate_lda(50)
lda_100, score_100 = generate_lda(100)

LDA: # of topics: 20, Coherence score: -0.5398971787156316

LDA: # of topics: 50, Coherence score: -0.7838344510175967

LDA: # of topics: 100, Coherence score: -0.7365448682718234



LDA: # of topics: 20, Coherence score: -0.320345803714344

LDA: # of topics: 50, Coherence score: -0.3141471647049764

LDA: # of topics: 100, Coherence score: -0.3286962751461799

How many topics give the optimal coherence score?

### Part b:

In [None]:
# Model used for the word-level inspection in this part: the 20-topic one.
lda = lda_20

#lda.get_term_topics(dictionary.token2id["pulse"])

def examine_word(word: str, model: LdaModel):
    """Print the topics *model* assigns to *word* and each topic's top terms.

    The word is looked up through the module-level ``dictionary``.  If it is
    not in the vocabulary a short message is printed and nothing else happens
    (previously this case surfaced as an ``IndexError``).

    Args:
        word: a single token, in the same form the tokeniser produces.
        model: a trained LDA model (``LdaModel`` or a subclass).

    Returns:
        None. All results are printed.
    """
    bow = dictionary.doc2bow([word])
    if not bow:
        # doc2bow silently drops tokens it does not know, leaving nothing to
        # index into below.
        print(f"\nWord \"{word}\" not found in the dictionary")
        return

    # With per_word_topics=True the result is
    # (document_topics, word_topics, phi_values); word_topics holds one
    # (word_id, [topic ids]) pair per word in the bag.
    result = model.get_document_topics(bow, per_word_topics=True)
    topics = result[1][0][1]
    print(f"\nWord \"{word}\" found in topics {topics}")
    for topic in topics:
        print(f"Topic {topic}:")
        print(model.show_topic(topic, topn=20))
    
# Inspect the topics associated with "respiratory".
examine_word("respiratory", lda)

In [None]:
# Inspect the topics associated with "vomiting".
examine_word("vomiting", lda)

In [None]:
# Inspect the topics associated with "urine".
examine_word("urine", lda)

In [None]:
# Inspect the topics associated with "pulse".
examine_word("pulse", lda)

In [None]:
#lda.print_topics(num_topics=50, num_words=20)

In [None]:
#word = "respiratory"
#dictionary.doc2bow([word])
#lda_50.get_document_topics(dictionary.doc2bow([word]), per_word_topics=True)