# Topic Modeling

In [1]:
%load_ext autoreload
%autoreload 2
from cord import ResearchPapers
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dwight\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
research_papers = ResearchPapers.from_data_dir().nlp()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Fixing dates that are a list e.g. "['2020-02-05', '2020-02']"
Fixing dates with the seasons e.g. "2014 Autumn"
Fixing dates like "2016 Nov 9 Jan-Feb"
Fixing dates like "2012 Jan-Mar"
Converting dates like "2020 Apr 13"
Converting Dates like "2020 Apr"
Converting dates like "2020"
Converting Dates like "2020-01-21"
Indexing research papers
Finished Indexing in 61.0 seconds
Getting gensim LDA topic model
Assigning LDA topics


In [6]:
research_papers.metadata.loc[0, 'topic_vector']

[0, 0, 0.94046354, 0.055671223, 0, 0]

In [7]:
research_papers.save()

Saving to data\ResearchPapers.pickle


In [3]:
import gensim
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore

dictionary = gensim.corpora.Dictionary(research_papers.index_tokens)
corpus =  [dictionary.doc2bow(text) for text in research_papers.index_tokens]

In [4]:
%time lda_model: LdaModel = LdaMulticore(corpus, num_topics=6, passes=4)

Wall time: 31.8 s


In [8]:
def get_topic(tokens):
    print(tokens)
    bow = dictionary.doc2bow(tokens)
    tp =lda_model[bow]
    return tp

get_topic(research_papers.index_tokens[0])

['etiologic', 'basis', 'vast', 'majority', 'cases', 'congenital', 'heart', 'disease', 'remains', 'largely', 'undefined', 'viruses', 'considered', 'likely', 'candidates', 'since', 'recognition', 'association', 'intrauterine', 'rubella', 'congenital', 'heart', 'disease', 'although', 'pathogenesis', 'cardiovascular', 'defects', 'poorly', 'understood', 'information', 'gained', 'study', 'congenital', 'rubella', 'syndrome', 'suggests', 'mechanisms', 'focal', 'endothelial', 'cell', 'damage', 'resulting', 'obliteration', 'vascular', 'supply', 'decreased', 'growth', 'rate', 'shortened', 'survival', 'time', 'certain', 'cells', 'disturbed', 'dna', 'replication', 'cells', 'whose', 'chromosomes', 'damaged', 'secondary', 'effects', 'virus', 'replication', 'may', 'operative', 'production', 'defects', 'developing', 'fetus', 'addition', 'rubella', 'suggestive', 'conclusive', 'evidence', 'coxsackie', 'b3', 'b4', 'virus', 'infections', 'pregnancy', 'result', 'birth', 'infants', 'variety', 'types', 'conge

[(0, 0.56830376), (1, 0.018690336), (2, 0.3718453), (3, 0.03921325)]

In [10]:
lda_model.get_topics()

array([[5.1721145e-05, 6.2091008e-04, 3.3983318e-04, ..., 2.3220653e-07,
        2.0875653e-07, 2.1233468e-07],
       [1.4734658e-04, 6.6963915e-04, 4.7306938e-04, ..., 3.2604504e-07,
        2.6012282e-07, 2.7325012e-07],
       [3.4159351e-05, 1.0488856e-03, 3.2214075e-04, ..., 1.6734226e-07,
        1.5442777e-07, 1.5418497e-07],
       [4.6838486e-05, 5.3613802e-04, 2.3463421e-04, ..., 4.4890251e-07,
        4.4080508e-07, 4.4185899e-07],
       [3.7095244e-05, 9.1707672e-04, 3.0397286e-04, ..., 4.3560881e-06,
        1.2895676e-06, 1.2213334e-06],
       [8.1452235e-05, 6.7815394e-04, 3.5938044e-04, ..., 2.3951617e-07,
        2.1858223e-07, 2.1954524e-07]], dtype=float32)

## Coherence Scores

In [13]:
from gensim.models.coherencemodel import CoherenceModel

def get_lda_model(papers, num_topics=4, passes=4):
    dictionary = gensim.corpora.Dictionary(papers.index_tokens)
    corpus =  [dictionary.doc2bow(text) for text in papers.index_tokens]
    lda_model: LdaModel = LdaMulticore(corpus, num_topics=num_topics, passes=4)
    return lda_model

def calculate_coherence_score(documents, dictionary, model):
    coherence_model = CoherenceModel(model=model, texts=documents, dictionary=dictionary,coherence='c_v')
    return coherence_model.get_coherence()

def get_coherence_scores(papers, start, stop):
    dictionary = gensim.corpora.Dictionary(papers.index_tokens)
    corpus =  [dictionary.doc2bow(text) for text in papers.index_tokens]
    scores = []
    for num_topics in range(start, stop):
        print('Calculating coherence for', num_topics, 'topics')
        l_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=6)
        coherence = calculate_coherence_score(papers.index_tokens, dictionary, l_model)
        scores.append({'topics':num_topics, 'coherence': coherence})
    return pd.DataFrame(scores)
        

min_topics, max_topics = 4, 15
coherence_scores = get_coherence_scores(research_papers.since_sarscov2(), min_topics,  max_topics)
alt.Chart(coherence_scores).mark_line().encode(
    x = 'topics',
    y = 'coherence'
)

Calculating coherence for 4 topics
Calculating coherence for 5 topics
Calculating coherence for 6 topics
Calculating coherence for 7 topics
Calculating coherence for 8 topics
Calculating coherence for 9 topics
Calculating coherence for 10 topics
Calculating coherence for 11 topics
Calculating coherence for 12 topics
Calculating coherence for 13 topics
Calculating coherence for 14 topics


In [8]:
scores = pd.DataFrame({'score': coherence_scores, 
                       'topics': list(range(min_topics, max_topics))})
scores

Unnamed: 0,score,topics
0,0.439198,3
1,0.465439,4
2,0.454378,5
3,0.43836,6
4,0.437134,7
5,0.466303,8
6,0.468629,9
7,0.454754,10
8,0.460549,11


In [9]:
def get_lda_model(papers, num_topics=6, passes=4):
    dictionary = gensim.corpora.Dictionary(papers.index_tokens)
    corpus =  [dictionary.doc2bow(text) for text in papers.index_tokens]
    lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    return lda_model

In [18]:
a = [(0, 0.2), (1, 0.1), (2, 0.4), (3, 0.05), (4, 0.25)]
a.sort(key=lambda tup: tup[1], reverse=True)
