In [1]:
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime
    
import numpy as np 
import pandas as pd
from tqdm import tqdm

time: 906 ms


# Load and Prepare Data

In [2]:
# Read the preprocessed metadata table (one row per paper) and report its
# dimensions as a (rows, columns) tuple.
df = pd.read_csv(filepath_or_buffer='../data/df_cleaned.csv')
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(309120, 5)
time: 6.48 s


In [3]:
# Number of missing values in each column of the metadata table.
df.isnull().sum()

title               16
abstract             0
publish_time       175
authors           2279
url             101337
dtype: int64

time: 125 ms


In [4]:
# Preview the first five rows of the metadata table.
df.head(n=5)

Unnamed: 0,title,abstract,publish_time,authors,url
0,Clinical features of culture-proven Mycoplasma...,objective retrospective chart describes epidem...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1,Nitric oxide: a pro-inflammatory mediator in l...,inflammatory diseases respiratory tract common...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
2,Surfactant protein-D and pulmonary host defense,surfactant protein-d sp-d participates innate ...,2000-08-25,"Crouch, Erika C",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
3,Role of endothelin-1 in lung disease,endothelin-1 et-1 amino acid peptide diverse b...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,Gene expression in epithelial cells in respons...,respiratory syncytial virus rsv pneumonia viru...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...


time: 15 ms


The topic model below is fitted on the text body of each paper; the same approach could also be applied to the abstracts alone.

In [5]:
# Load the already preprocessed and tokenized documents so the model can be
# built from them directly.
# NOTE: allow_pickle=True unpickles the stored objects, and unpickling can run
# arbitrary code, so only load a docs.npy file that this project produced.
docs = np.load(file="../data/docs.npy", allow_pickle=True)

time: 23.9 s


## HDP

In [6]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

time: 3.69 s




In [7]:
# Build the token <-> integer-id mapping from the tokenized documents.
dictionary = Dictionary(documents=docs)

# Prune the vocabulary: drop tokens that appear in fewer than 20 documents
# or in more than 50% of all documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

time: 59.2 s


In [8]:
# Convert every document to its bag-of-words form using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, docs))

time: 32.8 s


In [9]:
# Report the vocabulary size after pruning and the number of documents.
n_unique_tokens = len(dictionary)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 58177
Number of documents: 309120
time: 0 ns


In [11]:
from gensim.models import HdpModel

# HDP training starts from a random initialisation, so without a fixed seed
# every run yields different topics and a different coherence score. Pinning
# random_state makes this long-running fit reproducible on a fresh kernel.
HDP_RANDOM_STATE = 42

# Fit a Hierarchical Dirichlet Process topic model on the bag-of-words corpus;
# id2word lets the model report topics as words rather than integer ids.
model = HdpModel(corpus=corpus, id2word=dictionary, random_state=HDP_RANDOM_STATE)

time: 34min 40s


In [14]:
# Fetch the model's formatted topic summaries and print one per line.
# In the recorded output each entry is a (topic_id, 'weight*word + ...') tuple.
hdp_topics = model.print_topics()
for topic in hdp_topics: 
    print(topic)

(0, '0.015*covid + 0.009*health + 0.008*pandemic + 0.004*care + 0.004*data + 0.004*patients + 0.004*social + 0.003*disease + 0.003*risk + 0.003*public')
(1, '0.024*patients + 0.011*covid + 0.005*clinical + 0.004*risk + 0.004*treatment + 0.004*disease + 0.004*methods + 0.004*mortality + 0.004*associated + 0.003*hospital')
(2, '0.014*cov + 0.013*sars + 0.012*sars_cov + 0.008*covid + 0.007*virus + 0.006*viral + 0.005*coronavirus + 0.005*protein + 0.005*disease + 0.005*infection')
(3, '0.011*cells + 0.007*cell + 0.007*infection + 0.006*covid + 0.005*virus + 0.005*patients + 0.005*expression + 0.004*immune + 0.004*disease + 0.004*viral')
(4, '0.008*sars + 0.008*cov + 0.007*sars_cov + 0.007*virus + 0.006*samples + 0.006*patients + 0.006*respiratory + 0.005*pcr + 0.005*covid + 0.005*infection')
(5, '0.027*covid + 0.023*patients + 0.010*disease + 0.010*sars + 0.010*cov + 0.009*sars_cov + 0.007*infection + 0.007*coronavirus + 0.007*severe + 0.006*respiratory')
(6, '0.006*model + 0.005*covid + 0

In [41]:
# List the top 10 words of each topic without their weights.
# With formatted=False every topic is a sequence whose items start with the word.
unformatted_topics = model.show_topics(num_words=10, formatted=False)
for idx, topic in unformatted_topics:
    top_words = [entry[0] for entry in topic]
    print('Topic: {} \nWords: {}'.format(idx, top_words))

Topic: 0 
Words: ['covid', 'health', 'pandemic', 'care', 'data', 'patients', 'social', 'disease', 'risk', 'public']
Topic: 1 
Words: ['patients', 'covid', 'clinical', 'risk', 'treatment', 'disease', 'methods', 'mortality', 'associated', 'hospital']
Topic: 2 
Words: ['cov', 'sars', 'sars_cov', 'covid', 'virus', 'viral', 'coronavirus', 'protein', 'disease', 'infection']
Topic: 3 
Words: ['cells', 'cell', 'infection', 'covid', 'virus', 'patients', 'expression', 'immune', 'disease', 'viral']
Topic: 4 
Words: ['sars', 'cov', 'sars_cov', 'virus', 'samples', 'patients', 'respiratory', 'pcr', 'covid', 'infection']
Topic: 5 
Words: ['covid', 'patients', 'disease', 'sars', 'cov', 'sars_cov', 'infection', 'coronavirus', 'severe', 'respiratory']
Topic: 6 
Words: ['model', 'covid', 'data', 'time', 'different', 'models', 'analysis', 'method', 'high', 'new']
Topic: 7 
Words: ['covid', 'pandemic', 'health', 'influenza', 'coronavirus', 'disease', 'cells', 'virus', 'cell', 'patients']
Topic: 8 
Words: [

In [15]:
# Evaluate the fitted topics with the 'c_v' coherence measure, computed from
# the tokenized documents and the same dictionary used to build the corpus.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=docs,
    model=model,
)
coherence = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence)


Coherence Score:  0.39324078620342945
time: 11min 26s
