In [1]:
import pandas as pd
import numpy as np
import json
import os
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline



In [2]:
!ls

CORD-19-Topic-Modeling.ipynb lda_50_model.p
[1m[36mCORD_DATA[m[m                    lda_doctopic_vec.p
README.md                    ldavisualization_50
Translating Text.ipynb       ldavisualization_50.html


### Metadata

In [3]:
#read in metadata
root_dir = './CORD_DATA/'
metadata_path = root_dir + 'metadata.csv'
metadata = pd.read_csv(metadata_path)

metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


### Read Data Into Dataframes

In [17]:
'''
    @Desc    : Reads in json article and converts into Pandas Dataframe
    @Params  : filepath (str)
    @Returns : Pandas Dataframe 
'''
def JsonToDataFrame(filepath):
        
    #read json into dict
    with open(filepath, 'rb') as json_data:
        data = json.load(json_data)
        
        paper_id = data['paper_id']
        abstract = '\n'.join([section['text'] for section in data['abstract']])

        

        final_data = {
            'paper_id'  : [data['paper_id']],
            'section'   : ['abstract'],
            'text'  : ['\n'.join([section['text'] for section in data['abstract']])]                                       
        }
        
        df = pd.DataFrame.from_dict(final_data)
        for section in data['body_text']:
            df = df.append({
                'paper_id' : data['paper_id'],
                'section'  : section['section'],
                'text'     : section['text']
            }, ignore_index=True)
            
        return df 
    
    
        

In [13]:
#DATA DIRECTORIES
        
biorxiv_medrxiv    = root_dir + 'biorxiv_medrxiv/biorxiv_medrxiv/'
comm_use_subset    = root_dir + 'comm_use_subset/comm_use_subset/'
noncomm_use_subset = root_dir + 'noncomm_use_subset/noncomm_use_subset/'

biorxiv_medrxiv_files = []
comm_use_subset_files = []
noncomm_use_subset_files = []

for subdir, dirs, files in os.walk(biorxiv_medrxiv):
    for file in files:
        biorxiv_medrxiv_files.append(os.path.join(subdir, file))
        
for subdir, dirs, files in os.walk(comm_use_subset):
    for file in files:
        comm_use_subset_files.append(os.path.join(subdir, file))
        
for subdir, dirs, files in os.walk(noncomm_use_subset):
    for file in files:
        noncomm_use_subset_files.append(os.path.join(subdir, file))
    
pprint(biorxiv_medrxiv_files[:5])

['./CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/f905f78b32f63c6d14a79984dfb33f1b358b8ab4.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/607e0074d8ff40c272b958c2fe48793fedfc785e.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/7cfaa2540d3c8eea0982b3b1147884f125f67ff2.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/abcfffafab399149d4adadd6bb458c4994e2025d.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/0cb9c296684ca5e71462d825cab2827854a01544.json']


In [18]:
#initialize dfs
biomed_df      = pd.DataFrame()
comm_use_df    = pd.DataFrame()
noncomm_use_df = pd.DataFrame()

#read biomed data
for f in biorxiv_medrxiv_files:
    df = JsonToDataFrame(f)
    biomed_df = biomed_df.append(df, ignore_index=True)

#read commonly used data
for f in comm_use_subset_files:
    df = JsonToDataFrame(f)
    comm_use_df = comm_use_df.append(df, ignore_index=True)

#read non-commonly used data
for f in noncomm_use_subset_files:
    df = JsonToDataFrame(f)
    noncomm_use_df = noncomm_use_df.append(df, ignore_index=True)


full_corpus = pd.concat([biomed_df, comm_use_df, noncomm_use_df])

UnicodeDecodeError: 'utf-32-be' codec can't decode bytes in position 4-7: code point not in range(0x110000)

In [7]:
full_corpus.head()

Unnamed: 0,paper_id,section,text
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,abstract,New anti-AIDS treatments must be continually d...
1,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,"In the absence of a curative treatment, the hi..."
2,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,Tetramers of IN are formed by the reciprocal s...
3,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,Hindering the assembly of IN functional multim...
4,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,Based on the structure of Fab specific to IN C...


In [8]:
full_corpus['section'].value_counts()

Discussion                                             37748
Introduction                                           21471
                                                       13512
abstract                                               12718
Results                                                 9991
                                                       ...  
Rapid MERS-CoV nucleocapsid protein detection assay        1
Virus detection and pathological examination               1
| Phenotype recording                                      1
TOLL-LIKE RECEPTOR 4 ANTAGONISTS                           1
Confirmation of Interactions by Cotransformation           1
Name: section, Length: 88025, dtype: int64

### Text Preprocessing

In [9]:
import re
import string

punct_table = str.maketrans('', '', string.punctuation)

#remove punctuation
full_corpus['text'] = full_corpus['text'].map(lambda x: x.translate(punct_table))

#convert to lowercase
full_corpus['text'] = full_corpus['text'].map(lambda x: x.lower())

full_corpus.head()

Unnamed: 0,paper_id,section,text
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,abstract,new antiaids treatments must be continually de...
1,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,in the absence of a curative treatment the hig...
2,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,tetramers of in are formed by the reciprocal s...
3,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,hindering the assembly of in functional multim...
4,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,,based on the structure of fab specific to in c...


In [10]:
full_corpus['text']

0        new antiaids treatments must be continually de...
1        in the absence of a curative treatment the hig...
2        tetramers of in are formed by the reciprocal s...
3        hindering the assembly of in functional multim...
4        based on the structure of fab specific to in c...
                               ...                        
76763    it is easy to pontificate and write pages of p...
76764    only then can we see the shortcomings of human...
76765    the numerous missteps misspeaks and misdeeds o...
76766            financial disclosure none funding support
76767    none conflicts of interest no authors have fin...
Name: text, Length: 419853, dtype: object

In [11]:
full_corpus.describe()

Unnamed: 0,paper_id,section,text
count,419853,419853,419853.0
unique,12718,88025,402718.0
top,ce6717ad3bb0da86077a5cbb8111576ea8230b2c,Discussion,
freq,2022,37748,1812.0


In [12]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

stopwords = list(stopwords.words('english')) + ['et', 'al'] + [str(year) for year in range(1999,2020)]
#transfrom data into Bag-Of-Words Representation
count_vectorizer = CountVectorizer(stop_words=stopwords)

bag_of_words = count_vectorizer.fit_transform(full_corpus['text'])

In [13]:
print("Unique words: {} ".format(len(count_vectorizer.get_feature_names())))

Unique words: 625692 


### Latent Dirichlet Allocation (LDA)

In [14]:
%%time 

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.decomposition import LatentDirichletAllocation as LDA

#LDA params
num_topics = 50
random_state = 42

#initialize LDA model
lda = LDA(
    n_components=num_topics,
    random_state=random_state,
    n_jobs=-1
)

#train lda model
lda.fit(bag_of_words)

CPU times: user 2min 3s, sys: 2min 1s, total: 4min 5s
Wall time: 20min 15s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=50, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [15]:
doc_topic_vec = bag_of_words.toarray()
len(doc_topic_vec)

419853

In [None]:
import pickle

pickle.dump(lda, open('lda_50_model.p','wb') )
pickle.dump(doc_topic_vec, open('lda_doctopic_vec.p','wb'))



In [None]:
lda = pickle.load(open('lda_50_model.p','rb'))
doc_topic_vec = pickle.load(open('lda_doctopic_vec.p','rb'))

In [16]:
topic_distributions = lda.transform(bag_of_words)
len(topic_distributions)

419853

In [17]:
'''
    @Desc   :  Prints top n topics generated from LDA model
    @Params :  Model: LDA model, 
               count_vec : CountVectorizer model
               n: number of topics to print
'''
def print_topics(model, count_vec, n):
    words = count_vec.get_feature_names()
    for i, topic in enumerate(model.components_):
        print("\nTopic {}: \n {}".format(i, [words[k] for k in topic.argsort()[:-n -1:-1]]))

In [18]:
#print topics
print_topics(lda, count_vectorizer, 10)



Topic 0: 
 ['antibodies', 'antibody', 'igg', 'serum', 'elisa', 'sera', 'antigen', 'monoclonal', 'mab', 'mabs']

Topic 1: 
 ['liver', 'mice', 'lung', 'tissues', 'tissue', 'bone', 'mouse', 'usage', 'muscle', 'organs']

Topic 2: 
 ['di', 'la', 'il', 'per', 'del', 'un', 'le', 'con', 'della', 'che']

Topic 3: 
 ['detection', 'dna', 'target', 'probes', 'probe', 'nanoparticles', 'signal', 'method', 'used', 'fluorescence']

Topic 4: 
 ['pedv', 'cells', 'apoptosis', 'piglets', 'infection', 'tgev', 'porcine', 'protein', 'phosphorylation', 'cell']

Topic 5: 
 ['using', 'analysis', 'data', 'test', 'used', 'values', 'mean', 'performed', 'statistical', 'calculated']

Topic 6: 
 ['health', 'public', 'disease', 'control', 'diseases', 'countries', 'care', 'infectious', 'surveillance', 'global']

Topic 7: 
 ['strains', 'strain', 'virus', 'influenza', 'viruses', 'ibv', 'ha', 'isolates', 'avian', 'h5n1']

Topic 8: 
 ['development', 'use', 'studies', 'may', 'drug', 'treatment', 'clinical', 'used', 'potent

### Creating Topics from Task Details

In [19]:
task_docs_path = root_dir + 'task_topics/'
genetics_path = task_docs_path + 'genetics/'
vaccines_path = task_docs_path + 'vaccines/'

genetics_docs = []
for file in sorted(os.listdir(genetics_path)):
    with open(genetics_path + file, 'r') as f:
        genetics_docs.append(f.read())
        
vaccines_docs = []
for file in sorted(os.listdir(vaccines_path)):
    with open(vaccines_path + file, 'r') as f:
        vaccines_docs.append(f.read())
    
print("Genetics document 1: \n {}".format(genetics_docs[0])) 
print("Vaccines document 1: \n {}".format(vaccines_docs[0]))

Genetics document 1: 
 Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time

Vaccines document 1: 
 Clinical and bench trials to investigate less common viral inhibitors against COVID-19 such as naproxen, clarithromycin, and minocyclinethat that may exert effects on viral replication



#### Preprocess Task Details

In [20]:
punct_table = str.maketrans('', '', string.punctuation)

genetics_docs = [doc.translate(punct_table) for doc in genetics_docs]
vaccines_docs = [doc.translate(punct_table) for doc in vaccines_docs]

print("Genetics document 1: \n {}".format(genetics_docs[0])) 
print("Vaccines document 1: \n {}".format(vaccines_docs[0]))

Genetics document 1: 
 Realtime tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time

Vaccines document 1: 
 Clinical and bench trials to investigate less common viral inhibitors against COVID19 such as naproxen clarithromycin and minocyclinethat that may exert effects on viral replication



#### Generate Bag of Words Model for Task Details

In [21]:
#combine task details
task_details = genetics_docs + vaccines_docs

tasks_bow = count_vectorizer.transform(task_details)

print("Unique words: {} ".format(len(count_vectorizer.get_feature_names())))


Unique words: 625692 


In [22]:
tasks_bow.toarray().shape

(20, 625692)

#### Run Task Documents Through LDA

In [95]:
#train lda model
tasks_topics = lda.transform(tasks_bow)

print(len(tasks_topics))
print(tasks_topics)

20
[[1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  2.89690576e-01 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.07493544e-01 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 2.51948492e-01 1.11111111e-03 1.11111111e-03
  2.03312482e-01 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  1.11111111e-03 1.11111111e-03 1.11111111e-03 1.11111111e-03
  9.75549052e-02 1.11111111e-03]
 [8.69565217e-04 8.69565217e-04 8.69565217e-04 8.69565217e-04
  3.62923173e-01 8.69565217e-04 8.69565217e-04 8.69565217e-04
  8.69565217e-04 8.69565217e-04 8.69565217e-04 8.69565217e-04
  8.69565217e-04 8.69565217e-04 8.

### Compare Similarity of Corpus to Task Details using Bhattacharyya distance

In [None]:
from dictances import bhattacharyya

similarities = []


### Visualizing LDA Results

In [13]:
import pickle
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda

LDAvis_path = './ldavisualization_' + str(num_topics)

VISUALIZATION = True

if VISUALIZATION:
    LDAvisual = sklearn_lda.prepare(lda, bag_of_words, count_vectorizer)
    
    #write visualization to disk
    with open(LDAvis_path, 'wb') as f:
        pickle.dump(LDAvisual, f)

#load visualization from disk
try:
    with open(LDAvis_path, 'rb') as f:
        LDAvisual = pickle.load(f)

except:
    print("WARNING: No LDA Visualiation On Disk")
    

    
pyLDAvis.save_html(LDAvisual, './ldavisualization_' + str(num_topics) + '.html')



In [14]:
pyLDAvis.display(LDAvisual)