In [1]:
import os
import json
from pprint import pprint
import pandas as pd
import sklearn
import nltk

In [27]:
data_dir = 'Data/pdf_json/'
filenames = os.listdir(data_dir)

papers=[]
for filename in filenames:
    file = data_dir + filename
    paper = json.load(open(file, 'rb'))
    body_texts = [body_text['text'] for body_text in paper['body_text']]
    text_joined=''
    for text in body_texts:
        text_joined += text
    papers.append(text_joined)


In [28]:
clean_df = pd.DataFrame(papers)[0].str.lower()
clean_df.head()

0    vp3, and vp0 (which is further processed to vp...
1    in december 2019, a novel coronavirus, sars-co...
2    the 2019-ncov epidemic has spread across china...
3    metagenomic sequencing, which allows us to dir...
4    infectious bronchitis (ib), which is caused by...
Name: 0, dtype: object

### Latent Dirichlet Allocation (LDA)

In [29]:
stopwords=set(nltk.corpus.stopwords.words('english'))

In [32]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words=stopwords)
data_vectorized = vectorizer.fit_transform(clean_df)

In [33]:
word_frequency = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1], reverse=True)
print(word_frequency[:10])

[('累计确诊人数', 73754), ('百万', 73753), ('疫情信息', 73752), ('温州银泰百货商场', 73751), ('武汉肺', 73750), ('武汉爆发', 73749), ('春运', 73748), ('新型冠状病毒', 73747), ('振华量贩聊城闸口店', 73746), ('感冒', 73745)]


In [36]:
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
lda = LatentDirichletAllocation()
lda.fit_transform(data_vectorized)

array([[0.00772267, 0.00772663, 0.00772267, ..., 0.00772267, 0.00772267,
        0.00772293],
       [0.00451679, 0.00451672, 0.24398546, ..., 0.00451672, 0.00451672,
        0.00451672],
       [0.00773239, 0.00773239, 0.00773239, ..., 0.00773239, 0.00773239,
        0.00773239],
       ...,
       [0.00878406, 0.00878406, 0.00878406, ..., 0.04235212, 0.00878406,
        0.00878406],
       [0.0070009 , 0.0070009 , 0.0070009 , ..., 0.0070009 , 0.0070009 ,
        0.0070009 ],
       [0.00737976, 0.00737976, 0.04025019, ..., 0.00737976, 0.00737976,
        0.00737976]])

In [44]:
len(lda.components_)

10

In [47]:
# taken from https://www.kaggle.com/danielwolffram/topic-modeling-finding-related-articles#Latend-Dirichlet-Allocation
for topic_idx, topic in enumerate(lda.components_):
    feature_names = vectorizer.get_feature_names()
    message = "\nTopic #%d: " % topic_idx
    message += " ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]])
    print(message)


Topic #0: appendicitis 349001 wikidata vhh hd5 idsr crrnas sftsv exo cces

Topic #1: bcg g3bp1 bees atv reovirus lncrnas aceis avan lpv hdl

Topic #2: lecb vp35 uccs bleomycin adm1 fmw nelfinavir mtase muc5ac g64s

Topic #3: mirnas tlr5 hyposmia top3b meplazumab dcv cpac parhyale msti pscnv

Topic #4: preprint medrxiv doi covid license patients cases 19 2020 10

Topic #5: cr3022 pr8 mad1 parp2 cml mad2 phi6 drv muc4 nsp1a

Topic #6: dux4 entrying trim25 mxb valinomycin phosphorus parkin c00422 vadr vt

Topic #7: osd nab virosal ccfr ami nabs symptoma denv4 isd sensr

Topic #8: curvature hsa agi prf eps8 enc dpcr klk13 egcg mir

Topic #9: sox cd147 vl masp cosmetic gn lns sre 20021493 trisilix


In [48]:
#TODO: remove non-english words to make sense of output