In [76]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import collections
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, WordNetLemmatizer
import time
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
import pickle
import json

Two-level tf-idf literature retrieval.

# 1. Helper functions

In [2]:
def textNormalize(rawString):
    """
    Normalize a piece of text before tf-idf vectorization.

    The following steps are applied, in this order:
    1. removing web links
    2. converting all letters to lower case
    3. removing HTML-like tags and punctuation
    4. removing numbers
    5. tokenization
    6. removing English stop words
    7. stemming (Snowball stemmer)
    8. lemmatization of verbs (WordNet lemmatizer)

    Input:
        rawString: a string containing text to be normalized. Any value that
            is not a string (e.g. NaN or None standing for a missing abstract)
            is returned unchanged.
    Output:
        normText: a string containing the normalized text where the tokens
            extracted from rawString are joined by space.
    """
    # NaN never compares equal to anything (not even to itself), so an
    # equality test against np.nan cannot detect missing values. Checking the
    # type instead lets every kind of missing value pass through untouched.
    if not isinstance(rawString, str):
        return rawString

    ## Remove web links (raw string: "\S" and "\." are regex escapes, not
    ## Python string escapes)
    rawString = re.sub(r'https?://\S+|www\.\S+', '', rawString)

    ## Lowercase
    rawString = rawString.lower()

    ## Remove HTML-like tags and punctuation (replaced by a space so that the
    ## neighbouring words stay separated)
    rawString = re.sub('<.*?>+', ' ', rawString)
    rawString = re.sub('[%s]' % re.escape(string.punctuation), ' ', rawString)

    ## Remove numbers
    rawString = re.sub(r'\d+', '', rawString)

    ## Tokenize
    words = word_tokenize(rawString)

    ## Remove stop words (a set gives constant-time membership tests)
    nltk_stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in nltk_stop_words]

    ## Stem
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]

    ## Lemmatize verbs
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    normText = " ".join(words)

    return normText

# 2. Preprocess metadata

The preprocessing step only needs to be run once. The new metadata table, with the normalized abstracts in the column "clean_abstract", will be saved to all_sources_metadata_2020-03-13_clean.csv.

## 2.1. Read-in metadata

In [26]:
# Location of the raw CORD-19 metadata table. When running on Kaggle use
#   "/kaggle/input/CORD-19-research-challenge/2020-03-13/all_sources_metadata_2020-03-13.csv"
# instead.
metaDataPath = "~/Downloads/2020-03-13/all_sources_metadata_2020-03-13.csv"

# The first line of the file holds the column names.
metaData = pd.read_csv(metaDataPath, header=0)

# One row per publication.
numLiteratures = len(metaData.index)
print("The number of literatures: " + str(numLiteratures))

metaData.head()

The number of literatures: 29500


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765000.0,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003431000.0,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643000.0,#3242,False


## 2.2. Normalize abstracts

In [4]:
startTime = time.time()

# Normalize every abstract. na_action="ignore" leaves missing abstracts as NaN
# without calling textNormalize on them, so the new column is created in one
# step instead of first filling a float column with NaN and then writing
# strings into it (which relies on implicit dtype upcasting).
metaData["clean_abstract"] = metaData["abstract"].map(textNormalize, na_action="ignore")

# Elapsed wall-clock time, reported in minutes.
print("Time spent: " + str(round((time.time() - startTime) / 60, 3)) + "min.")

Time spent: 2.231min.


## 2.3. Output the new metadata table

In [5]:
# index=False: the row index is only a running number; writing it would add a
# spurious "Unnamed: 0" column every time the file is read back.
metaData.to_csv("~/Downloads/2020-03-13/all_sources_metadata_2020-03-13_clean.csv", index=False)

# 3. tf-idf vectorizer

This step only needs to be run once. A tf-idf vectorizer will be trained on the normalized abstracts and saved to tf-idf_vectorizer.pkl.

## 3.1. Read-in metadata with normalized abstracts

In [27]:
# Load the table produced by the preprocessing step; it carries the extra
# "clean_abstract" column.
metaDataPath = "~/Downloads/2020-03-13/all_sources_metadata_2020-03-13_clean.csv"
metaData = pd.read_csv(filepath_or_buffer=metaDataPath, header=0)

# Preview of the first five rows.
metaData.head(5)

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,clean_abstract
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765000.0,#3252,True,
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003431000.0,#1861,True,
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True,geograph spread novel coronavirus covid infect...
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True,decemb case unidentifi pneumonia histori expos...
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643000.0,#3242,False,


## 3.2. Learn vocabulary and idf from training set

In [3]:
startTime = time.time()

# Only publications that have a normalized abstract take part in training.
hasCleanAbstract = metaData["clean_abstract"].notnull()
trainAbstracts = metaData.loc[hasCleanAbstract, "clean_abstract"]

# The normalized abstracts are space-joined tokens; NLTK's word_tokenize is
# used to split them again.
vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
vectorizer.fit(trainAbstracts)

elapsedSeconds = time.time() - startTime
print('Time spent: ' + str(elapsedSeconds) + 's.')

Time spent: 20.281893014907837s.


## 3.3. Save the learned vectorizer

In [4]:
# Persist the fitted vectorizer so that it can be loaded later without
# being trained again.
vectPath = "../../../Downloads/2020-03-13/tf-idf_vectorizer.pkl"
with open(vectPath, mode="wb") as pickleOut:
    pickle.dump(vectorizer, pickleOut)

# 4. Retrieve literature for each task

## 4.1. Read-in metadata with normalized abstracts

In [28]:
# Metadata table including the "clean_abstract" column.
metaDataPath = "~/Downloads/2020-03-13/all_sources_metadata_2020-03-13_clean.csv"

# Column names are taken from the first line of the file.
metaData = pd.read_csv(metaDataPath, header=0)

metaData.head(n=5)

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,clean_abstract
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765000.0,#3252,True,
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003431000.0,#1861,True,
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True,geograph spread novel coronavirus covid infect...
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True,decemb case unidentifi pneumonia histori expos...
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643000.0,#3242,False,


## 4.2. Load the tf-idf vectorizer

In [29]:
vectPath = "../../../Downloads/2020-03-13/tf-idf_vectorizer.pkl"

# NOTE: unpickling can execute arbitrary code; only load a file that was
# written by a trusted source.
with open(vectPath, mode='rb') as pickleIn:
    vectorizer = pickle.load(pickleIn)

## 4.3. Transform document to document-term matrix

In [30]:
# Rows of docTermMatrix follow the order of the publications that have a
# non-null "clean_abstract"; any later lookup by row position must use the
# same selection.
hasCleanAbstract = metaData["clean_abstract"].notnull()
docTermSparse = vectorizer.transform(metaData.loc[hasCleanAbstract, "clean_abstract"])

# Convert the transform result to a dense array.
docTermMatrix = docTermSparse.toarray()

## 4.4. Task I: What is known about transmission, incubation, and environmental stability?

In [51]:
# Free-text queries for Task I; every entry of the list is treated as a
# separate query by the cells below.
queries = ["transmission incubation and environmental stability"]

In [52]:
# Queries go through the same normalization as the abstracts so that both
# share one vocabulary.
normQueries = list(map(textNormalize, queries))

# One tf-idf row per query, converted to a dense array.
queryTermSparse = vectorizer.transform(normQueries)
queryTermMatrix = queryTermSparse.toarray()

In [53]:
# cdist(..., metric='cosine') yields the cosine *distance*; subtracting it
# from 1 turns it into the cosine similarity. Despite its name, queryDocDist
# therefore holds similarities (larger = closer match), with one row per query
# and one column per row of docTermMatrix.
queryDocDist = 1 - cdist(queryTermMatrix, docTermMatrix, metric = 'cosine')

In [60]:
# Number of best-matching publications reported for each query.
topK = 4

# docTermMatrix was built from the rows with a non-null "clean_abstract", so
# column idx of queryDocDist refers to row idx of exactly that selection.
# (Selecting on "abstract" instead can shift the rows whenever an abstract
# normalizes to an empty string.) The selection is computed once, outside the
# loops.
indexedDocs = metaData[metaData["clean_abstract"].notnull()]

for query, similarities in zip(queries, queryDocDist):
    print("Query: ", query)
    print("")
    # argsort sorts ascending: keep the last topK positions and reverse them
    # so that the best match is printed first.
    for idx in similarities.argsort()[-topK:][::-1]:
        print("cosine similarity: ", similarities[idx])
        literatureMeta = indexedDocs.iloc[idx]
        print("Sha: ", literatureMeta["sha"])
        print("Title: ", literatureMeta["title"])
        print("Abstract: ", literatureMeta["abstract"])
        print("Has full text: ", literatureMeta["has_full_text"])
        print(literatureMeta["source_x"])
        print("")

Query:  transmission incubation and environmental stability

cosine similarity:  0.33123953678616014
Sha:  nan
Title:  A Chinese Case of COVID-19 Did Not Show Infectivity During the Incubation Period: Based on an Epidemiological Survey
Abstract:  Controversy remains over whether the novel coronavirus 2019 (COVID-19) virus may have infectivity during the incubation period before the onset of symptoms. The author had the opportunity to examine the infectivity of COVID-19 during the incubation period by conducting an epidemiological survey on a confirmed patient who had visited Jeju Island during the incubation period. The epidemiological findings support the claim that the COVID-19 virus does not have infectivity during the incubation period.
Has full text:  nan
CZI

cosine similarity:  0.3153872041304393
Sha:  nan
Title:  Replication-Competent Influenza Virus and Respiratory Syncytial Virus Luciferase Reporter Strains Engineered for Co-Infections Identify Antiviral Compounds in Combinat

In [72]:
# Folder of the 2020-03-13 data download.
# NOTE(review): presumably meant as the search root for the full-text JSON
# lookup below, but that lookup passes `path3` -- confirm which one is intended.
path = "../../../Downloads/2020-03-13"

In [73]:
# Pick one publication (position 1094 among the rows that have an abstract)
# whose full text is inspected below.
hasAbstract = metaData["abstract"].notnull()
testSha = metaData.loc[hasAbstract, "sha"].iloc[1094]

In [74]:
def find(name, path):
    """
    Search a directory tree for a file with a given name.
    Input:
        name: the file name (without directory) to look for.
        path: the root directory where the search starts.
    Output:
        The joined path of the first match in os.walk order, or None when no
        file called name exists anywhere below path.
    """
    matches = (
        os.path.join(folder, name)
        for folder, _subfolders, fileNames in os.walk(path)
        if name in fileNames
    )
    return next(matches, None)

In [77]:
# Search below `path`, the data folder defined in an earlier cell (`path3` is
# not defined in any visible cell and fails on a fresh kernel).
jsonFile = find(testSha + ".json", path)

# find returns None when nothing matches; fail here with a clear message
# instead of letting open() choke on None in the next cell.
if jsonFile is None:
    raise FileNotFoundError("No full-text JSON found for sha " + str(testSha) + " under " + path)

In [78]:
# Read the full-text JSON of the selected paper. The encoding is stated
# explicitly so that non-ASCII characters decode the same way on every
# platform instead of depending on the locale's default encoding.
with open(jsonFile, encoding="utf-8") as f:
    data = json.load(f)

In [87]:
# One line per body-text entry: the name of the section it belongs to.
for paragraph in data['body_text']:
    print(paragraph['section'])


Background
Background
Background
Analysis
Analysis
Analysis
The earliest model developed using incomplete data
The earliest model developed using incomplete data
The earliest model developed using incomplete data
The earliest model developed using incomplete data
5
5
5
12
14-23
24,25
26,27
Classic right-skewed distribution
Classic right-skewed distribution
Classic right-skewed distribution
Classic right-skewed distribution
Classic right-skewed distribution
Lognormal distribution proposed by Philip Sartwell
Lognormal distribution proposed by Philip Sartwell
Lognormal distribution proposed by Philip Sartwell
Lognormal distribution proposed by Philip Sartwell
Lognormal distribution proposed by Philip Sartwell
Lognormal models proposed by Japanese epidemiologists
Lognormal models proposed by Japanese epidemiologists
Lognormal models proposed by Japanese epidemiologists
Lognormal models proposed by Japanese epidemiologists
Lognormal models proposed by Japanese epidemiologists
Lognormal mod

In [88]:
# Show the last body-text entry of the paper (a dict with the keys
# 'cite_spans', 'ref_spans', 'section' and 'text').
data['body_text'][-1]

{'cite_spans': [],
 'ref_spans': [],
 'section': 'Conclusion',
 'text': 'The lessons that can be learnt from the presented discussion are as follows: (I) although it is historically remarkable that the incubation period of pandemic influenza was assessed based on an explicit understanding of an unknown time of exposure, the assumed periods of exposure were too long and equal probability of exposure was assumed for each possible date. Well-defined short periods of exposure are needed to decipher the incubation period distribution using appropriate statistical methods. Taking this point into account will be critically important in estimating the incubation period of newly emerging diseases in the future. (II) The epidemiologic usefulness of the lognormal assumption was highlighted with respect to the basic characteristics of lognormal distribution, but this assumption is likely to remain unwarrantable until details of disease mechanisms are fully clarified; thus, this assumption may be m

In [89]:
# Show the second-to-last body-text entry of the paper, including the
# citation spans found in its text.
data['body_text'][-2]

{'cite_spans': [{'end': 698,
   'ref_id': 'BIBREF104',
   'start': 693,
   'text': '[103,'},
  {'end': 703, 'ref_id': 'BIBREF105', 'start': 699, 'text': '104]'},
  {'end': 1021, 'ref_id': 'BIBREF47', 'start': 1017, 'text': '[48,'},
  {'end': 1026, 'ref_id': 'BIBREF106', 'start': 1022, 'text': '105]'}],
 'ref_spans': [],
 'section': 'Conclusion',
 'text': 'The present study revisited previous works concerned with models of the incubation period of acute infectious diseases. In particular, the following were highlighted: (i) the earliest modeling effort conducted using incomplete data of a pandemic influenza, (ii) the explicit distribution of the incubation period, (iii) the application of a lognormal assumption to estimations of the time of exposure during a point source outbreak, and (iv) the validity of assuming lognormal distribution for the incubation period. Although it was not highlighted in the present paper, Norman T. J. Bailey also formed a framework using a chain binomial mode