https://github.com/CoronaWhy
# Data Processing
## Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import glob 
import json 

import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure 
import seaborn as sns 

import sys 
if not sys.warnoptions: 
    import warnings 
    warnings.simplefilter("ignore")

plt.style.use('ggplot')

In [2]:
"""
# trying to use the GPU from tensorflow
import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

sess = tf.compat.v1.Session(config=config)"""

'\n# trying to use the GPU from tensorflow\nimport tensorflow as tf\n\nconfig = tf.compat.v1.ConfigProto()\nconfig.gpu_options.allow_growth = True\n\nsess = tf.compat.v1.Session(config=config)'

## Metadata 

In [3]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

path = '../CORD-19-research-challenge/'

In [4]:
metadata_df = pd.read_csv(f'{path}metadata.csv',
                          parse_dates=['publish_time'],
                          dtype={'pubmed_id':str, 'Microsoft Academic Paper ID':str},
                          low_memory=False)

metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47298 entries, 0 to 47297
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   cord_uid                     47298 non-null  object        
 1   sha                          34283 non-null  object        
 2   source_x                     47298 non-null  object        
 3   title                        47140 non-null  object        
 4   doi                          43956 non-null  object        
 5   pmcid                        28038 non-null  object        
 6   pubmed_id                    35409 non-null  object        
 7   license                      47298 non-null  object        
 8   abstract                     39048 non-null  object        
 9   publish_time                 47289 non-null  datetime64[ns]
 10  authors                      45189 non-null  object        
 11  journal                      42894 non-nu

In [5]:
metadata_df = metadata_df[['sha', 'title', 'authors', 'journal', 'publish_time', 'abstract']]
metadata_df = metadata_df[metadata_df.sha.isna()==False]
metadata_df.rename(columns={'sha':'paper_id'}, inplace=True)


In [6]:
authors = []
for author in metadata_df.authors.astype(str):
    if len(author.split('; ')) > 2:
        authors.append(author.split('; ')[0]+' et al.')
    else:
        authors.append(author)

metadata_df['authors'] = authors
metadata_df['abstract'] = [str(a).replace('Abstract ','') for a in metadata_df.abstract]

metadata_df.head()

Unnamed: 0,paper_id,title,authors,journal,publish_time,abstract
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus,"Brunner, Carolyn M. et al.",The American Journal of Medicine,1973-08-31,"Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were eva..."
5,212e990b378e8d267042753d5f9d4a64ea5e9869,Infectious diarrhea: Pathogenesis and risk factors,"Cantey, J.Robert",The American Journal of Medicine,1985-06-28,"Our understanding of the pathogenesis of infectious, especially bacterial, diarrhea has increase..."
6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,New perspectives on the pathogenesis of rheumatoid arthritis,"Zvaifler, Nathan J.",The American Journal of Medicine,1988-10-14,"In the pathogenesis of rheumatoid arthritis, locally produced antibodies complex with an incitin..."
7,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Management of acute and chronic respiratory tract infections,"Ellner, Jerrold J.",The American Journal of Medicine,1988-09-16,"Pharyngitis, bronchitis, and pneumonia represent the most common respiratory tract infections. W..."
8,a55cb4e724091ced46b5e55b982a14525eea1c7e,Acute bronchitis: Results of U.S. and European trials of antibiotic therapy,"Dere, Willard H.",The American Journal of Medicine,1992-06-22,"Acute bronchitis, an illness frequently encountered by primary-care physicians, is an inflammati..."


## Extract the body text of the papers from JSON files

In [7]:
# JSON path
pdf_json = glob.glob(path+'**/pdf_json/*.json', recursive=True)
pmc_json = glob.glob(path+'**/pmc_json/*.json', recursive=True)

print('Lenght of pdf_json is', len(pdf_json))
print('Lenght of pmc_json is', len(pmc_json))

Lenght of pdf_json is 36236
Lenght of pmc_json is 15861


In [8]:
with open(pdf_json[0], mode='r') as file:
     contents = json.loads(file.read())
        
contents.keys()

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [18]:
%%time
papers_df = pd.DataFrame(columns=['paper_id','body_text'])

for j in range(len(pdf_json)):
    if j%(len(pdf_json)//10)==0:
        print(f'Processing index: {j} of {len(pdf_json)} ==> {round(100*j/len(pdf_json))}%')  
        
    with open(pdf_json[j], mode='r') as file:
        contents = json.loads(file.read())
    papers_df.loc[j,'paper_id'] = contents['paper_id']

    texts = []
    for text in contents['body_text']:
        texts.append(text['text'])
    body_text = '\n '.join(texts)
    papers_df.loc[j,'body_text'] = body_text


Processing index: 0 of 36236 ==> 0%
Processing index: 3623 of 36236 ==> 10%
Processing index: 7246 of 36236 ==> 20%
Processing index: 10869 of 36236 ==> 30%
Processing index: 14492 of 36236 ==> 40%
Processing index: 18115 of 36236 ==> 50%
Processing index: 21738 of 36236 ==> 60%
Processing index: 25361 of 36236 ==> 70%
Processing index: 28984 of 36236 ==> 80%
Processing index: 32607 of 36236 ==> 90%
Processing index: 36230 of 36236 ==> 100%
CPU times: user 3min 11s, sys: 17.4 s, total: 3min 28s
Wall time: 3min 53s


## Merging the metadata and the body text of the papers

In [19]:
%%time
cord19_df = pd.merge(metadata_df, papers_df, on=['paper_id'], how='inner')
cord19_df.drop_duplicates(['paper_id', 'body_text'], inplace=True)

cord19_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32417 entries, 0 to 32424
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   paper_id      32417 non-null  object        
 1   title         32381 non-null  object        
 2   authors       32417 non-null  object        
 3   journal       31003 non-null  object        
 4   publish_time  32417 non-null  datetime64[ns]
 5   abstract      32417 non-null  object        
 6   body_text     32417 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 2.0+ MB
CPU times: user 4.52 s, sys: 5.21 s, total: 9.73 s
Wall time: 13.9 s


In [20]:
cord19_df['body_text'].describe(include='all')

count                                                                                                   32417
unique                                                                                                  32401
top       In previous reports, workers have characterized the presentation of Middle East Respiratory Synd...
freq                                                                                                        4
Name: body_text, dtype: object

In [21]:
cord19_df.head()

Unnamed: 0,paper_id,title,authors,journal,publish_time,abstract,body_text
0,aecbc613ebdab36753235197ffb4f35734b5ca63,Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus,"Brunner, Carolyn M. et al.",The American Journal of Medicine,1973-08-31,"Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were eva...","The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of sout..."
1,212e990b378e8d267042753d5f9d4a64ea5e9869,Infectious diarrhea: Pathogenesis and risk factors,"Cantey, J.Robert",The American Journal of Medicine,1985-06-28,"Our understanding of the pathogenesis of infectious, especially bacterial, diarrhea has increase...","Pathogenesis and Risk Factors J. ROBERT CANTEY, M.D. Charleston, South Carolina Our understandin..."
2,bf5d344243153d58be692ceb26f52c08e2bd2d2f,New perspectives on the pathogenesis of rheumatoid arthritis,"Zvaifler, Nathan J.",The American Journal of Medicine,1988-10-14,"In the pathogenesis of rheumatoid arthritis, locally produced antibodies complex with an incitin...","In the pathogenesis of rheumatoid arthritis, locally produced antibodies complex with an incitin..."
3,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Management of acute and chronic respiratory tract infections,"Ellner, Jerrold J.",The American Journal of Medicine,1988-09-16,"Pharyngitis, bronchitis, and pneumonia represent the most common respiratory tract infections. W...","Respiratory Tract Infections JERROLD J. ELLNER, M.D. Cleveland, CM Pharyngitis, bronchitis, and ..."
4,a55cb4e724091ced46b5e55b982a14525eea1c7e,Acute bronchitis: Results of U.S. and European trials of antibiotic therapy,"Dere, Willard H.",The American Journal of Medicine,1992-06-22,"Acute bronchitis, an illness frequently encountered by primary-care physicians, is an inflammati...","A cute bronchitis, an illness frequently encountered by primary-care physicians [1] , is an infl..."


In [22]:
cord19_df.isnull().sum()

paper_id           0
title             36
authors            0
journal         1414
publish_time       0
abstract           0
body_text          0
dtype: int64

## Language detection 

In [13]:
import scispacy
import spacy
import en_core_sci_sm
from spacy_langdetect import LanguageDetector

nlp = spacy.load("en_core_web_sm")
nlp = en_core_sci_sm.load()
nlp.max_length = 1_000_000
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [14]:
%%time
cord19_df['paper_language'] = cord19_df.abstract.apply(lambda x: nlp(str(x[:1000]))._.language['language'])

cord19_df.paper_language.value_counts()

CPU times: user 14min 27s, sys: 1min 25s, total: 15min 52s
Wall time: 15min 58s


en         28030
tl          4032
fr           164
es           157
it            12
de             8
nl             6
pt             3
ro             1
et             1
ca             1
cy             1
UNKNOWN        1
Name: paper_language, dtype: int64

In [15]:
cord19_df = cord19_df[cord19_df.paper_language=='en'].drop(['paper_language'], axis=1)
cord19_df.isnull().sum()

paper_id           0
title             34
authors            0
journal         1410
publish_time       0
abstract           0
body_text          0
dtype: int64

In [16]:
print(f'The CORD19 dataset contains {cord19_df.shape[0]} papers written in English')

The CORD19 dataset contains 28030 papers written in English


## Save to CSV format

In [17]:
cord19_df.to_csv('../data_processed/cord19_processed.csv', index=False)