In [1]:
import numpy as np 
import pandas as pd 
import glob
import json

# Libraries for text preprocessing
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emarellano\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Indico la ruta donde se encuentra el set de datos descargados
root_path = 'C:\\Users\\emarellano\\Documents\\Ezequiel\\Cursos\\CORD-19-research-challenge\\2020-03-13'

#archivo con metadata
metadata_path = f'{root_path}/all_sources_metadata_2020-03-13.csv'

#leo el archivo con metadata
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
#meta_df.head()

In [3]:
#armo un listado con los archivos json que tienen las noticias
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

13202

In [4]:
#creo un dataframe donde voy a guardar las columnas de interes:
#sha: id de la noticia
#title: titulo de la noticia
#body_text: cuerpo de la noticia
df = pd.DataFrame(columns = ['sha', 'title', 'abstract', 'body_text']) 

In [5]:
#Leo cada json y lo agrego al dataframe
for json_path in all_json:
   with open(json_path) as file:
            content = json.load(file)
            sha = content['paper_id']
            title = content['metadata']['title']
            abstract = []
            body_text = []
            # Abstract
            for entry in content['abstract']:
                abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                body_text.append(entry['text'])
            abstract = '\n'.join(abstract)
            body_text = '\n'.join(body_text)
            row = [(sha, title, abstract, body_text)]
            dfRow = pd.DataFrame(row, columns = ['sha', 'title', 'abstract', 'body_text'])
            df = pd.concat([dfRow, df], ignore_index=True, sort=False)

 

In [6]:
#muestro el contenido del dataframe
df

Unnamed: 0,sha,title,abstract,body_text
0,ffe133ed880d6c77ae340c5374817232e21f8315,Rational Design of Peptide Vaccines Against Mu...,Human papillomavirus (HPV) occurs in many type...,"Certain types of cancers such as liver cancer,..."
1,ffd3a93b927e221ded4cf76536ad31bef2c74b89,Fatal Respiratory Infections Associated with R...,During an outbreak of severe acute respiratory...,During an outbreak of severe acute respiratory...
2,ffb381668d93248759ca3855425e05722cb9f562,,,H uman coronaviruses (HCoVs) were first record...
3,ff7d49ac4008f60ef9c5a437e0d504dcefd1246f,,,results of studies conducted in other countrie...
4,ff365ebbc0fc55476886b0abd129e227c1f8a527,Article focus Hip,We report a systematic review and metaanalysis...,Despite the fact that total hip arthroplasty (...
...,...,...,...,...
13197,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...",Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...
13198,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p..."
13199,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by..."
13200,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across China...


In [7]:
## analisis de duplicados y nulos
# df.loc[0:0,'abstract']
# meta_df_not_null = meta_df[meta_df.sha.notnull()] 
# meta_df['sha']=="d13a685f861b0f1ba05afa6e005311ad1820fd3a"
# duplicateRowsDF = meta_df_not_null[meta_df_not_null.duplicated(['sha'])]
# duplicateRowsDF
# result = pd.merge(df,
#                 meta_df[['source_x']],
#                 on='sha', 
#                 how='left')


In [8]:
#cantidad de palabras en abstract
df['abstract_word_count'] = df['abstract'].apply(lambda x: len(str(x).split(" ")))

In [9]:
#cantidad de palabras en body_text
df['body_word_count'] = df['body_text'].apply(lambda x: len(str(x).split(" ")))

In [10]:
#estadisticas de abstract
df.abstract_word_count.describe()

count    13202.000000
mean       210.268596
std        196.859557
min          1.000000
25%        112.000000
50%        199.500000
75%        270.000000
max       4145.000000
Name: abstract_word_count, dtype: float64

In [11]:
#estadisticas de body_text
df.body_word_count.describe()

count     13202.000000
mean       4236.881836
std        4683.207623
min           1.000000
25%        2458.250000
50%        3691.500000
75%        5336.750000
max      239553.000000
Name: body_word_count, dtype: float64

In [12]:
#cargar stop_words generico. Se va a utilizar para quitar las palabras "comunes" de cada noticia
stop_words = set(stopwords.words("english"))

In [19]:
#agregar stop words propias
new_words = ["many", "type", "et", "al", "day", "hi", "ae", "like", "common", "dc", "cd", "na", "described", "medrxiv", "preprint", "copyright", "reviewed", "http", "doi", "author", "funder", "right", "reserved", "web", "survey", "disclosure", "permission", "granted", "license", "word", "count", "biorxiv", "display", "perpetuity", "holder",  "reuse", "allowed"]
stop_words = stop_words.union(new_words)

In [14]:
#generar arreglos con la información "limpia" de abstract y text body 
abstract = []
body = []
for i in range(0, 13202):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', df['abstract'][i])
    text2 = re.sub('[^a-zA-Z]', ' ', df['body_text'][i])
    #Convert to lowercase
    text = text.lower()
    text2 = text2.lower()
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    text2=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text2)
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    text2=re.sub("(\\d|\\W)+"," ",text2)
    ##Convert to list from string
    text = text.split()
    text2 = text2.split()
    ##Stemming
    ps=PorterStemmer()
    #Lemmatisation
    lem = WordNetLemmatizer()
    
    text = [lem.lemmatize(word) for word in text if not word in  
            stop_words] 
    text = " ".join(text)
    abstract.append(text)
    
    text2 = [lem.lemmatize(word) for word in text2 if not word in  
            stop_words] 
    text2 = " ".join(text2)
    body.append(text2)

In [20]:
#Se usa CountVectoriser para generar diccionarios de palabras/expresiones con mayor frecuencia. ngram_range indica que van a ser expresiones de 1, 2 y 3 palabras
cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))

X=cv.fit_transform(abstract)
X2=cv.fit_transform(body)

In [21]:
#Function for sorting tf_idf in descending order 
#https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

df["abstract_kw"] = ""
df["body_kw"] = ""


tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)

tfidf_transformer2=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer2.fit(X2)

# get feature names
feature_names=cv.get_feature_names()



In [22]:
for i in range(0, 13202):
   # fetch document for which keywords needs to be extracted
   doc=abstract[i]
   #generate tf-idf for the given document
   tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
   #sort the tf-idf vectors by descending order of scores
   sorted_items=sort_coo(tf_idf_vector.tocoo())
   #extract only the top n; n here is 5
   keywords=extract_topn_from_vector(feature_names,sorted_items,5)
   kw = []
   for k in keywords:
      kw.append(k)
   kwstr =  ', '.join(kw) 
   df.loc[i:i, 'abstract_kw'] = kwstr
   
   doc2=body[i]
   #generate tf-idf for the given document
   tf_idf_vector=tfidf_transformer2.transform(cv.transform([doc2]))
   #sort the tf-idf vectors by descending order of scores
   sorted_items=sort_coo(tf_idf_vector.tocoo())
   #extract only the top n; n here is 5
   keywords=extract_topn_from_vector(feature_names,sorted_items,5)
   kw = []
   for k in keywords:
      kw.append(k)
   kwstr =  ', '.join(kw) 
   df.loc[i:i, 'body_kw'] = kwstr 

In [23]:
df.to_csv(f'{root_path}/covid_temp_2.csv', index = False, header=True)