In [5]:
import numpy as np 
import pandas as pd 
import glob
import json

In [6]:
# Root folder that holds the downloaded CORD-19 dataset.
root_path = 'D:/DS/COVID/CORD19'
# root_path = 'C:\\Users\\emarellano\\Documents\\Ezequiel\\Cursos\\CORD-19-research-challenge\\2020-03-13'

# Location of the metadata file inside the dataset folder.
metadata_path = f'{root_path}/metadata.csv'

# Load the metadata, forcing the identifier columns to be read as strings.
meta_df = pd.read_csv(
    metadata_path,
    dtype=dict.fromkeys(['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str),
)

In [7]:
# Recursively collect the path of every JSON file (one per paper) under the dataset root.
all_json = glob.glob(root_path + '/**/*.json', recursive=True)
len(all_json)

29315

In [8]:
# Empty frame that will receive one row per paper, with the columns of interest:
#   sha:       paper id
#   title:     paper title
#   abstract:  paper abstract
#   body_text: paper body
df = pd.DataFrame(
    columns=[
        'sha',
        'title',
        'abstract',
        'body_text',
    ]
)

In [9]:
# Read every JSON file and collect one (sha, title, abstract, body_text) record per paper.
# Records are accumulated in a plain list and the DataFrame is built once at the end:
# calling pd.concat inside the loop copies the whole frame on every iteration (quadratic).
records = []
for json_path in all_json:
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # (e.g. cp1252 on Windows) can fail to decode non-ASCII characters.
    with open(json_path, encoding='utf-8') as file:
        content = json.load(file)

    # Each section is a list of paragraph dicts; join their 'text' with newlines.
    # A missing 'abstract' key is treated as an empty abstract instead of raising.
    abstract = '\n'.join(entry['text'] for entry in content.get('abstract', []))
    body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    records.append((content['paper_id'], content['metadata']['title'], abstract, body_text))

# The previous implementation prepended each new row, so the final frame was in
# reverse file order; reverse the records to keep the same row order and index.
df = pd.DataFrame(records[::-1], columns=['sha', 'title', 'abstract', 'body_text'])

 

In [10]:
# Outer-join the parsed papers with the metadata on the paper id, keeping rows
# that appear on only one side. Columns present in both frames get _x / _y suffixes.
result = df.merge(meta_df, how='outer', on='sha')


In [15]:
# Persist the merged frame as CSV so it can be loaded from any other script.
result_path = root_path + '/result.csv'
result.to_csv(result_path, index=False)

### Calidad de datos
Los registros que tienen `has_full_text` en `True` son los que deberían tener un archivo JSON.

Buscar duplicados por body_text y abstract.

In [None]:
# Summary statistics of the metadata frame.
meta_df.describe()

In [None]:
# Number of missing values in each metadata column.
print(meta_df.isna().sum())

# Number of metadata rows flagged as having full text.
with_full_text = meta_df.loc[meta_df['has_full_text']]
print('\nCon full text: ', len(with_full_text))

In [103]:
# Summary statistics (count / unique / top / freq) of the parsed JSON papers.
df.describe()

Unnamed: 0,sha,title,abstract,body_text
count,29315,29315.0,29315.0,29315
unique,29315,25114.0,21050.0,29117
top,c6e2851ef1f6e35c2954eeaa913f2da2502842d9,,,"In previous reports, workers have characterize..."
freq,1,3583.0,8051.0,4


In [251]:
# Metadata rows whose sha AND title are both absent from the parsed JSON papers.
meta_without_json = meta_df[
    ~meta_df['sha'].isin(df['sha']) & ~meta_df['title'].isin(df['title'])
]
print('Estan en metadata pero no tienen un json: ', len(meta_without_json))

# Parsed JSON papers whose sha AND title are both absent from the metadata.
json_without_meta = df[
    ~df['sha'].isin(meta_df['sha']) & ~df['title'].isin(meta_df['title'])
]
print('Tienen un json pero no estan en metadata: ', len(json_without_meta))

Estan en metadata pero no tienen un json:  15521
Tienen un json pero no estan en metadata:  1048


In [250]:
# Number of missing values in each column of the merged frame.
result.isna().sum()

sha                            15758
title_x                        16530
abstract_x                     16530
body_text                      16530
source_x                        1637
title_y                         1861
doi                             5107
pmcid                          22538
pubmed_id                      22914
license                         1637
abstract_y                     10051
publish_time                   11660
authors                         4783
journal                        12684
Microsoft Academic Paper ID    44893
WHO #Covidence                 44090
has_full_text                   1637
full_text_file                 13028
dtype: int64

In [247]:
# Compare the values parsed from the JSON files (_x) with those in the metadata (_y).
same_abstract = result['abstract_x'] == result['abstract_y']
print('Tienen el mismo abstract: ', len(result.loc[same_abstract, ['abstract_x', 'abstract_y']]))

same_title = result['title_x'] == result['title_y']
print('Tienen el mismo title: ', len(result.loc[same_title, ['title_x', 'title_y']]))

# Show the rows where the two titles differ (a missing title on either side counts as different).
different_title = result['title_x'] != result['title_y']
result.loc[different_title, ['title_x', 'title_y']]


Tienen el mismo abstract:  2108
Tienen el mismo title:  14619


Unnamed: 0,title_x,title_y
0,"Supplemental material for the paper ""Evidence ...",
2,Identification of a Subdomain of CENPB That Is...,Identification of a subdomain of CENP-B that i...
3,Brief Definitive Report MACROPHAGES GENETICALL...,Macrophages genetically resistant to mouse hep...
9,STATE OF THE ART 1B-H3-01 Current perspectives...,Current perspectives in transfusion-transmitte...
10,Effect of acetate Ringer ' s solution with or ...,Effect of acetate Ringer(’)s solution with or ...
...,...,...
45852,,Potential role of inanimate surfaces for the s...
45853,,The Healthy Infant Nasal Transcriptome: A Benc...
45854,,The intrinsic vulnerability of networks to epi...
45855,,Lung ultrasound as a diagnostic tool for radio...


### Busco duplicados

In [256]:
# Rows of the merged frame whose body_text is present and occurs more than once.
# Both masks are computed from `result` itself: a mask taken from `df` has a
# different length/index than `result` after the outer merge, so it would be
# silently reindexed and filter the wrong rows.
(result[result['body_text'].notnull()
        & result.duplicated('body_text', keep=False)]
 .sort_values(by=['body_text', 'sha', 'doi']))

Unnamed: 0,sha,title_x,abstract_x,body_text,source_x,title_y,doi,pmcid,pubmed_id,license,abstract_y,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
27270,220fbe1f5e79e25737c0624de9a4245bfee48ec0,Complete genome sequence of canine astrovirus ...,,", but to date, no full-length genome sequence ...",,,,,,,,,,,,,,
25245,59b5d767e4bb03b049a800c82c2e78edd0650907,Complete genome sequence of canine astrovirus ...,,", but to date, no full-length genome sequence ...",,,,,,,,,,,,,,
2122,182fc0c38ce9f98fa70dc7ed3063551d426c4b7a,To appear in: One Health,,. The WHO have announced that the disease caus...,,,,,,,,,,,,,,
1547,58b5c77c9fb3f68a3ad84a3f15275dc0e4554192,To appear in: One Health,,. The WHO have announced that the disease caus...,,,,,,,,,,,,,,
2055,1fe9d44383ae2debf062555ea05a816dd3848a91,"Challenges presented by MERS corona virus, and...",Numerous viral infections have arisen and affe...,. These two viruses are causing acute and ofte...,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,34ba72963a44bbe5ce71b7ec96c6f71dd1069c2e,574 Coronavirus NL63 Illnesses in Infancy are ...,"RATIONALE: In infancy, rhinovirus illnesses an...",p 5 0.005) prevalence in children who needed t...,Elsevier,Coronavirus NL63 Illnesses in Infancy are a Ri...,10.1016/j.jaci.2006.11.508,,,els-covid,,2007-01-31,"Pappas, T.E.; Sullivan Dille, K.T.; Lee, W.; G...",Journal of Allergy and Clinical Immunology,,,True,custom_license
354,d7732a3cb93e9f48792f9072956ef244552afca1,,,where did you receive your training and what m...,,,,,,,,,,,,,,
54,fa46fb0587956a218b9b81d5aa6b2a6c7ec68126,,,where did you receive your training and what m...,,,,,,,,,,,,,,
1759,4050a72fdcfdeddecf74c01047934871e72abd8e,"Emerging Zoonoses: the ""One Health Approach""",Zoonoses represent a public health risk recent...,www.e-shaw.org at predicting the risk of an ep...,,,,,,,,,,,,,,


In [203]:
# meta_df[(meta_df['title'].notnull()) & (meta_df.duplicated(['title'], keep=False)) 
#         & (meta_df.duplicated(['sha'], keep=False)) & (meta_df.duplicated(['abstract'], keep=False))
#         & (meta_df.duplicated(['doi'], keep=False))
#        ].sort_values(by='title')

1048

#### sha y body_text duplicados
Son documentos que incluyen más de una investigación; por lo tanto tienen distintos DOI, distintos títulos y distintos autores, pero el body_text incluye todos los artículos.

- ¿Concatenar todos los títulos en uno?
- ¿Buscar la forma de separar el body_text para cada caso?

Como son solo 19, tomaría el enfoque más simple.
Si dejo los repetidos, ¿se corre el riesgo de sesgar los resultados?

In [140]:
# Count the rows whose sha AND body_text both occur more than once in the merged frame.
# NOTE(review): duplicated() also flags repeated missing values — confirm that is intended here.
dup_sha = result.duplicated('sha', keep=False)
dup_body = result.duplicated('body_text', keep=False)
len(result[dup_sha & dup_body])

19

In [139]:
# print(df.loc[df['sha'] == '72a5640aa0c307fbe171ca7ad55d3fda48b53988']['title'][11736])
# Titles of every metadata row that shares this sha.
meta_df.loc[meta_df['sha'] == '72a5640aa0c307fbe171ca7ad55d3fda48b53988', 'title']
# meta_df.loc[meta_df['doi'] == '10.1186/cc2504']

28931    Computerised tomography (CT) in severe acute r...
28932    Critically ill patients with severe acute resp...
28933    Retrospective analysis of critically ill patie...
28934    Increase in methicillin-resistant Staphylococc...
Name: title, dtype: object