In [1]:
# NOTE: The way the functions are arranged 
# is a bit repetitive, you can group method in modules
# then you can from npmine.util import something

# Papers to use as example
# https://www.sciencedirect.com/science/article/pii/003194229085002W?via%3Dihub
# Download all papers Quimica Nova
# http://www.scielo.br/pdf/qn/v35n11/v35n11a34.pdf


from npmine.retrieve_doi import retrieve_doi

import os
import json

In [2]:
# Retrieve all DOis from this journal
# NOTE: The function should contain the parameter referring to the journal
#doi = retrieve_doi()
help(retrieve_doi)

Help on function retrieve_doi in module npmine.retrieve_doi:

retrieve_doi(journal_name='all')
    Performs web scraping to obtain DOIs from journal of ACS
    Parameters
    ----------
    journal_name: str
        iTo be added.
    Returns
        List containing DOIs.
    -------



In [3]:
# write doi file
#with open('data/doi_dnp.json', 'w+') as outfile:
#    json.dump(doi, outfile, indent=4)

# compress the json file
!zip -j data/doi_dnp.zip data/doi_dnp.json

/bin/sh: 1: zip: not found


In [3]:
# uncompress the json file
#!unzip data/doi_dnp.zip -d data/

# Load the file, so you don't need to generate it
# every time
with open('data/doi_dnp.json', 'r') as inputfile:
    doi = json.load(inputfile)

In [4]:
len(doi)

14481

In [5]:
doi[-3:]

['https://pubs.acs.org/doi/pdf/10.1021/acs.jnatprod.9b00914',
 'https://pubs.acs.org/doi/pdf/10.1021/npv082i012_1340240',
 'https://pubs.acs.org/doi/pdf/10.1021/npv082i012_1340198']

In [6]:
# Separating the download
# from entity retrieval
# is more efficient

from npmine.retrieve_chemical_entities import download_pdf

for url in doi[-3:]:
    d = url.split('/')[-1]
    download_pdf(url, '%s.pdf' % d)

In [7]:
import requests
r = requests.get(url)
r.status_code

200

In [8]:
with open('%s.pdf' % d, 'wb') as fd:
    fd.write(r.content)

In [10]:
# inspect example files
os.listdir('data/example_pdf')

['acs.jnatprod.5b00005.pdf',
 'acs.jnatprod.5b00009.pdf',
 'acs.jnatprod.5b00002.pdf',
 'acs.jnatprod.5b00008.pdf',
 'acs.jnatprod.5b00004.pdf']

In [11]:
# Test for a small number of files
from npmine.retrieve_chemical_entities import retrieve_chemical_entities

chemical_entities = [retrieve_chemical_entities(os.path.join('data/example_pdf', f)) 
                                                for f in os.listdir('data/example_pdf')]

with open("entities.json", "w+") as f:
    json.dump(chemical_entities, f)

In [12]:
chemical_entities

[{'data/example_pdf/acs.jnatprod.5b00005': {'oscar': [{'md5Sum': 'ae32af9b2c0bad89b2834bb6946922c1',
     'chemicalData': {'Prednisone': {'name': 'Prednisone',
       'standardInChI': 'InChI=1S/C21H26O5/c1-19-7-5-13(23)9-12(19)3-4-14-15-6-8-21(26,17(25)11-22)20(15,2)10-16(24)18(14)19/h5,7,9,14-15,18,22,26H,3-4,6,8,10-11H2,1-2H3/t14-,15-,18+,19-,20-,21-/m0/s1',
       'standardInChIKey': 'XOFYZVNMUHMLCC-ZPOLXVRWSA-N'},
      'naringenin': {'name': 'naringenin',
       'standardInChI': 'InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19)15-11(18)5-10(17)6-14(15)20-13/h1-6,13,16-18H,7H2',
       'standardInChIKey': 'FTVWIRXFELQLPI-UHFFFAOYSA-N'},
      'Tris': {'name': 'Tris',
       'standardInChI': 'InChI=1S/C4H11NO3/c5-4(1-6,2-7)3-8/h6-8H,1-3,5H2',
       'standardInChIKey': 'LENZDBCJOHFCAS-UHFFFAOYSA-N'},
      'MeCN': {'name': 'MeCN',
       'standardInChI': 'InChI=1S/C2H3N/c1-2-3/h1H3',
       'standardInChIKey': 'WEVYAHXRMPXWCK-UHFFFAOYSA-N'},
      'oxygen': {'name': 'oxygen',
       

In [14]:
# Creates one file with scientific name for each paper
from npmine.retrieve_scientific_name import retrieve_scietific_name, html2txt

fls = os.listdir('data/example_pdf')

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('data/example_pdf', f)
        out = pdf.replace('.pdf', '_gn.txt')
        retrieve_scietific_name(pdf, out)

In [15]:
from npmine.retrieve_chemical_entities_from_image import retrieve_chemical_entities_from_image

img_entities = []

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('data/example_pdf', f)
        img_entities.append(retrieve_chemical_entities_from_image(pdf)) 

In [16]:
img_entities

[{'data/example_pdf/acs.jnatprod.5b00005': {'osra': ['O=CC[C@@H](/C(=C/C(=C(/O)\\C)/OC)/C=C)OC1=C*2=C(C(=C1*)O)CC(O2)C(CC/C=C(\\C)/*)(O)C\n',
    'O=C1C[C@H](Oc2c1c(O)c(c(c2)O)C)c1cc(*)c(c(c1)*)*\n',
    'CC(=CCCC1(C)C=Cc2c(O1)cc1c(c2O)C(=O)[C@@H]([C@H](O1)c1cc(*)c(c(c1)*)*)*)C\n',
    'COc1cc(ccc1O)[C@@H]1CC(=O)c2c(O1)cc1c(c2O)CC(C(O1)(C)CCC=C(C)C)O\n',
    'COc1c(OC)cc(cc1OC)[C@@H]1CC(=O)c2c(O1)cc(cc2O)O\n',
    'CCC(C1(C)CCC(O1)C(=C)C)O\n',
    'CC(=CCCC(C1Cc2c(O1)cc1c(c2O)C(=O)CC(O1)c1ccc(cc1)O)(O)C)C\n',
    'COc1cc(ccc1O)[C@H]1Oc2cc(O)c(c(c2C(=O)[C@@H]1O)O)C/C=C(/CCC=C(C)C)\\C\n',
    'COc1cc(cc(c1O)OC)C1CC(=O)c2c(O1)cc(c(c2O)*)O\n',
    'Oc1cc2O[C@@H](CC(=O)c2c(c1)O)c1ccc(c(c1)O)O\n',
    'Oc1ccc(cc1)[C@@H]1CC(=O)c2c(O1)cc(cc2O)O\n',
    'C*C(C1CN1)CC\n',
    'CCC(CC(C=N)*)C\n']}},
 {'data/example_pdf/acs.jnatprod.5b00009': {'osra': ['*C[C@H]([C@@H](C(=O)N[C@H](C(=O)N)CC(=O)N)N)C\n',
    '*C[C@H]([C@@H](C(=O)NC(C(=O)N)CC(=O)C)N)C\n',
    'Sc1c[nH]c2c1cccc2\n',
    'O=CNc1ccccc1*

In [17]:
with open("img_entities.json", "w+") as f:
    json.dump(img_entities, f)

In [19]:
from npmine.postprocessing import entity_dict2dataframe

dflist = []
for e in chemical_entities:
    dflist.append(entity_dict2dataframe(e))

ModuleNotFoundError: No module named 'rdkit'