In [1]:
# NOTE: The way the functions are arranged 
# is a bit repetitive, you can group method in modules
# then you can from npmine.util import something

# Papers to use as example
# https://www.sciencedirect.com/science/article/pii/003194229085002W?via%3Dihub
# Download all papers Quimica Nova
# http://www.scielo.br/pdf/qn/v35n11/v35n11a34.pdf


from npmine.retrieve_doi import retrieve_doi

import os
import json

In [2]:
# Retrieve all DOis from this journal
# NOTE: The function should contain the parameter referring to the journal
#doi = retrieve_doi()
help(retrieve_doi)

Help on function retrieve_doi in module npmine.retrieve_doi:

retrieve_doi(journal_name='all')
    Performs web scraping to obtain DOIs from journal of ACS
    Parameters
    ----------
    journal_name: str
        iTo be added.
    Returns
        List containing DOIs.
    -------



In [3]:
# write doi file
#with open('data/doi_dnp.json', 'w+') as outfile:
#    json.dump(doi, outfile, indent=4)

# compress the json file
!zip -j data/doi_quim.zip data/doi_quim.json

/bin/sh: 1: zip: not found


In [4]:
# uncompress the json file
#!unzip data/doi_dnp.zip -d data/

# Load the file, so you don't need to generate it
# every time
with open('data/doi_quim.json', 'r') as inputfile:
    doi = json.load(inputfile)

In [5]:
len(doi)

8124

In [6]:
doi[17:20]

['http://static.sites.sbq.org.br/quimicanova.sbq.org.br/pdf/v37n2a15.pdf',
 'http://static.sites.sbq.org.br/quimicanova.sbq.org.br/pdf/v37n2a15-Supl01.pdf',
 'http://static.sites.sbq.org.br/quimicanova.sbq.org.br/pdf/v37n2a16.pdf']

In [7]:
# Separating the download
# from entity retrieval
# is more efficient

from npmine.retrieve_chemical_entities import download_pdf

for url in doi[17:20]:
    d = url.split('/')[-1]
    if '.pdf' in d:
        download_pdf(url, d)
    else:
        download_pdf(url, '%s.pdf' % d)

In [8]:
!ls v37*.pdf

v37n2a15-Supl01.pdf  v37n2a15.pdf  v37n2a16.pdf


In [9]:
import requests
r = requests.get(url)
r.status_code

200

In [10]:
with open(d, 'wb') as fd:
    fd.write(r.content)

In [12]:
# inspect example files
if not os.path.exists('data/example_pdf'):
    os.path.mkdir('data/example_pdf')
    
os.listdir('data/example_pdf')

['v37n2a15-Supl01.pdf', 'v37n2a15.pdf', 'v37n2a16.pdf']

In [13]:
# Test for a small number of files
from npmine.retrieve_chemical_entities import retrieve_chemical_entities

chemical_entities = [retrieve_chemical_entities(os.path.join('data/example_pdf', f)) 
                                                for f in os.listdir('data/example_pdf')]

with open("entities.json", "w+") as f:
    json.dump(chemical_entities, f)

In [14]:
chemical_entities

[{'data/example_pdf/v37n2a15-Supl01': {'oscar': [{'md5Sum': 'd8d9ecb7c0e6b71b1dacd7a2bb360e57',
     'chemicalData': {'CDCl3': {'name': 'CDCl3',
       'standardInChI': 'InChI=1S/CHCl3/c2-1(3)4/h1H/i1D',
       'standardInChIKey': 'HEDRZPFGACZZDS-MICDWDOJSA-N'},
      'hexane': {'name': 'hexane',
       'standardInChI': 'InChI=1S/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3',
       'standardInChIKey': 'VLKZOEOYAKHREP-UHFFFAOYSA-N'}}}]}},
 {'data/example_pdf/v37n2a15': {'oscar': [{'md5Sum': '4ee277015c3e3578394885d302f0b06b',
     'chemicalData': {'Nonacosane': {'name': 'Nonacosane',
       'standardInChI': 'InChI=1S/C29H60/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-28-26-24-22-20-18-16-14-12-10-8-6-4-2/h3-29H2,1-2H3',
       'standardInChIKey': 'IGGUPRCHHJZPBS-UHFFFAOYSA-N'},
      'Triacontane': {'name': 'Triacontane',
       'standardInChI': 'InChI=1S/C30H62/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-30-28-26-24-22-20-18-16-14-12-10-8-6-4-2/h3-30H2,1-2H3',
       'standardInChIKey': 'JXTPJDDICSTXJX-

In [15]:
# Creates one file with scientific name for each paper
from npmine.retrieve_scientific_name import retrieve_scientific_name, html2txt

fls = os.listdir('data/example_pdf')

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('data/example_pdf', f)
        out = pdf.replace('.pdf', '_gn.txt')
        retrieve_scientific_name(pdf, out)

In [16]:
from npmine.retrieve_chemical_entities_from_image import retrieve_chemical_entities_from_image

img_entities = []

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('data/example_pdf', f)
        img_entities.append(retrieve_chemical_entities_from_image(pdf)) 

In [17]:
img_entities

[{'data/example_pdf/v37n2a15-Supl01': {'osra': ['C[O+](C*1(C)(C)([Li+])*CCC1)I\n',
    'CCCCCC*(CC(=C)CCC(C=[N](=CC([C](#C)CC(*C(C)C)C)C)(#CC)C=C$C)C)C\n',
    'CCC1CCC(C1)C\n',
    'COc1cc2c(cc1OC)cc(c(c2O)C)C\n',
    'COc1cc2C(=O)C(C)C(C(c2cc1OC)c1ccc2c(c1)OCO2)C\n',
    'COc1c/c(=C\\c2ccc3c(c2)OCO3)/c(=C=O)cc1OC\n',
    'Cc1cc2cc3OCOc3cc2c(c1C)O\n',
    'COc1cc(ccc1OC)C1C(C)C(C)C(=O)c2c1cc1OCOc1c2\n',
    'COc1cc(ccc1OC)/C=c/1\\cc2O[I]Oc2cc1=C=O\n']}},
 {'data/example_pdf/v37n2a15': {'osra': ['*Oc1cc(ccc1O*)[C@]1(C)C(C)C(*)C(=O)c2c1cc(O)c(c2*)O\n',
    '*Oc1cc(ccc1O*)C1[C@@H](C)C(C)C(=O)c2c1cc(O)c(c2)O\n',
    'COc1cc(ccc1OC)[C@H]1C(C)[C@@H](C)C(=O)c2c1cc(OC)c(c2)OC\n',
    'C[C@@H]1C(OC([C@@H]1C)c1ccc2c(c1)OCO2)c1ccc2c(c1)OCO2\n',
    'COc1cc(ccc1OC)C1OC([C@H](C1C)C)c1ccc2c(c1)OCO2\n',
    '*Oc1cc(ccc1O*)C1C(C)C(C)C(=O)c2c1cc1OCOc1c2\n',
    '*Oc1cc(ccc1O*)C1OC(C(C1C)C)c1ccc(c(c1)OC)OC\n']}},
 {'data/example_pdf/v37n2a16': {'osra': ['*OC1C(O)C(OC(C1O)C)O[C@H]1Cc2c(O)cc(cc2O[C@@H]1c

In [18]:
with open("img_entities.json", "w+") as f:
    json.dump(img_entities, f)

In [19]:
from npmine.postprocessing import entity_dict2dataframe

dflist = []
for e in chemical_entities:
    dflist.append(entity_dict2dataframe(e))

In [20]:
# How result provided in data/entities_dataframe.tsv
# was generated
import pandas as pd
pd.concat(dflist).head()

Unnamed: 0,name,standardInChI,standardInChIKey,doi,ExactMolWt
CDCl3,CDCl3,InChI=1S/CHCl3/c2-1(3)4/h1H/i1D,HEDRZPFGACZZDS-MICDWDOJSA-N,data/example_pdf/v37n2a15-Supl01,118.92066
hexane,hexane,"InChI=1S/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3",VLKZOEOYAKHREP-UHFFFAOYSA-N,data/example_pdf/v37n2a15-Supl01,86.10955
Nonacosane,Nonacosane,InChI=1S/C29H60/c1-3-5-7-9-11-13-15-17-19-21-2...,IGGUPRCHHJZPBS-UHFFFAOYSA-N,data/example_pdf/v37n2a15,408.469502
Triacontane,Triacontane,InChI=1S/C30H62/c1-3-5-7-9-11-13-15-17-19-21-2...,JXTPJDDICSTXJX-UHFFFAOYSA-N,data/example_pdf/v37n2a15,422.485152
acetone,acetone,InChI=1S/C3H6O/c1-3(2)4/h1-2H3,CSCPPACGZOOCGX-UHFFFAOYSA-N,data/example_pdf/v37n2a15,58.041865


In [21]:
from npmine.postprocessing import sci_name_dict2dataframe

nms = []
fls = os.listdir('data/example_pdf/')
for fl in fls:
    if '_gn.txt' in fl:
        with open(os.path.join('data/example_pdf/', fl), encoding='utf-8') as f:
            gn = json.load(f)
        nms.append(sci_name_dict2dataframe(gn, fl))  


In [22]:
# How result provided in data/gn_dataframe.tsv
# was generated
import pandas as pd
pd.concat(nms).head()

Unnamed: 0,doi,verbatim,odds,dataSourceId,taxonId,classificationPath,classificationRank,matchType
0,v37n2a15_gn.txt,Holostylis reniformis:,252676700000.0,1,1615480,Plantae|Tracheophyta|Magnoliopsida|Piperales|A...,kingdom|phylum|class|order|family|genus|species,ExactCanonicalMatch
1,v37n2a15_gn.txt,Lucia,71783.1,1,4092258,Animalia|Arthropoda|Insecta|Lepidoptera|Papili...,kingdom|phylum|class|order|superfamily|family|...,ExactMatch
2,v37n2a15_gn.txt,Vieira,11568.01,1,4217981,Animalia|Arthropoda|Insecta|Neuroptera|Chrysop...,kingdom|phylum|class|order|family|genus,ExactMatch
3,v37n2a15_gn.txt,Antoniana,5662.834,165,145260271,,,ExactCanonicalMatch
4,v37n2a15_gn.txt,(Holostylis reniformis),252676700000.0,1,1615480,Plantae|Tracheophyta|Magnoliopsida|Piperales|A...,kingdom|phylum|class|order|family|genus|species,ExactCanonicalMatch


In [23]:
from npmine.postprocessing import image_dict2dataframe

imglist = []

for img in img_entities:
    imglist.append(image_dict2dataframe(img))

In [24]:
# How result provided in data/entities_img_dataframe.tsv
# was generated
import pandas as pd
pd.concat(imglist).head()

Unnamed: 0,doi,smiles,standardInChIKey,ExactMolWt
0,data/example_pdf/v37n2a15-Supl01,C[O+](C*1(C)(C)([Li+])*CCC1)I,,0.0
1,data/example_pdf/v37n2a15-Supl01,CCCCCC*(CC(=C)CCC(C=[N](=CC([C](#C)CC(*C(C)C)C...,,0.0
2,data/example_pdf/v37n2a15-Supl01,CCC1CCC(C1)C,PQXAPVOKLYINEI-UHFFFAOYSA-N,112.125201
3,data/example_pdf/v37n2a15-Supl01,COc1cc2c(cc1OC)cc(c(c2O)C)C,FHIMWCCRGDSISW-UHFFFAOYSA-N,232.109944
4,data/example_pdf/v37n2a15-Supl01,COc1cc2C(=O)C(C)C(C(c2cc1OC)c1ccc2c(c1)OCO2)C,AQILVQJBVWGDOL-UHFFFAOYSA-N,354.146724


In [25]:
from npmine.create_report import create_report

help(create_report)

Help on function create_report in module npmine.create_report:

create_report(report_print, dois, out_file='npmine_report.html', useSVG=False)
    Creates an html report from NPMINE's results
    Parameters
    ----------
    report_print: pd.DataFrame
        DataFrame containing columns 'doi', 'pubchem', 'ExactMolWt', 'smiles','source'.
    dois: str or list
        Directory of doi link files or link list.
    out_file: str
        HTML output file name.
    useSVG: bool
        If svg format should be used. Default is png.
    Returns
        Report html file.
    -------



In [26]:
path = 'data/example_pdf'
doi = os.listdir(path)

doi = [os.path.join(path, x) for x in doi if '.pdf' in x]
doi

['data/example_pdf/v37n2a15-Supl01.pdf',
 'data/example_pdf/v37n2a15.pdf',
 'data/example_pdf/v37n2a16.pdf']

In [27]:
from npmine.retrieve_ID_pubchem_spiderchem import inchikey2cid

df_oscar = pd.concat(dflist)
df_oscar['source'] = 'oscar'
df_oscar['pubchem'] = [inchikey2cid(x) for x in df_oscar['standardInChIKey']]

df_oscar

Unnamed: 0,name,standardInChI,standardInChIKey,doi,ExactMolWt,source,pubchem
CDCl3,CDCl3,InChI=1S/CHCl3/c2-1(3)4/h1H/i1D,HEDRZPFGACZZDS-MICDWDOJSA-N,data/example_pdf/v37n2a15-Supl01,118.92066,oscar,71583
hexane,hexane,"InChI=1S/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3",VLKZOEOYAKHREP-UHFFFAOYSA-N,data/example_pdf/v37n2a15-Supl01,86.10955,oscar,8058
Nonacosane,Nonacosane,InChI=1S/C29H60/c1-3-5-7-9-11-13-15-17-19-21-2...,IGGUPRCHHJZPBS-UHFFFAOYSA-N,data/example_pdf/v37n2a15,408.469502,oscar,12409
Triacontane,Triacontane,InChI=1S/C30H62/c1-3-5-7-9-11-13-15-17-19-21-2...,JXTPJDDICSTXJX-UHFFFAOYSA-N,data/example_pdf/v37n2a15,422.485152,oscar,12535
acetone,acetone,InChI=1S/C3H6O/c1-3(2)4/h1-2H3,CSCPPACGZOOCGX-UHFFFAOYSA-N,data/example_pdf/v37n2a15,58.041865,oscar,180
furan,furan,InChI=1S/C4H4O/c1-2-4-5-3-1/h1-4H,YLQBMQCUIZJEEH-UHFFFAOYSA-N,data/example_pdf/v37n2a15,68.026215,oscar,8029
Hexacosane,Hexacosane,InChI=1S/C26H54/c1-3-5-7-9-11-13-15-17-19-21-2...,HMSWAIKSFDFLKN-UHFFFAOYSA-N,data/example_pdf/v37n2a15,366.422552,oscar,12407
hexane acetone ethanol,hexane acetone ethanol,InChI=1S/C6H14.C3H6O.C2H6O/c1-3-5-6-4-2;1-3(2)...,SVHVEORJPVYLCT-UHFFFAOYSA-N,data/example_pdf/v37n2a15,190.19328,oscar,87578811
isoeugenol,isoeugenol,InChI=1S/C10H12O2/c1-3-4-8-5-6-9(11)10(7-8)12-...,BJIOGJUNALELMI-UHFFFAOYSA-N,data/example_pdf/v37n2a15,164.08373,oscar,7338
Octacosane,Octacosane,InChI=1S/C28H58/c1-3-5-7-9-11-13-15-17-19-21-2...,ZYURHZPYMFLWSH-UHFFFAOYSA-N,data/example_pdf/v37n2a15,394.453852,oscar,12408


In [40]:
df_osra = pd.concat(imglist)
df_osra['source'] = 'osra'

pubchem = []
for x in df_osra['standardInChIKey']:
    cid = inchikey2cid(x)
    if not cid is None:
        pubchem.append(cid)
    else:
        pubchem.append(0)

df_osra['pubchem'] = pubchem
df_osra

Unnamed: 0,doi,smiles,standardInChIKey,ExactMolWt,source,pubchem
0,data/example_pdf/v37n2a15-Supl01,C[O+](C*1(C)(C)([Li+])*CCC1)I,,0.0,osra,0
1,data/example_pdf/v37n2a15-Supl01,CCCCCC*(CC(=C)CCC(C=[N](=CC([C](#C)CC(*C(C)C)C...,,0.0,osra,0
2,data/example_pdf/v37n2a15-Supl01,CCC1CCC(C1)C,PQXAPVOKLYINEI-UHFFFAOYSA-N,112.125201,osra,19502
3,data/example_pdf/v37n2a15-Supl01,COc1cc2c(cc1OC)cc(c(c2O)C)C,FHIMWCCRGDSISW-UHFFFAOYSA-N,232.109944,osra,12353794
4,data/example_pdf/v37n2a15-Supl01,COc1cc2C(=O)C(C)C(C(c2cc1OC)c1ccc2c(c1)OCO2)C,AQILVQJBVWGDOL-UHFFFAOYSA-N,354.146724,osra,13845954
5,data/example_pdf/v37n2a15-Supl01,COc1c/c(=C\c2ccc3c(c2)OCO3)/c(=C=O)cc1OC,DPFXJGAVVDQCIM-LFYBBSHMSA-N,298.084124,osra,0
6,data/example_pdf/v37n2a15-Supl01,Cc1cc2cc3OCOc3cc2c(c1C)O,RQBSCNIDCUKHTJ-UHFFFAOYSA-N,216.078644,osra,0
7,data/example_pdf/v37n2a15-Supl01,COc1cc(ccc1OC)C1C(C)C(C)C(=O)c2c1cc1OCOc1c2,QJKIOLPHXOZLDC-UHFFFAOYSA-N,354.146724,osra,14034477
8,data/example_pdf/v37n2a15-Supl01,COc1cc(ccc1OC)/C=c/1\cc2O[I]Oc2cc1=C=O,DGMPNBYAGHZWNH-VZUCSPMQSA-N,410.972946,osra,0
0,data/example_pdf/v37n2a15,*Oc1cc(ccc1O*)[C@]1(C)C(C)C(*)C(=O)c2c1cc(O)c(...,,310.084124,osra,0


In [41]:
df_osra.shape

(21, 6)

In [34]:
from rdkit import Chem

def inchi2smiles(inchi):
    return Chem.MolToSmiles(Chem.MolFromInchi(inchi))

df_oscar['smiles'] = [inchi2smiles(x) for x in df_oscar['standardInChI']]

In [42]:
report_print = pd.concat([df_oscar[['doi', 'pubchem', 'ExactMolWt', 'smiles','source']],
                         df_osra[['doi', 'pubchem', 'ExactMolWt', 'smiles','source']]]).reset_index(drop=True)

report_print['source'].value_counts()

oscar    55
osra     21
Name: source, dtype: int64

In [43]:
create_report(report_print, doi, out_file='npmine_report.html')

<Figure size 432x288 with 0 Axes>

In [44]:
!zip -r npmine_report.zip npmine_report.html figs data/example_pdf

/bin/sh: 1: zip: not found
