In [1]:
# uncompress the json file
#!unzip data/doi_quim.zip -d data/
!unzip doi_quim.zip

import json

# Load the file, so you don't need to generate it
# every time
with open('doi_quim.json', 'r') as inputfile:
    doi = json.load(inputfile)

Archive:  doi_quim.zip
  inflating: doi_quim.json           


In [2]:
# Number of entries loaded from doi_quim.json
len(doi)

8124

In [3]:
# Example PDF links used for the small demonstration run below:
# one article, its supplementary file, and a second article.
urls = [
    'http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=15&nomeArquivo=v37n2a15.pdf',
    'http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=15&nomeArquivo=v37n2a15-Supl01.pdf',
    'http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=16&nomeArquivo=v37n2a16.pdf',
]

urls

['http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=15&nomeArquivo=v37n2a15.pdf',
 'http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=15&nomeArquivo=v37n2a15-Supl01.pdf',
 'http://quimicanova.sbq.org.br/audiencia_pdf.asp?aid2=16&nomeArquivo=v37n2a16.pdf']

In [4]:
# Separating the download
# from entity retrieval
# is more efficient

from npmine.retrieve_chemical_entities import download_pdf

# Only the first URL is fetched in this demonstration.
for url in urls[:1]:
    # The local file name is whatever follows the last '=' in the URL
    # (the value of the final query parameter).
    d = url.split('=')[-1]
    # Test the suffix, not mere containment: a name such as
    # 'x.pdf.bak' contains '.pdf' but still needs the extension added.
    if not d.endswith('.pdf'):
        d = '%s.pdf' % d
    download_pdf(url, d)

In [5]:
# List the PDF(s) downloaded into the working directory
!ls v37*.pdf

v37n2a15.pdf


In [6]:
import os

# inspect example files
if not os.path.exists('pdfs'):
    os.mkdir('pdfs')
    
!mv v37*.pdf pdfs
!ls pdfs

v37n2a15.pdf  v37n2a15_gn.txt


In [7]:
# Test for a small number of files
from npmine.retrieve_chemical_entities import retrieve_chemical_entities

# Select files by extension. A substring test ('.pdf' in name) would
# also pick up names such as 'paper.pdf.part'.
pdf_paths = [os.path.join('pdfs', name)
             for name in os.listdir('pdfs') if name.endswith('.pdf')]

# One result per PDF, in directory-listing order.
chemical_entities = [retrieve_chemical_entities(p) for p in pdf_paths]

# Persist the results. The file is only written, so plain "w" is enough
# (the original "w+" additionally opened it for reading).
with open("entities.json", "w") as out:
    json.dump(chemical_entities, out)

In [8]:
# Display the extracted entities (one list element per processed PDF)
chemical_entities

[{'pdfs/v37n2a15': {'oscar': [{'md5Sum': '4ee277015c3e3578394885d302f0b06b',
     'chemicalData': {'pentacosane': {'name': 'pentacosane',
       'standardInChI': 'InChI=1S/C25H52/c1-3-5-7-9-11-13-15-17-19-21-23-25-24-22-20-18-16-14-12-10-8-6-4-2/h3-25H2,1-2H3',
       'standardInChIKey': 'YKNWIILGEFFOPE-UHFFFAOYSA-N'},
      'acetone': {'name': 'acetone',
       'standardInChI': 'InChI=1S/C3H6O/c1-3(2)4/h1-2H3',
       'standardInChIKey': 'CSCPPACGZOOCGX-UHFFFAOYSA-N'},
      'Hexacosane': {'name': 'Hexacosane',
       'standardInChI': 'InChI=1S/C26H54/c1-3-5-7-9-11-13-15-17-19-21-23-25-26-24-22-20-18-16-14-12-10-8-6-4-2/h3-26H2,1-2H3',
       'standardInChIKey': 'HMSWAIKSFDFLKN-UHFFFAOYSA-N'},
      'CO2': {'name': 'CO2',
       'standardInChI': 'InChI=1S/CO2/c2-1-3',
       'standardInChIKey': 'CURLTUGMZLYLDI-UHFFFAOYSA-N'},
      'Pentacosane': {'name': 'Pentacosane',
       'standardInChI': 'InChI=1S/C25H52/c1-3-5-7-9-11-13-15-17-19-21-23-25-24-22-20-18-16-14-12-10-8-6-4-2/h3-25H2,

In [9]:
%%time

# Creates one file with scientific name for each paper
from npmine.retrieve_scientific_name import retrieve_scientific_name, html2txt

fls = os.listdir('pdfs')

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('pdfs', f)
        out = pdf.replace('.pdf', '_gn.txt')
        retrieve_scientific_name(pdf, out)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 1.29 s


In [10]:
%%time

from npmine.retrieve_chemical_entities_from_image import retrieve_chemical_entities_from_image

img_entities = []

for f in fls:
    if '.pdf' in f:
        pdf = os.path.join('pdfs', f)
        img_entities.append(retrieve_chemical_entities_from_image(pdf)) 

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1min 40s


In [11]:
# Display the image-derived entities (one list element per processed PDF)
img_entities

[{'pdfs/v37n2a15': {'osra': ['*Oc1cc(ccc1O*)[C@]1(C)C(*)C(*)C(=O)c2c1cc(O)c(c2)O\n',
    '*Oc1cc(ccc1O*)[C]1[C]([C]([C]([C]2[C]1[C][C]([C]([C]2)O)O)=O)C)C\n',
    'COc1cc(ccc1OC)[C@H]1C(C)[C@@H](C)C(=O)c2c1cc(OC)c(c2)OC\n',
    'C[C@@H]1C(OC([C@@H]1C)c1ccc2c(c1)OCO2)c1ccc2c(c1)OCO2\n',
    'COc1cc(ccc1OC)C1OC([C@H](C1C)C)c1ccc2c(c1)OCO2\n',
    '*Oc1cc(ccc1O*)C1C(C)C(C)C(=O)c2c1cc1OCOc1c2\n',
    '*Oc1cc(ccc1O*)C1OC(C(C1C)C)c1ccc(c(c1)OC)OC\n']}}]

In [12]:
# Save the image-derived entities to img_entities.json
with open("img_entities.json", "w+") as handle:
    json.dump(img_entities, handle)

In [13]:
from npmine.postprocessing import entity_dict2dataframe

# Convert every entry of chemical_entities, keeping the original order.
dflist = [entity_dict2dataframe(entity) for entity in chemical_entities]

In [14]:
# This is how the result provided in data/entities_dataframe.tsv
# was generated: the per-paper tables in dflist are concatenated.
# Only the first rows are shown here.
import pandas as pd
pd.concat(dflist).head()

Unnamed: 0,name,standardInChI,standardInChIKey,doi,ExactMolWt
pentacosane,pentacosane,InChI=1S/C25H52/c1-3-5-7-9-11-13-15-17-19-21-2...,YKNWIILGEFFOPE-UHFFFAOYSA-N,pdfs/v37n2a15,352.406902
acetone,acetone,InChI=1S/C3H6O/c1-3(2)4/h1-2H3,CSCPPACGZOOCGX-UHFFFAOYSA-N,pdfs/v37n2a15,58.041865
Hexacosane,Hexacosane,InChI=1S/C26H54/c1-3-5-7-9-11-13-15-17-19-21-2...,HMSWAIKSFDFLKN-UHFFFAOYSA-N,pdfs/v37n2a15,366.422552
CO2,CO2,InChI=1S/CO2/c2-1-3,CURLTUGMZLYLDI-UHFFFAOYSA-N,pdfs/v37n2a15,43.989829
Pentacosane,Pentacosane,InChI=1S/C25H52/c1-3-5-7-9-11-13-15-17-19-21-2...,YKNWIILGEFFOPE-UHFFFAOYSA-N,pdfs/v37n2a15,352.406902


In [15]:
from npmine.postprocessing import sci_name_dict2dataframe

# Collect one converted table per '<paper>_gn.txt' file in pdfs/.
nms = []
fls = os.listdir('pdfs/')
for fl in fls:
    # Suffix test: only files that actually end in '_gn.txt' are
    # parsed. A substring test would also match e.g. 'x_gn.txt.bak'.
    if fl.endswith('_gn.txt'):
        # The *_gn.txt files are read as JSON.
        with open(os.path.join('pdfs/', fl), encoding='utf-8') as f:
            gn = json.load(f)
        # The file name is passed along with the parsed content.
        nms.append(sci_name_dict2dataframe(gn, fl))


In [16]:
# This is how the result provided in data/gn_dataframe.tsv
# was generated: the per-file tables in nms are concatenated.
# Only the first rows are shown here.
import pandas as pd
pd.concat(nms).head()

Unnamed: 0,doi,verbatim,odds,dataSourceId,taxonId,classificationPath,classificationRank,matchType
0,v37n2a15_gn.txt,Holostylis reniformis:,252676700000.0,1,1615480,Plantae|Tracheophyta|Magnoliopsida|Piperales|A...,kingdom|phylum|class|order|family|genus|species,ExactCanonicalMatch
1,v37n2a15_gn.txt,Lucia,71783.1,1,4092258,Animalia|Arthropoda|Insecta|Lepidoptera|Papili...,kingdom|phylum|class|order|superfamily|family|...,ExactMatch
2,v37n2a15_gn.txt,Vieira,11568.01,1,4217981,Animalia|Arthropoda|Insecta|Neuroptera|Chrysop...,kingdom|phylum|class|order|family|genus,ExactMatch
3,v37n2a15_gn.txt,Antoniana,5662.834,165,145260271,,,ExactCanonicalMatch
4,v37n2a15_gn.txt,(Holostylis reniformis),252676700000.0,1,1615480,Plantae|Tracheophyta|Magnoliopsida|Piperales|A...,kingdom|phylum|class|order|family|genus|species,ExactCanonicalMatch


In [17]:
from npmine.postprocessing import image_dict2dataframe

# Convert every entry of img_entities, keeping the original order.
imglist = [image_dict2dataframe(entry) for entry in img_entities]

In [18]:
# This is how the result provided in data/entities_img_dataframe.tsv
# was generated: the per-paper tables in imglist are concatenated.
# Only the first rows are shown here.
import pandas as pd
pd.concat(imglist).head()

Unnamed: 0,doi,smiles,standardInChIKey,ExactMolWt
0,pdfs/v37n2a15,*Oc1cc(ccc1O*)[C@]1(C)C(*)C(*)C(=O)c2c1cc(O)c(...,,296.068473
1,pdfs/v37n2a15,*Oc1cc(ccc1O*)[C]1[C]([C]([C]([C]2[C]1[C][C]([...,,307.060648
2,pdfs/v37n2a15,COc1cc(ccc1OC)[C@H]1C(C)[C@@H](C)C(=O)c2c1cc(O...,UCHGPGXURWMCBZ-IGNYMDPMSA-N,370.178024
3,pdfs/v37n2a15,C[C@@H]1C(OC([C@@H]1C)c1ccc2c(c1)OCO2)c1ccc2c(...,QFUXQRHAJWXPGP-WFSAKUOBSA-N,340.131074
4,pdfs/v37n2a15,COc1cc(ccc1OC)C1OC([C@H](C1C)C)c1ccc2c(c1)OCO2,HSMDOSKNXLVXIP-IIBNLQKXSA-N,356.162374


In [19]:
# NOTE(review): the wildcard import hides where names come from.
# `create_report` is used below, and `format_source_table` (called later
# without any other import) presumably comes from here too. Consider
# importing the needed names explicitly once that is confirmed.
from npmine.create_report import *

# Print the docstring of create_report
help(create_report)

Help on function create_report in module npmine.create_report:

create_report(report_print, dois, out_file='npmine_report.html', useSVG=False)
    Creates an html report from NPMINE's results
    Parameters
    ----------
    report_print: pd.DataFrame
        DataFrame containing columns 'doi', 'pubchem', 'ExactMolWt', 'smiles','source'.
    dois: str or list
        Directory of doi link files or link list.
    out_file: str
        HTML output file name.
    useSVG: bool
        If svg format should be used. Default is png.
    Returns
        Report html file.
    -------



In [20]:
path = 'pdfs'

# Paths of the PDFs in `path`, selected by extension (a substring test
# would also match names such as 'paper.pdf.part').
# NOTE: this rebinds `doi`, which earlier held the content of
# doi_quim.json. From here on it is a list of PDF paths.
doi = [os.path.join(path, x) for x in os.listdir(path) if x.endswith('.pdf')]
doi

['pdfs/v37n2a15.pdf']

In [21]:
# Build the report table from the text-derived tables (dflist) and the
# image-derived tables (imglist).
report_print = format_source_table(dflist, imglist)

# Number of rows per value of the 'source' column
report_print['source'].value_counts()

oscar    40
osra      7
Name: source, dtype: int64

In [22]:
# Write the HTML report. `doi` is the list of PDF paths built from 'pdfs'.
create_report(report_print, doi, out_file='npmine_report.html')

In [23]:
# Bundle the HTML report together with the figs and pdfs directories
!zip -r npmine_report.zip npmine_report.html figs pdfs

updating: npmine_report.html (deflated 88%)
updating: figs/ (stored 0%)
updating: figs/71583.png (deflated 60%)
updating: figs/280.png (deflated 70%)
updating: figs/445639.png (deflated 55%)
updating: figs/12366.png (deflated 50%)
updating: figs/3034819.png (deflated 75%)
updating: figs/12409.png (deflated 74%)
updating: figs/0.png (deflated 23%)
updating: figs/8029.png (deflated 46%)
updating: figs/8058.png (deflated 57%)
updating: figs/5363269.png (deflated 49%)
updating: figs/87578811.png (deflated 35%)
updating: figs/3931.png (deflated 55%)
updating: figs/123138.png (deflated 76%)
updating: figs/134828006.png (deflated 19%)
updating: figs/985.png (deflated 54%)
updating: figs/7005.png (deflated 26%)
updating: figs/783.png (deflated 81%)
updating: figs/12407.png (deflated 73%)
updating: figs/962.png (deflated 77%)
updating: figs/12535.png (deflated 73%)
updating: figs/7338.png (deflated 28%)
updating: figs/180.png (deflated 61%)
updating: figs/12410.png (defl