In [1]:
#! python3 - Script2.py - Retrieve SMILES codes from NCI/CACTUS database

'''This script takes a list of chemical substances as input and sends their
CAS numbers to the API of the nci.nih.gov chemical database (CACTUS) to retrieve
the SMILES code for each CAS number/substance, as far as it is available in the database.'''

from os import chdir
import pandas as pd
from pandas import ExcelWriter
from urllib.parse import quote
from urllib.request import urlopen

# Define working directory
#chdir('C:/Path/to/your/working/directory')

# Connection to API function
def CIRconvert(ID, timeout=30):
    """Resolve a chemical identifier to a SMILES string via the NCI CACTUS API.

    Parameters
    ----------
    ID : str
        Identifier to resolve (for example a name or a CAS number). It is
        percent-encoded with ``quote`` before being inserted into the URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    str
        The response body decoded as UTF-8, or the sentinel string
        'Did not work' when anything fails (wrong identifier type, HTTP
        error, network failure, timeout, undecodable response).
    """
    try:
        # quote() is required so that reserved characters in the identifier
        # cannot break the URL path.
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ID) + '/smiles'
        print('resolving', url)
        # Bounded wait plus context manager: a stalled server can no longer
        # block a worker indefinitely and the connection is always closed.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Deliberate best-effort: later cells filter on this sentinel value.
        return 'Did not work'

# Quick check: resolve a common name through the CACTUS endpoint.
CIRconvert('aspirin')

resolving http://cactus.nci.nih.gov/chemical/structure/aspirin/smiles


'CC(=O)Oc1ccccc1C(O)=O'

In [3]:
# Connection to API function
# NOTE(review): this re-defines CIRconvert from the first cell; the later
# definition is the one in effect, so consider keeping only one copy.
def CIRconvert(ID, timeout=30):
    """Resolve a chemical identifier to a SMILES string via the NCI CACTUS API.

    Parameters
    ----------
    ID : str
        Identifier to resolve (for example a name or a CAS number). It is
        percent-encoded with ``quote`` before being inserted into the URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    str
        The response body decoded as UTF-8, or the sentinel string
        'Did not work' when anything fails (wrong identifier type, HTTP
        error, network failure, timeout, undecodable response).
    """
    try:
        # quote() is required so that reserved characters in the identifier
        # cannot break the URL path.
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ID) + '/smiles'
        print('resolving', url)
        # Bounded wait plus context manager: a stalled server can no longer
        # block a worker indefinitely and the connection is always closed.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Deliberate best-effort: later cells filter on this sentinel value.
        return 'Did not work'

# Quick check: resolve a molecular formula instead of a name.
CIRconvert('ZnCl2')


resolving http://cactus.nci.nih.gov/chemical/structure/ZnCl2/smiles


'[Cl-].[Cl-].[Zn++]'

In [5]:
import requests
import time
from selenium import webdriver
import undetected_chromedriver as uc
from time import sleep
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
import json
import numpy as np
import tqdm.auto as tqdm
import contextlib
from bs4 import BeautifulSoup
 

#from feedpage import LinkScrapperPersonal
#l = LinkScrapperPersonal()

In [4]:
from joblib import Parallel, delayed

In [11]:
def GiveR(url, driver, wait_seconds=5):
    """Load ``url`` in ``driver`` and return the page source after a pause.

    Parameters
    ----------
    url : str
        Address handed to ``driver.get``.
    driver
        Object exposing ``get(url)`` and a ``page_source`` attribute
        (a Selenium WebDriver in this notebook).
    wait_seconds : float, optional
        Pause between navigation and reading ``page_source``. Defaults to 5,
        the delay that used to be hard-coded. Values <= 0 skip the pause.

    Returns
    -------
    The value of ``driver.page_source`` read after the pause.
    """
    driver.get(url)

    # Presumably this gives client-side rendering time to finish before the
    # source is captured — TODO confirm the delay actually needed.
    if wait_seconds > 0:
        sleep(wait_seconds)

    return driver.page_source

In [12]:
# Shared Chrome options: run the browser without a visible window.
options = Options()
options.add_argument('--headless')


def dataRetriver(i):
    """Scrape one page of HBCP table 06_25 and return its 10-cell rows.

    Parameters
    ----------
    i : int
        Page number; it is zero-padded to at least four digits to build the
        document URL.

    Returns
    -------
    list[list[str]]
        One inner list per ``<tr class="ui-widget-content">`` element that
        has exactly 10 children, holding the ``.text`` of each child.
        Rows with any other number of children are skipped.
    """
    # zfill pads without truncating, so page numbers >= 10000 are no longer
    # silently mapped onto their last four digits.
    page_id = str(i).zfill(4)
    url = f"https://hbcp.chemnetbase.com/faces/documents/06_25/06_25_{page_id}.xhtml"

    driver = webdriver.Chrome(options=options)
    try:
        html = GiveR(url, driver=driver)
    finally:
        # quit() ends the whole driver session even when loading fails, so
        # parallel runs do not accumulate orphaned browser processes.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    data_table = []
    for row in soup.find_all('tr', class_="ui-widget-content"):
        # Append only inside the length check: previously a non-matching row
        # re-appended the preceding row (or raised if it came first).
        if len(row) == 10:
            data_table.append([cell.text for cell in row])
    return data_table

In [16]:
# Trial run on a single page (i=2).
# NOTE(review): `d` is not referenced again in the visible cells.
d = dataRetriver(2)

In [18]:
# Scrape pages 1..47 with 16 parallel workers; each task is one
# dataRetriver(page) call, yielding one list of rows per page.
results = Parallel(n_jobs=16)(map(delayed(dataRetriver), range(1, 48)))

In [21]:
import pandas as pd

# Flatten the per-page row lists held in `results` into one list of rows.
all_data = [row for page_rows in results for row in page_rows]

# Build the table and keep only the first occurrence of each identical row.
df = pd.DataFrame(
    np.array(all_data),
    columns=[
        "Name", "Synonym", "Mol.Form", "Formula", "CAS",
        "Mol.Wt", "t_bp", "Delta_G(t)", "Delta_G(25)", "Ref",
    ],
).drop_duplicates(keep='first')

In [25]:
# Display the table built above.
df

Unnamed: 0,Name,Synonym,Mol.Form,Formula,CAS,Mol.Wt,t_bp,Delta_G(t),Delta_G(25),Ref
0,Acetaldehyde,Ethanal,C2H4O,CH3CHO,75-07-0,44.052,20.8,25.76,25.47,1
1,Acetic acid,Ethanoic acid,C2H4O2,CH3COOH,64-19-7,60.052,117.9,23.70,23.36,1
2,Acetic anhydride,Acetyl acetate,C4H6O3,C4H6O3,108-24-7,102.089,139.5,38.2,,
3,Acetone,2-Propanone,C3H6O,(CH3)2CO,67-64-1,58.079,56.08,29.10,30.99,1
4,Acetonitrile,Methyl cyanide,C2H3N,CH3CN,75-05-8,41.052,81.6,29.75,32.94,1
...,...,...,...,...,...,...,...,...,...,...
1752,"3,4-Xylenol","3,4-Dimethylphenol",C8H10O,C8H10O,95-65-8,122.164,227.31,,85.03,1
1753,"3,5-Xylenol","3,5-Dimethylphenol",C8H10O,C8H10O,108-68-9,122.164,221.71,,82.01,1
1754,Zinc bromide,,Br2Zn,ZnBr2,7699-45-8,225.217,≈670,118,,
1755,Zinc chloride,,Cl2Zn,ZnCl2,7646-85-7,136.315,732,126,,


In [26]:
# Look up every CAS number with 16 parallel workers.
# NOTE(review): this reuses the name `results`, which earlier held the scraped rows.
results = Parallel(n_jobs=16)(map(delayed(CIRconvert), df['CAS']))

resolving http://cactus.nci.nih.gov/chemical/structure/75-07-0/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/64-19-7/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/108-24-7/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/67-64-1/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/75-05-8/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/98-86-2/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/107-02-8/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/107-13-1/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/591-87-7/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/107-18-6/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/7429-90-5/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/16962-07-5/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/7727-15-3/smiles
resolving http://cactus.nci.nih.gov/chemical/structure/7784-23-8/smiles
resolv

In [32]:
# Positions in `results` where the lookup returned the failure sentinel.
missing_index = np.nonzero(np.array(results) == 'Did not work')[0]

In [37]:
# Names of the substances whose lookup failed (selected by position).
df['Name'].iloc[missing_index].values

array(['Bis(ethoxymethyl) ether', 'Bromosilane', 'sec-Butyl methyl ether',
       '2-Chloropentane, (+)', 'Chromium(VI) dichloride dioxide',
       'Dichlorodifluorosilane', 'Fluorosilane', '6-Heptadecanol',
       '7-Heptadecanol', '3-Heptanol, (S)-', 'Iodine pentafluoride',
       '2-Methyl-1-butanethiol, (+)', 'cis-2-Methylcyclohexanol',
       '3-Methylhexane', '5-Methyl-3-hexanol, (±)-',
       '3-Methyl-2-pentanone, (±)-', 'Osmium(V) fluoride',
       'Pentaborane(11)', '1,15-Pentadecanediol', 'Perfluorononane',
       'Rhenium(VII) dioxytrifluoride', 'Rhenium(V) fluoride',
       'Rhenium(VI) oxytetrafluoride', 'Tetraborane(10)',
       'Thionitrosyl fluoride (NSF)', '1,13-Tridecanediol',
       '2,4,7-Trimethyloctane', 'Tungsten(VI) fluoride',
       'Tungsten(VI) oxytetrachloride'], dtype=object)

In [38]:
# Store the lookup output as a new column; `results` was produced by
# iterating over df['CAS'], so it has one entry per row.
# NOTE(review): this mutates `df` in place — rebuilding `df` in an earlier
# cell discards the column.
df['SoluteSMILES'] = results

In [41]:
# Keep only the rows whose lookup did not return the failure sentinel.
somedf = df.loc[df['SoluteSMILES'] != 'Did not work']

# Select the export columns, renumber rows from 0 and rename the two
# Delta_G columns to the unit-annotated headers used in the spreadsheet.
corrected = (
    somedf[['SoluteSMILES', 't_bp', 'Formula', 'Delta_G(t)', 'Delta_G(25)']]
    .reset_index(drop=True)
    .rename(columns={
        'Delta_G(t)': 'Delta_G(t)(kJ/mol)',
        'Delta_G(25)': 'Delta_G(25)(kJ/mol)',
    })
)
corrected.to_excel("Enthalpy_data.xlsx")

In [54]:
# NOTE(review): this echoes the entire scraped list; a slice such as
# all_data[:5] would keep the notebook output short.
all_data

[['Acetaldehyde',
  'Ethanal',
  'C2H4O',
  'CH3CHO',
  '75-07-0',
  '44.052',
  '20.8',
  '25.76',
  '25.47',
  '1'],
 ['Acetic acid',
  'Ethanoic acid',
  'C2H4O2',
  'CH3COOH',
  '64-19-7',
  '60.052',
  '117.9',
  '23.70',
  '23.36',
  '1'],
 ['Acetic anhydride',
  'Acetyl acetate',
  'C4H6O3',
  'C4H6O3',
  '108-24-7',
  '102.089',
  '139.5',
  '38.2',
  '',
  ''],
 ['Acetone',
  '2-Propanone',
  'C3H6O',
  '(CH3)2CO',
  '67-64-1',
  '58.079',
  '56.08',
  '29.10',
  '30.99',
  '1'],
 ['Acetonitrile',
  'Methyl cyanide',
  'C2H3N',
  'CH3CN',
  '75-05-8',
  '41.052',
  '81.6',
  '29.75',
  '32.94',
  '1'],
 ['Acetophenone',
  'Methyl phenyl ketone',
  'C8H8O',
  'C6H5C=OCH3',
  '98-86-2',
  '120.149',
  '202.1',
  '43.98',
  '55.40',
  '8'],
 ['Acrolein',
  '2-Propenal',
  'C3H4O',
  'CH2=CHCHO',
  '107-02-8',
  '56.063',
  '52.3',
  '28.3',
  '',
  ''],
 ['Acrylonitrile',
  'Propenenitrile',
  'C3H3N',
  'CH2=CHCN',
  '107-13-1',
  '53.063',
  '77.2',
  '32.6',
  '',
  ''],
 ['Al