In [1]:
## Import packages

from cmath import e
import pandas as pd
import csv

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [3]:
# Read CSV file into dataframe

# The first line of cosing_clean.csv supplies the column names.
df = pd.read_csv("cosing_clean.csv", header=0)

# Show how many rows were loaded.
print(df.shape[0])

3617


In [4]:
# Set file header

# Column titles of the output CSV, one per line, in the order rows are written.
header = [
    'id',
    'Chemical Name',
    'LogP (Partition coefficient)',
    'LogS (water solubility of the ingredient)',
    'Molecular weight',
    'Density',
    'Chemical formula',
    'Structure',
    'Synonyms',
    'Functionalities',
]

In [6]:
# Setup Selenium web driver

def setupDriver(executable_path=r'C:\webdriver\chromedriver.exe',
                url="https://www.chembk.com/en"):
    """Start a Chrome WebDriver session and load the start page.

    Args:
        executable_path: Location of the chromedriver binary. Written as a raw
            string so the backslashes of the Windows path are not parsed as
            (invalid) escape sequences such as ``\\w`` and ``\\c``.
        url: Page opened right after the browser starts.

    Returns:
        The ``webdriver.Chrome`` instance, positioned on ``url``.

    Raises:
        Whatever ``webdriver.Chrome`` raises when the session cannot be
        created, e.g. when the chromedriver release does not support the
        installed Chrome version.
    """
    # NOTE(review): newer Selenium releases may prefer a Service object over
    # the `executable_path` keyword -- confirm against the installed version.
    driver = webdriver.Chrome(executable_path=executable_path)
    driver.get(url)
    return driver

In [7]:
# Type the chemical name into the search box and submit it

def search(index, driver, chemical_name=None):
    """Type a chemical name into the page's search box and press ENTER.

    Args:
        index: Position of the current row. Row 0 is searched from the freshly
            loaded home page; for every other row the search box is cleared
            first so the previous keyword is not kept.
        driver: Selenium WebDriver whose current page contains an element with
            id ``chem`` (the search box).
        chemical_name: Keyword to search for. When omitted, the module-level
            ``chemical_name`` variable is used, which keeps the existing
            two-argument calls working.

    Raises:
        NameError: If no keyword is passed and no module-level
            ``chemical_name`` exists.
    """
    if chemical_name is None:
        # Backward-compatible fallback for callers that rely on the global.
        try:
            chemical_name = globals()["chemical_name"]
        except KeyError:
            raise NameError("name 'chemical_name' is not defined") from None

    # Locate the box once; the same element reference stays usable after clear().
    searchBox = driver.find_element(By.ID, "chem")
    if index != 0:
        searchBox.clear()

    searchBox.send_keys(chemical_name)
    searchBox.send_keys(Keys.ENTER)

In [8]:
# Mark properties as empty when no results found

def writeEmpty(writer, index, chemical_name):
    """Write a placeholder row: the id and name followed by eight blank property columns."""
    blank_properties = [""] * 8
    writer.writerow([index, chemical_name, *blank_properties])

In [9]:
# Extract information from Supplier Page

def extractSupplierPage(tds, driver, writer):
    """Scrape formula and molecular weight from a supplier page and write the row.

    The page's table rows are scanned for the labels "Chemical Formula" and
    "Molecular Weight"; scanning stops once the molecular weight is read.
    One row is then written to ``writer`` and the browser is sent back to the
    home page so the next search starts from there.

    Args:
        tds: Unused; kept only so existing calls keep working.
        driver: Selenium WebDriver currently showing the supplier page.
        writer: Object with a ``writerow`` method (e.g. ``csv.writer``).

    Note:
        ``index``, ``chemical_name``, ``water_solubility``, ``density`` and
        ``synonyms`` are read from module-level variables set by the caller.
    """
    # Start from blanks: a page without the expected labels must still yield a
    # row instead of failing on an unassigned local variable.
    formula = ""
    weight = ""

    for row in driver.find_elements(By.TAG_NAME, 'tr'):
        cells = row.find_elements(By.TAG_NAME, 'td')

        # Rows without a label cell and a value cell carry no property.
        if len(cells) < 2:
            continue

        label = cells[0].text

        # Get formula
        if label == "Chemical Formula":
            formula = cells[1].text

        # Get molecular weight
        elif label == "Molecular Weight":
            weight = cells[1].text
            break

    writer.writerow([index, chemical_name, "", water_solubility, weight, density, formula, "", synonyms, ""])

    driver.get("https://www.chembk.com/en")

In [11]:
# Main

with open('chembk_1020_all.csv', 'w', encoding = 'UTF8', newline="") as f:
    # Look up every chemical of `df` on the site and write exactly one CSV row
    # per chemical (blank properties when nothing could be scraped).
    driver = None   # lets the `finally` clause cope with a failed setupDriver()
    try:
        writer = csv.writer(f)
        writer.writerow(header)

        # Setup Selenium web driver
        driver = setupDriver()

        count_e = 0
        for index, row in df.iterrows():
            # Reset all per-row state before the try block so the error
            # handler never sees values left over from the previous row.
            # These stay module-level names on purpose: search() and
            # extractSupplierPage() read some of them as globals.
            found = False
            nextPage = None
            chemical_name = ""
            matched_name = ""
            cas = ""
            synonyms = ""
            formula = ""
            weight = ""
            density = ""
            water_solubility = ""

            try:
                # Use chemical name as search keyword
                # Remove slash in the string which cannot be put into search box
                chemical_name = row['Chemical Name'].title().replace("/", " ")

                # Search
                search(index, driver)

                # -- Choose which page to move on --

                trs = driver.find_elements(By.TAG_NAME, 'tr')
                tds = driver.find_elements(By.TAG_NAME, 'td')

                # If no table -> No Results found
                if len(trs) == 0 or len(tds) == 0:
                    writeEmpty(writer, index, chemical_name)
                    print(chemical_name + " : no results found.")
                    continue

                # If exact name match found
                # NOTE(review): a name containing a double quote produces an
                # invalid XPath here; the row then ends up in the error handler.
                exact_links = driver.find_elements(By.XPATH, '//a[@href="/en/chem/'+chemical_name+'"]')
                if len(exact_links) == 1:
                    matched_name = chemical_name
                    nextPage = exact_links[0]
                    found = True
                    print(chemical_name + " : exact name found.")

                # If no exact match, having multiple results:
                #    1. See if the search name matches one of the Synonyms
                if not found:
                    tbodys = driver.find_elements(By.TAG_NAME, 'tbody')
                    if tbodys:
                        for tr in tbodys[0].find_elements(By.TAG_NAME, 'tr'):
                            cols = tr.find_elements(By.TAG_NAME, 'td')
                            if len(cols) >= 5:   # make sure it is data, not some random row with tds
                                if chemical_name in cols[3].text:
                                    nextPage = cols[2].find_element(By.TAG_NAME, 'a')
                                    matched_name = cols[2].text
                                    found = True
                                    print(chemical_name + " : have matched synonym.")
                                    break

                #    2. In the rest options, pick the first one with Molecular Formula shown
                if not found:
                    for tr in driver.find_elements(By.TAG_NAME, 'tr'):
                        cols = tr.find_elements(By.TAG_NAME, 'td')
                        if len(cols) >= 5:   # make sure it is data, not some random row with tds
                            if cols[4].text != "":
                                nextPage = cols[2].find_element(By.TAG_NAME, 'a')
                                matched_name = cols[2].text
                                found = True
                                print(chemical_name + " : pick the first one with molecular.")
                                break

                #    3. If no options showing Molecular Formula, pick the first one containing the chemical name as its substring of the results
                if not found:
                    options = driver.find_elements(By.XPATH, '//a[contains(@href, "%s")]' % chemical_name)
                    if len(options) > 1:
                        matched_name = options[1].text
                        nextPage = options[1]   # Skip option 0
                        found = True
                        print(chemical_name + " : pick the first one containing the chemical name.")
                        # No `break` here: this code is not inside an inner
                        # loop, so a break would end the whole scraping run.

                #    4. Pick the first one if no conditions matched.
                if not found:
                    for tbody in driver.find_elements(By.TAG_NAME, 'tbody'):
                        trs = tbody.find_elements(By.TAG_NAME, 'tr')
                        if len(trs) != 0:
                            tds = trs[0].find_elements(By.TAG_NAME, 'td')
                            nextPage = tds[2].find_element(By.TAG_NAME, 'a')
                            matched_name = tds[2].text
                            found = True
                            print(chemical_name + " : pick the first one.")
                            break

                # None of the strategies produced a link: record a blank row
                # rather than clicking a link left over from an earlier row.
                if nextPage is None:
                    writeEmpty(writer, index, chemical_name)
                    print(chemical_name + " : no results found.")
                    continue

                # If chemical found
                nextPage.click()

                # If the page found is a supplier page
                h4s = driver.find_elements(By.TAG_NAME, 'h4')
                if len(h4s) > 0 and h4s[0].text == "Request for quotation":
                    extractSupplierPage(tds, driver, writer)
                    continue

                # Extract info from header paragraph
                # (third and fourth <h4> are expected to read "<label>: <value>")
                formula = h4s[2].text.split(":")[1].strip()
                weight = h4s[3].text.split(":")[1].strip()

                # Extract info from table
                for table_row in driver.find_elements(By.TAG_NAME, 'tr'):
                    cells = table_row.find_elements(By.TAG_NAME, 'td')

                    # Rows without a label cell and a value cell (e.g. header
                    # rows) carry no property and must not abort the chemical.
                    if len(cells) < 2:
                        continue

                    label = cells[0].text

                    # Grab Synonyms
                    if label == "Synonyms":
                        for syn in cells[1].find_elements(By.TAG_NAME, 'a'):
                            if len(syn.text.split(",")) > 1:
                                synonyms += syn.text + ","
                            else:
                                synonyms += syn.text

                    # Grab CAS (not written to the CSV; a missing link must
                    # therefore not discard the rest of the row)
                    if label == "CAS":
                        cas_links = cells[1].find_elements(By.TAG_NAME, 'a')
                        cas = cas_links[0].text if cas_links else cells[1].text

                    # Grab Density
                    if label == "Density":
                        density = cells[1].text

                    # Grab Water Solubility
                    if label == "Water Solubility":
                        water_solubility = cells[1].text
                        break   # exit earlier if all the properties have found

                writer.writerow([index, chemical_name, "", water_solubility, weight, density, formula, "", synonyms, ""])
            except Exception as err:
                # Keep going with the next chemical; writeEmpty keeps the row
                # at the same ten columns as the header.
                writeEmpty(writer, index, chemical_name)
                count_e += 1
                print("error" + str(count_e) + ": " + chemical_name + " - " + str(err))
    except Exception as err:
        print(err)
    finally:
        # Always release the browser, also after a failure.
        if driver is not None:
            driver.quit()

  driver = webdriver.Chrome(executable_path='C:\webdriver\chromedriver.exe')


Message: session not created: This version of ChromeDriver only supports Chrome version 105
Current browser version is 107.0.5304.107 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe
Stacktrace:
Backtrace:
	Ordinal0 [0x0064DF13+2219795]
	Ordinal0 [0x005E2841+1779777]
	Ordinal0 [0x004F423D+803389]
	Ordinal0 [0x005164AC+943276]
	Ordinal0 [0x005119F0+924144]
	Ordinal0 [0x0050F179+913785]
	Ordinal0 [0x005436B9+1128121]
	Ordinal0 [0x0054331A+1127194]
	Ordinal0 [0x0053E616+1107478]
	Ordinal0 [0x00517F89+950153]
	Ordinal0 [0x00518F56+954198]
	GetHandleVerifier [0x00942CB2+3040210]
	GetHandleVerifier [0x00932BB4+2974420]
	GetHandleVerifier [0x006E6A0A+565546]
	GetHandleVerifier [0x006E5680+560544]
	Ordinal0 [0x005E9A5C+1808988]
	Ordinal0 [0x005EE3A8+1827752]
	Ordinal0 [0x005EE495+1827989]
	Ordinal0 [0x005F80A4+1867940]
	BaseThreadInitThunk [0x755C6939+25]
	RtlGetFullPathName_UEx [0x77898FD2+1218]
	RtlGetFullPathName_UEx [0x77898F9D+1165]

