In [1]:
import requests
import json
import pandas as pd

## Cas API 

In [67]:
# Base URL da API
BASE_URL = "https://commonchemistry.cas.org/api"

# Parâmetros de busca
cas_rn = "151-21-3"  # Altere para o termo desejado

try:
    # Requisição GET para o endpoint /search
    response = requests.get(f"{BASE_URL}/detail?cas_rn={cas_rn}")
    response.raise_for_status()  # Verifica se houve erro na requisição

    # Resultados da busca
    data = response.json()
    df = pd.DataFrame([data]) 
    print('Done!')

except requests.exceptions.RequestException as e:
    print(f"Erro ao acessar a API: {e}")


df.head()

Done!


Unnamed: 0,uri,rn,name,images,inchi,inchiKey,smile,canonicalSmile,molecularFormula,molecularMass,experimentalProperties,propertyCitations,synonyms,replacedRns,hasMolfile
0,substance/pt/151213,151-21-3,Sodium dodecyl sulfate,"[<svg width=""374"" viewBox=""0 0 374 82"" style=""...",InChI=1S/C12H26O4S.Na/c1-2-3-4-5-6-7-8-9-10-11...,InChIKey=FMWAXKQEIXRUTI-UHFFFAOYSA-N,C(CCCCCCCCCC)COS(=O)(=O)O.[Na],[Na].O=S(=O)(O)OCCCCCCCCCCCC,C<sub>12</sub>H<sub>26</sub>O<sub>4</sub>S.Na,,"[{'name': 'Melting Point', 'property': '204-20...","[{'docUri': '', 'sourceNumber': 1, 'source': '...",[Sulfuric acid monododecyl ester sodium salt (...,"[1334-67-4, 1335-72-4, 8012-56-4, 8048-56-4, 1...",True


## Pubchem API

In [76]:
# Base URL da API
BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"

# Parâmetros de busca
cas_rn = "151-21-3"  # Altere para o termo desejado

try:
    # Requisição GET para o endpoint /search
    response = requests.get(f"{BASE_URL}/compound/xref/RN/{cas_rn}/description/JSON")
    response.raise_for_status()  # Verifica se houve erro na requisição

    # Resultados da busca
    data = response.json()
    info_list = data["InformationList"]["Information"]
    pubchem_df = pd.DataFrame(results) 
    print('Done!')

except requests.exceptions.RequestException as e:
    print(f"Erro ao acessar a API: {e}")

Done!


In [85]:
df = pd.DataFrame(info_list)
df

Unnamed: 0,CID,Title,Description,DescriptionSourceName,DescriptionURL
0,8778,Lauryl sulfate,,,
1,8778,,Dodecyl hydrogen sulfate is an alkyl sulfate. ...,ChEBI,https://www.ebi.ac.uk/chebi/searchId.do?chebiI...
2,3423265,Sodium dodecyl sulfate,,,
3,3423265,,Sodium dodecyl sulfate is an organic sodium sa...,ChEBI,https://www.ebi.ac.uk/chebi/searchId.do?chebiI...
4,4329331,Irium,,,
5,4329331,,Dodecyl sulfate is an organosulfate oxoanion. ...,ChEBI,https://www.ebi.ac.uk/chebi/searchId.do?chebiI...
6,6330930,Dodecyl hydrogen sulfate; sodium,,,
7,21909502,Dreft,,,


## Pubchem with Selenium

In [62]:
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

#### SEARCH

In [63]:
# Inicializa o WebDriver
chromedriver_autoinstaller.install()
driver = webdriver.Chrome()

In [64]:
url = 'https://pubchem.ncbi.nlm.nih.gov/'
driver.get(url)

In [65]:
time.sleep(3)
search = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/main/div[1]/div/div[2]/div/div[2]/form/div/div[1]/input')
search.send_keys('cas-151-21-3')

search.send_keys(Keys.RETURN)

In [66]:
# Aguardar até que o elemento esteja visível e clicável
try:
    # Substitua o tempo de espera conforme necessário (em segundos)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/main/div[2]/div[1]/div/div[2]/div/div[1]/div[2]/div[1]/a/span/span'))
    )
    # Clica no elemento
    element.click()
    print("Elemento clicado com sucesso!")
except Exception as e:
    print(f"Erro ao localizar ou clicar no elemento: {e}")#send.click()

Elemento clicado com sucesso!


### Select field to extractor 

In [67]:
# Extração de informações específicas
try:
    cid = driver.find_element(By.XPATH, '//div[text()="PubChem CID"]/following-sibling::div').text
    molecular_formula = driver.find_element(By.XPATH, '//div[text()="Molecular Formula"]/following-sibling::div').text
    synonyms = driver.find_element(By.XPATH, '//div[text()="Synonyms"]/following-sibling::div').text
    molecular_weight = driver.find_element(By.XPATH, '//div[text()="Molecular Weight"]/following-sibling::div').text
    parent_compounds = driver.find_element(By.XPATH, '//div[text()="Parent Compound"]/following-sibling::div').text
    component_compounds = driver.find_element(By.XPATH, '//div[text()="Component Compounds"]/following-sibling::div').text
    dates = driver.find_element(By.XPATH, '//div[text()="Dates"]/following-sibling::div').text
    description = driver.find_element(By.XPATH, '//div[text()="Description"]/following-sibling::div').text
    expanded_content = driver.find_element(By.XPATH, '//*[@id="Title-and-Summary"]/div/div/div/div[10]/div[2]/div/button').text

# Feche o navegador
driver.quit()

CID: 3423265
Fórmula Molecular: NaSO4C12H25
C12H25O4S.Na
C12H25NaO4S
Sinônimos: Sodium dodecyl sulfate
151-21-3
SODIUM LAURYL SULFATE
Sodium lauryl sulphate
Sodium dodecylsulfate
View More...
Peso Molecular: 288.38 g/mol
Computed by PubChem 2.2 (PubChem release 2021.10.14)
Componentes parentes: CID 5360545 (Sodium)
CID 8778 (Lauryl sulfate)
Datas: Create:
2005-09-08
Modify:
2025-01-18
Descrição: Dodecyl sulfate, [sodium salt] appears as white to pale yellow paste or liquid with a mild odor. Sinks and mixes with water. (USCG, 1999)
U.S. Coast Guard. 1999. Chemical Hazard Response Information System (CHRIS) - Hazardous Chemical Data. Commandant Instruction 16465.12C. Washington, D.C.: U.S. Government Printing Office.
CAMEO Chemicals
Sodium dodecyl sulfate is an organic sodium salt that is the sodium salt of dodecyl hydrogen sulfate. It has a role as a detergent and a protein denaturant. It contains a dodecyl sulfate.
ChEBI
Sodium Lauryl Sulfate (SLS) is an anionic surfactant naturally de

### Passando pra DF

In [75]:
data = {
    "CID": [cid],
    "Fórmula Molecular": [molecular_formula],
    "Sinônimos": [synonyms],
    "Peso Molecular": [molecular_weight],
    "Componentes parentes": [component_compounds],
    "Datas": [dates],
    "Descrição": [description]
}

df = df.applymap(lambda x: str(x).replace('\\n', ' ').replace('\n', ' ') if isinstance(x, str) else x)

# Criação de um DataFrame
df = pd.DataFrame(data)

In [76]:
df

Unnamed: 0,CID,Fórmula Molecular,Sinônimos,Peso Molecular,Componentes parentes,Datas,Descrição
0,3423265,NaSO4C12H25\nC12H25O4S.Na\nC12H25NaO4S,Sodium dodecyl sulfate\n151-21-3\nSODIUM LAURY...,288.38 g/mol\nComputed by PubChem 2.2 (PubChem...,CID 5360545 (Sodium)\nCID 8778 (Lauryl sulfate),Create:\n2005-09-08\nModify:\n2025-01-18,"Dodecyl sulfate, [sodium salt] appears as whit..."


## T3db

In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd 

In [2]:
def fetch_toxin_xml(toxin_id):
    """
    Fetch XML data for a given toxin ID from the T3DB database.
    """
    
    base_url = f"http://www.t3db.ca/toxins/{toxin_id}.xml"
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Ensure the request was successful
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching toxin XML for ID {toxin_id}: {e}")
        return None

def parse_xml_to_table(xml_content):
    """
    Parse XML content into a pandas DataFrame with one row of data.
    """
    try:
        root = ET.fromstring(xml_content)
        data = {}

        # Extract key-value pairs from the XML structure
        for element in root:
            data[element.tag] = element.text

        # Convert the dictionary to a DataFrame
        return pd.DataFrame([data])
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on parse failure


In [5]:
data = pd.read_csv('../data/toxins.csv')
data = data.reset_index()
data.head(1)

data = data[['level_0', 'level_7']]

data.rename(columns={'level_0': 'toxin_id', 'level_7': 'cas_number'}, inplace=True)

In [100]:
data

Unnamed: 0,toxin_id,cas_number
0,T3D0001,7440-38-2
1,T3D0002,7439-92-1
2,T3D0003,7439-97-6
3,T3D0004,75-01-4
4,T3D0006,71-43-2
...,...,...
3673,T3D5000,7664-93-9
3674,T3D5001,692-29-5
3675,T3D5002,89762-39-0
3676,T3D5003,10275-07-7


In [112]:
def t3db_extractor(cas_numbers:str, delay=1):
    """
    Process a list of toxin IDs and fetch their data with a delay between requests.
    
    Args:
        cas_number (list): List of toxin IDs to process.
        delay (int or float): Time in seconds to wait between requests.
    
    Returns:
        pd.DataFrame: A DataFrame with the combined toxin data.
    """
    all_toxins = []  # Lista para armazenar os resultados

    # Carrega os dados do CSV
    toxins_data = pd.read_csv('../data/toxins_id.csv')

    # Converte 'cas_number' para string (para garantir que a comparação funcione corretamente)
    toxins_data['cas_number'] = toxins_data['cas_number'].astype(str)

    # Itera sobre a lista de cas_numbers
    for cas_number in cas_numbers:
        toxin_id_ = toxins_data.query(f"cas_number == '{cas_number}'")[['toxin_id']]
        print(toxin_id_.loc[0, 'toxin_id']) 
        xml_content = fetch_toxin_xml(toxin_id_.loc[0, 'toxin_id'])
        if xml_content:  # Check if valid XML was fetched
            df_toxin = parse_xml_to_table(xml_content)
            all_toxins.append(df_toxin)
        
        # Delay between requests
        #time.sleep(delay)

    # Concatenate all DataFrames, if available
    if all_toxins:
        return pd.concat(all_toxins, ignore_index=True)
    else:
        return pd.DataFrame()  # Return an em

In [115]:
teste = t3db_extractor(['7440-38-2'])

T3D0001


In [111]:
xml_content = fetch_toxin_xml('T3D0003')

## ECHA

In [19]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import chromedriver_autoinstaller

# Instalar e configurar o driver automaticamente
chromedriver_autoinstaller.install()
driver = webdriver.Chrome()


# Acessar a página
driver.get("https://echa.europa.eu/pt/information-on-chemicals")

# Configurar WebDriverWait
wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)

# Aguarde o botão de aceitar cookies estar clicável
cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="cookie-consent-banner"]/div/div/div[2]/a[1]')))
actions.move_to_element(cookie_button).click().perform()

time.sleep(2)
# Localizar o checkbox pelo id
checkbox = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="_disssimplesearchhomepage_WAR_disssearchportlet_fm"]/div[2]/label/span')))
actions.move_to_element(checkbox).click().perform()

time.sleep(2)

time.sleep(3)
search = driver.find_element(By.XPATH, '//*[@id="autocompleteKeywordInput"]')
search.send_keys('151-21-3')

search.send_keys(Keys.RETURN)

# Aguardar até que o elemento esteja visível e clicável
try:
    # Substitua o tempo de espera conforme necessário (em segundos)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="_disssimplesearch_WAR_disssearchportlet_rmlSearchResultVOsSearchContainerSearchContainer"]/table/tbody/tr[1]/td[1]/a'))
    )
    # Clica no elemento
    element.click()
    print("Elemento clicado com sucesso!")
except Exception as e:
    print(f"Erro ao localizar ou clicar no elemento: {e}")#send.click()


time.sleep(5)
    # Extração de informações específicas
try:
    ec = driver.find_element(By.XPATH, '//*[@id="infocardContainer"]/div/div[1]/div/div[1]/div/div[1]/div/div/div/p[1]').text
    cas = driver.find_element(By.XPATH, '//*[@id="infocardContainer"]/div/div[1]/div/div[1]/div/div[1]/div/div/div/p[3]').text
    molecular_formula = driver.find_element(By.XPATH, '//*[@id="infocardContainer"]/div/div[1]/div/div[1]/div/div[1]/div/div/div/p[3]').text
    haz_classification_laballing = driver.find_element(By.XPATH, '//*[@id="infocardContainer"]/div/div[1]/div/div[1]/div/div[2]/div/div/div/p').text
    about_1 = driver.find_element(By.XPATH, '//*[@id="aboutSubstanceParagraphWrapper"]/p[1]').text
    about_2 = driver.find_element(By.XPATH, '//*[@id="aboutSubstanceParagraphWrapper"]/p[2]').text
    consumer_user = driver.find_element(By.XPATH, '//*[@id="aboutSubstanceParagraphWrapper"]/p[3]').text
    article_services = driver.find_element(By.XPATH, '//div[text()="Description"]/following-sibling::div').text
    
# Feche o navegador
driver.quit()
#//*[@id="_disssimplesearch_WAR_disssearchportlet_rmlSearchResultVOsSearchContainerSearchContainer"]/table/tbody/tr[1]/td[1]/a

Elemento clicado com sucesso!
