In [None]:
import re
import pubchempy as pcp
import pandas as pd

In [5]:
# Read the workbook, then keep only its 'Adducts' column as the Series of names to look up.
nhanes_sheet = pd.read_excel('chemical_list_nhanes.xlsx')
chemicals = nhanes_sheet['Adducts']
chemicals

0                               Acrylamide
1                           Ethylene Oxide
2                             Formaldehyde
3                              Glycidamide
4                         3-Chlorotyrosine
                      ...                 
463              p-Phenylenediamine (PPDA)
464                   Sex Steroid Hormones
465                       Estradiol, Total
466    Sex hormone-binding globulin (SHBG)
467                    Testosterone, Total
Name: Adducts, Length: 468, dtype: object

In [None]:
def _strip_trailing_parenthetical(name):
    """Remove one balanced parenthesised group from the end of ``name``.

    Only the group closed by the final ``)`` is removed, so parentheses that
    appear earlier in the name are preserved:
    ``'N-Acetyl-S-(benzyl)-L-cysteine (BMA)'`` becomes
    ``'N-Acetyl-S-(benzyl)-L-cysteine'``.  Names that do not end in ``)`` or
    whose parentheses are unbalanced are returned unchanged.
    """
    if not name.endswith(')'):
        return name
    depth = 0
    # Walk backwards to find the '(' that balances the final ')'.
    for pos in range(len(name) - 1, -1, -1):
        char = name[pos]
        if char == ')':
            depth += 1
        elif char == '(':
            depth -= 1
            if depth == 0:
                return name[:pos].strip()
    # No matching '(' was found: leave the name untouched.
    return name


def get_chemical_synomyms(chemicals):
    """Look up the PubChem synonym list for each chemical name.

    Parameters
    ----------
    chemicals : iterable
        Chemical names.  A parenthesised suffix at the very end of a name
        (for example an abbreviation such as ``'(PPDA)'``) is removed before
        the name is sent to PubChem.

    Returns
    -------
    list
        One entry per input, in order: the ``synonyms`` of the first compound
        returned by ``pcp.get_compounds(name, 'name')``, or ``None`` when the
        input is not a string, is blank after clean-up, or the lookup yields
        no usable compound.
    """
    synonyms = []
    for chemical in chemicals:
        # Non-string cells (e.g. NaN from a blank spreadsheet row) cannot be queried.
        if not isinstance(chemical, str):
            synonyms.append(None)
            continue

        query = _strip_trailing_parenthetical(chemical)
        # A name that was nothing but a parenthetical leaves nothing to search for.
        if not query.strip():
            synonyms.append(None)
            continue

        try:
            # get_compounds returns a list; an empty list means no match,
            # so indexing [0] raises IndexError.
            compound = pcp.get_compounds(query, 'name')[0]
            synonyms.append(compound.synonyms)
        except (IndexError, AttributeError):
            synonyms.append(None)
    return synonyms



In [55]:
# Fetch one synonym list per chemical name (None where the lookup produced nothing).
# get_chemical_synomyms calls pcp.get_compounds once for every name in `chemicals`.
synonyms = get_chemical_synomyms(chemicals)
# CAS Registry Number layout: 2-7 digits, 2 digits, 1 check digit (ASCII digits only).
_CAS_NUMBER_RE = re.compile(r'[0-9]{2,7}-[0-9]{2}-[0-9]')


def extract_cas_number_from_synonyms(synonyms):
    """Return the first synonym that is formatted like a CAS Registry Number.

    Parameters
    ----------
    synonyms : iterable of str or None
        Synonym strings for a single compound.  ``None`` is treated as
        "no synonyms".

    Returns
    -------
    str or None
        The first entry whose whole text (ignoring surrounding whitespace)
        has the form ``NN[NNNNN]-NN-N``, or ``None`` when no entry matches.
        Only the format is checked; the check digit is not verified.
    """
    if synonyms is None:
        return None
    for syn in synonyms:
        # Skip anything that is not text instead of letting the regex raise.
        if not isinstance(syn, str):
            continue
        candidate = syn.strip()
        # fullmatch: '$' with re.match would also accept a trailing newline.
        if _CAS_NUMBER_RE.fullmatch(candidate):
            return candidate
    return None

# One CAS value per chemical: names with no synonym list get the '000-00-000' marker.
# NOTE(review): when a synonym list exists but holds no CAS-formatted entry, the
# extractor returns None, so the column carries two different "missing" markers;
# confirm that distinction is intended.
cas_column = [
    '000-00-000' if entry is None else extract_cas_number_from_synonyms(entry)
    for entry in synonyms
]

chemical_cas_df = pd.DataFrame({'Chemical Name': chemicals, 'CAS Number': cas_column})
chemical_cas_df

Unnamed: 0,Chemical Name,CAS Number
0,Acrylamide,79-06-1
1,Ethylene Oxide,75-21-8
2,Formaldehyde,50-00-0
3,Glycidamide,5694-00-8
4,3-Chlorotyrosine,7423-93-0
...,...,...
463,p-Phenylenediamine (PPDA),106-50-3
464,Sex Steroid Hormones,000-00-000
465,"Estradiol, Total",000-00-000
466,Sex hormone-binding globulin (SHBG),000-00-000


In [None]:
# NOTE(review): this cell builds the same name/CAS table as the cell before it, and the
# saved output below it lists only rows 418-427, so the output looks stale; consider
# replacing the rebuild with an explicit slice of the existing frame.
chemical_cas_df = pd.DataFrame({'Chemical Name': chemicals}).assign(
    **{
        'CAS Number': [
            '000-00-000' if names is None else extract_cas_number_from_synonyms(names)
            for names in synonyms
        ]
    }
)
chemical_cas_df

Unnamed: 0,Chemical Name,CAS Number
418,N-Acetyl-S-(3-hydroxypropyl-1-methyl)-L-cysteine,
419,2-Aminothiazoline-4-carboxylic acid,2150-55-2
420,N-Acetyl-S-(N-methylcarbamoyl)-L-cysteine,103974-29-4
421,Phenylglyoxylic acid,611-73-4
422,N-Acetyl-S-(2-hydroxypropyl)-L-cysteine,923-43-3
423,N-Acetyl-S-(phenyl-2-hydroxyethyl)-L-cysteine,000-00-000
424,N-Acetyl-S-(1-phenyl-2-hydroxyethyl)-L-cystein...,000-00-000
425,Mandelic acid,90-64-2
426,N-Acetyl-S-(trichlorovinyl)-L-cysteine,111348-61-9
427,N-Acetyl-S-(benzyl)-L-cysteine,000-00-000


In [56]:
# Save the name/CAS table to an Excel workbook; index=False keeps the row index out of the file.
chemical_cas_df.to_excel('chemical_cas_numbers.xlsx', index=False)

In [44]:
# Spot-check the synonym lookup result at position 467, the last of the 468 entries.
synonyms[467]

In [45]:
# Show the chemical name at label 467 to compare against its synonym lookup result.
chemicals[467]

'Testosterone, Total'