In [5]:
import os
import pandas as pd
import numpy as np

In [62]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_mz(smiles, charge=1):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string")
    molecular_weight = Descriptors.ExactMolWt(mol)
    mz = molecular_weight / charge
    return mz

In [6]:
path_to_results_files = '/usr/scratch/NASA/results/'

In [7]:
def get_files_from_directory(path_to_dir):
    files = os.listdir(path_to_dir)
    return files

In [8]:
results_path = get_files_from_directory(path_to_results_files)

In [12]:
all_compounds = set()
for result in results_path:
    df = pd.read_csv(path_to_results_files+result, encoding='latin-1')
    unique_compounds = set(df['Name'].unique())

    all_compounds = all_compounds.union(unique_compounds)

all_compounds

  exec(code_obj, self.user_global_ns, self.user_ns)


{'Acenaphthene',
 '3-Quinolinecarboxylic acid, 7,8-dichloro-1,4-dihydro-4-oxo-',
 '9,10-DIHYDRO-2-METHYL-ANTHRACENE',
 '6-METHOXY-7-HYDROXYBENZOFURAN',
 'Verapamil',
 '1,1,1,8,8,8-HEXACHLORO-3,3,6,6-TETRAMETHYLOCTANE',
 '1-(4-Trimethylsilylmethyl-3-cyclohexenyl)-5-methyl-4-hexen-1-ol',
 'CYCLOHEXANOL, 4-(1,1-DIMETHYLETHYL)-1-(2-PROPENYL)-, CIS-',
 '9,12-Octadecadienoic acid (Z,Z)-, 2-(acetyloxy)-1-[(acetyloxy)methyl]ethyl ester',
 'Phosphorodifluoridothioic hydrazide, 2,2-dimethyl-',
 'Fluoro(methyl)(2,4,6-tri-tert-butylphenyl)silanol',
 'Naphthalene, 1,2,3,4-tetrahydro-2,3-dimethyl-',
 'Androst-1-en-3-one, 17-hydroxy-, (5ß,17ß)-',
 '2-Methylsulfanyl-6-(piperidine-1-sulfonyl)-benzothiazole',
 'Thr-Asp',
 '2-CYCLOHEXENE-1-METHANOL, 2,6,6-TRIMETHYL-a-1-PROPENYL-',
 'D-ERYTHRO-PENTITOL, 1,5-ANHYDRO-3,4-DIDEOXY-3-[[(4-METHYLPHENYL)SULFONYL]AMINO]-, 2-ACETATE',
 'Succinic acid, 3-methylbut-2-yl tetrahydrofurfuryl ester',
 "Spiro[androsta-1,4-diene-6,2'-oxirane]-3,17-dione",
 '[C-15N]-1,3,3-

In [6]:
all_compounds_array = np.array(list(all_compounds))
np.save('all_compounds_array.npy', all_compounds_array)


In [16]:
all_compounds_array[3535]

'Di(tert-butyl) trisulfide, perfluoro-'

# PubChem

In [17]:
import requests

def get_pubchem_link(compound_name):
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    search_url = f"{base_url}/compound/name/{compound_name}/cids/JSON"

    response = requests.get(search_url)
    if response.status_code == 200:
        data = response.json()
        if "IdentifierList" in data and "CID" in data["IdentifierList"]:
            cid = data["IdentifierList"]["CID"][0]
            link = f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
            return link
        else:
            return "NaN"
    else:
        return "NaN"

# Example usage
compound_name = "Di(tert-butyl) trisulfide, perfluoro-"
link = get_pubchem_link(compound_name)
print(f"Link to {compound_name} on PubChem: {link}")


Link to Di(tert-butyl) trisulfide, perfluoro- on PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/548567


In [28]:
import requests

def get_chemical_info(pubchem_url):
    # Extract the CID (Compound ID) from the URL
    cid = pubchem_url.split('/')[-1]

    # Construct the API URL to fetch the compound information
    api_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/InChI,InChIKey,CanonicalSMILES/JSON"

    # Send a GET request to the API
    response = requests.get(api_url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        properties = data['PropertyTable']['Properties'][0]

        # Extract InChI, InChIKey, and Canonical SMILES
        inchi = properties.get('InChI', 'Not available')
        inchikey = properties.get('InChIKey', 'Not available')
        canonical_smiles = properties.get('CanonicalSMILES', 'Not available')

        return inchi, inchikey, canonical_smiles
    else:
        return None, None, None

# Example usage
pubchem_url = "https://pubchem.ncbi.nlm.nih.gov/compound/297"
inchi, inchikey, canonical_smiles = get_chemical_info(pubchem_url)
print("InChI:", inchi)
print("InChIKey:", inchikey)
print("Canonical SMILES:", canonical_smiles)


InChI: InChI=1S/CH4/h1H4
InChIKey: VNWKTOKETHGBQD-UHFFFAOYSA-N
Canonical SMILES: C


# NIST

In [30]:
import requests

def get_nist_link(compound_name):
    search_url = f"https://webbook.nist.gov/cgi/cbook.cgi?Name={compound_name}&Units=SI"
    response = requests.get(search_url)
    if response.status_code == 200:
        if "Name Not Found" not in response.text:
            return search_url
        else:
            return "Compound not found"
    else:
        return "Error accessing NIST Chemistry WebBook"

# Example usage
compound_name = "CH4"
link = get_nist_link(compound_name)
print(f"Link to {compound_name} on NIST Chemistry WebBook: {link}")


Link to CH4 on NIST Chemistry WebBook: https://webbook.nist.gov/cgi/cbook.cgi?Name=CH4&Units=SI


In [40]:
import requests
from bs4 import BeautifulSoup

def extract_inchi_and_inchikey(nist_url):
    # Send a GET request to the NIST URL
    response = requests.get(nist_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the elements containing the InChI and InChIKey
        inchi_elem = soup.find('strong', text='IUPAC Standard InChI:')
        inchikey_elem = soup.find('strong', text='IUPAC Standard InChIKey:')

        # Extract the text and clean it up
        inchi = inchi_elem.find_next_sibling('span').text.strip() if inchi_elem else None
        inchikey = inchikey_elem.find_next_sibling('span').text.strip() if inchikey_elem else None

        return inchi, inchikey
    else:
        # Return None if the request was not successful
        return None, None

# Example usage
nist_url = 'https://webbook.nist.gov/cgi/cbook.cgi?ID=C7732185&Units=SI'
inchi, inchikey = extract_inchi_and_inchikey(nist_url)
print('IUPAC Standard InChI:', inchi)
print('IUPAC Standard InChIKey:', inchikey)


IUPAC Standard InChI: None
IUPAC Standard InChIKey: None


  
  from ipykernel import kernelapp as app


# Putting it all together

In [8]:
df = pd.read_csv('results/pubchem_extracted_info.csv')

spectrabase = pd.read_csv('results/spectrabase_info_extracted.csv')

df['Spectrabase_Link'] = 'NaN'


In [16]:
for idx, row in spectrabase[spectrabase['Spectrabase_Link'].notna()].iterrows():
    compound_name = row['Compound']
    df.loc[df['Compound'] == compound_name, 'Spectrabase_Link'] = row['Spectrabase_Link']
    df.loc[df['Compound'] == compound_name, 'InChI'] = row['InChI']
    df.loc[df['Compound'] == compound_name, 'InChI'] = row['InChI']
    df.loc[df['Compound'] == compound_name, 'InChIKey'] = row['InChIKey']

In [17]:
df

Unnamed: 0,Compound,PubChem_Link,InChI,InChIKey,Canonical_SMILES,Spectrabase_Link
0,Acenaphthene,https://pubchem.ncbi.nlm.nih.gov/compound/6734,InChI=1S/C12H10/c1-3-9-4-2-6-11-8-7-10(5-1)12(...,CWRYPZZKDGJXCA-UHFFFAOYSA-N,C1CC2=CC=CC3=C2C1=CC=C3,
1,"3-Quinolinecarboxylic acid, 7,8-dichloro-1,4-d...",,,,,https://spectrabase.com/compound/F2TkDN19xxT?f=
2,"9,10-DIHYDRO-2-METHYL-ANTHRACENE",,,,,
3,6-METHOXY-7-HYDROXYBENZOFURAN,,,,,https://spectrabase.com/compound/5nB8Hya86wY?f=
4,Verapamil,https://pubchem.ncbi.nlm.nih.gov/compound/2520,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",SGTNSNPWRIOYBX-UHFFFAOYSA-N,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,
...,...,...,...,...,...,...
11697,"2-PROPEN-1-ONE, 1-(2,6-DIHYDROXYPHENYL)-3-(4-M...",,,,,
11698,"Methyl (4S,5R)-2,2,5-trimethyl-1,3-dioxolane-4...",,,,,
11699,"5-O-[(1,1-DIMETHYLETHYL)DIMETHYLSILYL]-2,3-O-(...",,,,,https://spectrabase.com/compound/B603LoXKTtg?f=
11700,"Propanamide, 2,3,3,3-tetrafluoro-2-heptafluoro...",https://pubchem.ncbi.nlm.nih.gov/compound/561513,"InChI=1S/C13H8F11NO2/c14-9(11(17,18)19,8(26)25...",MBYGYAYBAZPQOT-UHFFFAOYSA-N,C1=CC=C(C=C1)CNC(=O)C(C(F)(F)F)(OC(C(C(F)(F)F)...,


# Generating dataset

In [67]:
df_list = []
for result in results_path:
    df = pd.read_csv(path_to_results_files+result, encoding='latin-1')
    df_list.append(df)

all_results = pd.concat(df_list)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [68]:
grouped_df = all_results.groupby('Name').mean()[['1st Dimension Time (s)', '2nd Dimension Time (s)']]

In [69]:
df = pd.read_csv('results/pubchem_spectrabase_combined_extracted_info.csv')

In [70]:
df = df.merge(grouped_df, left_on='Compound', right_index=True, how='left')

In [71]:
df = df.dropna(subset=['InChI'])
df

Unnamed: 0,Compound,PubChem_Link,InChI,InChIKey,Canonical_SMILES,Spectrabase_Link,1st Dimension Time (s),2nd Dimension Time (s)
0,Acenaphthene,https://pubchem.ncbi.nlm.nih.gov/compound/6734,InChI=1S/C12H10/c1-3-9-4-2-6-11-8-7-10(5-1)12(...,CWRYPZZKDGJXCA-UHFFFAOYSA-N,C1CC2=CC=CC3=C2C1=CC=C3,,5633.9200,1.790333
1,"3-Quinolinecarboxylic acid, 7,8-dichloro-1,4-d...",,InChI=1S/C12H9Cl2NO3/c1-2-18-12(17)7-5-15-10-6...,FAEKTUDLKDVPSY-UHFFFAOYSA-N,,https://spectrabase.com/compound/F2TkDN19xxT?f=,6688.6200,2.008000
3,6-METHOXY-7-HYDROXYBENZOFURAN,,InChI=1S/C10H12O3/c1-12-10-5-7-3-2-4-13-9(7)6-...,GZZZUNTZVMRCLS-UHFFFAOYSA-N,,https://spectrabase.com/compound/5nB8Hya86wY?f=,6422.3200,1.831000
4,Verapamil,https://pubchem.ncbi.nlm.nih.gov/compound/2520,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",SGTNSNPWRIOYBX-UHFFFAOYSA-N,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,,6909.3800,0.915000
5,"1,1,1,8,8,8-HEXACHLORO-3,3,6,6-TETRAMETHYLOCTANE",,"InChI=1S/C12H20Cl6/c1-9(2,7-11(13,14)15)5-6-10...",UHNXATNZQQDGKT-UHFFFAOYSA-N,,https://spectrabase.com/compound/BvzRAsT35lc,10131.3000,0.846500
...,...,...,...,...,...,...,...,...
11695,"benzene, 1,1',1'',1'''-(1,2-ethenediylidene)te...",https://pubchem.ncbi.nlm.nih.gov/compound/9174...,InChI=1S/C34H36O4/c1-5-35-29-17-9-25(10-18-29)...,ULKJGRGKIGOYGV-UHFFFAOYSA-N,CCOC1=CC=C(C=C1)C(=C(C2=CC=C(C=C2)OCC)C3=CC=C(...,,2998.9125,0.834250
11696,"2',4'-Dihydroxy-2,3-dimethoxychalcone",https://pubchem.ncbi.nlm.nih.gov/compound/5377844,InChI=1S/C17H16O5/c1-21-16-5-3-4-11(17(16)22-2...,JUCNRAJYHMZLOT-RMKNXTFCSA-N,COC1=CC=CC(=C1OC)C=CC(=O)C2=C(C=C(C=C2)O)O,,6982.9600,0.872000
11699,"5-O-[(1,1-DIMETHYLETHYL)DIMETHYLSILYL]-2,3-O-(...",,"InChI=1S/C14H26O5Si/c1-13(2,3)20(6,7)16-8-9-10...",VBPYRQYBEKHPKA-GMTAPVOTSA-N,,https://spectrabase.com/compound/B603LoXKTtg?f=,9330.6400,0.778000
11700,"Propanamide, 2,3,3,3-tetrafluoro-2-heptafluoro...",https://pubchem.ncbi.nlm.nih.gov/compound/561513,"InChI=1S/C13H8F11NO2/c14-9(11(17,18)19,8(26)25...",MBYGYAYBAZPQOT-UHFFFAOYSA-N,C1=CC=C(C=C1)CNC(=O)C(C(F)(F)F)(OC(C(C(F)(F)F)...,,4106.8760,1.090200


In [72]:
for idx, row in df.iterrows():
    if row['Spectrabase_Link'] == 'NaN': continue
    
    inchi = row['InChI']

    try:
        if inchi != 'NaN':
            mol = Chem.MolFromInchi(inchi)
            smiles = Chem.MolToSmiles(mol)
            df.loc[idx, 'Canonical_SMILES'] = smiles
            print(smiles)
    except:
        continue

c1cc2c3c(cccc3c1)CC2
CCOC(=O)c1c[nH]c2c(Cl)c(Cl)ccc2c1=O
COc1cc2c(cc1O)OCCC2
COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C)cc1OC
CC(C)(CCC(C)(C)CC(Cl)(Cl)Cl)CC(Cl)(Cl)Cl
CC(C)=CCCC(O)C1CC=C(C[Si](C)(C)C)CC1
CCCCC/C=C\C/C=C\CCCCCCCC(=O)OC(COC(C)=O)COC(C)=O
CN(C)NP(F)(F)=S
CC(C)(C)c1cc(C(C)(C)C)c([Si](C)(O)F)c(C(C)(C)C)c1
CC1Cc2ccccc2CC1C
CSc1nc2ccc(S(=O)(=O)N3CCCCC3)cc2s1
C[C@@H](O)[C@H](N)C(O)=N[C@@H](CC(=O)O)C(=O)O
C/C=C\C(O)C1C(C)=CCCC1(C)C
CC(=O)OC1COCCC1NS(=O)(=O)c1ccc(C)cc1
CC(C)C(C)OC(=O)CCC(=O)OCC1CCCO1
C[C@]12C=CC(=O)C=C1C1(CO1)C[C@@H]1[C@@H]2CC[C@]2(C)C(=O)CC[C@@H]12
O=[N+]([O-])N1CC([N+](=O)[O-])([15N+](=O)[O-])C1
CCCCCCCCCCCCCCCOC(=O)C(=O)OC1CCC1
CC(C)CC(C)O[Si](C)(C)C
C#C/C=C/c1ccccc1
CCCCCC=CCC=CC=CC=CC(SCC(N)C(=O)O)C(O)CCCC(=O)OC
CC(=O)O[C@H]1C[C@@H](OC(C)=O)[C@@H](C/C=C/c2ccccc2)[C@H]1/C=C/c1ccccc1
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Nc1ccccc1CPc1ccccc1
CCN(CC)c1ccc(NS(=O)(=O)c2ccc(N)cc2)cc1
CC(C)NCC(O[Si](C)(C)C)c1ccccc1Cl
CC(C)C(C)/C=C/C(C)C(C)C
O=O
O=C1c2c(O)cc(O)cc2O

[01:03:24] Explicit valence for atom # 4 B, 5, is greater than permitted
[01:03:24] ERROR: Explicit valence for atom # 4 B, 5, is greater than permitted



OC[C@@H]1O[C@H](Oc2cc(O)cc(/C=C/c3ccc(O)cc3)c2)[C@@H](O)[C@H](O)[C@H]1O
CC(C)P(CN(C)C)C(C)C
CCC(CC)(CC)c1ccccc1
O=C(O)[C@H](O)[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO
CC1(C)C=CCC23C4CC(CC412)C3(C)C
COc1ccc(-c2cc(=O)c3c(OC)c(OC)c(OC)cc3o2)cc1
C[NH+](C)C.[O-]C(F)(F)F
CCCC(C)C(C)CC
C1CCSSCC1
CCCCC#CCCCCCCCl
CC1(C)[C@@]2(C)N=N[C@]1(C)C1CCCC[C@@H]12
CCC/C=C/c1ccccc1
CCCCC/C=C/C1CCCCC1
CCCCCCO
N=c1c2c([nH]c3c1CCCC3)CCCC2
COc1ccc(CCN2CCC(N=c3[nH]c4ccccc4n3Cc3ccc(F)cc3)CC2)cc1
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)OCc1cccnc1
OC(c1ccccc1)c1ccccc1
CCc1c(C)ccc(C)c1C
COc1cc(OC)c(OC)cc1CC(C)N
O=c1c(O)c(-c2ccccc2O)oc2ccccc12
CCCCCCCCCCCCCCCCCCCCCC(=O)OC(=O)CCCCCCCCCCCCCCCCCCCCC
CC1(C)COC(CCO)OC1
CC1C2CC(=O)C1c1ccccc12
CC/C=C/C(CC)CO
CN1CCCC(C(=O)c2cccnc2)C1=O
CCCCCCCCCCCCC1C(C2CCCC2)CCC1C1CCCC1
C[N+](C)(C)CC(=O)[O-]
C=CCCC1CCCC(=O)C1
CC1(C)C2=CC=CC(C)(C)C23C=CC1C3
COc1cccc(CCC(=O)N2CCCC2OC)c1
C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CCC4[C@@]5(C)CC[C@H](O)C(C)(C)C5CC[C@]43C)C2[C@H]1C
C#CC(C)(O)CCCC(C)CCCC(C)CCCC

[01:03:26] Cannot assign bond directions!


O=C(O)CN=C(O)C(CS)Cc1ccccc1
C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C@@H](O)C[C@]2(C)[C@@]1(O)C(=O)CO
OCC1OCCO1
O=C(O)CCCCCCC[C@@H](O)[C@@H](O)CCCCCCO
COc1cc2c(cc1OC)[C@@H](Cc1ccc(Oc3cc(C[C@@H]4c5cc(OC)c(OC)cc5CCN4C)ccc3O)cc1)N(C)CC2
CCC1CC(C)C(C)C(CC)C1
CC(C)O[Si](OC(C)C)(OC(C)C)OC(C)C
C=C([C@H](CCCC)[C@H](O)CCCCCC)[Si](C)(C)C
CCCCCCCCCCCCCC(C)C
CCCCCCCCC(C)(C)O
CCCCC/C=C/C=C1/C(=O)CC(O)C1C/C=C\CCCC(=O)O
C=CC(C)(C)c1ccc(OC(C)=O)c(CC=C(C)C)c1
SC(=NC1CCCCC1)N1CCC(c2cnc[nH]2)CC1
CCCCCCCCCCC#CCOC(=O)c1ccccc1OC
CC(Cc1ccccc1)[n+]1cc(NC(=O)C(F)(F)F)on1
C[Si](C)(C)OC(=O)Cc1cc(O[Si](C)(C)C)ccc1O[Si](C)(C)C
C1=C(c2ccccc2)SC(c2ccccc2)=CC1=C(c1ccccc1)c1ccccc1
COc1cc2c(cc1OC)CC(=O)OC2
FC(F)(F)CC(F)(F)F
COc1cc(OC)nc(N=C(O)NS(=O)(=O)c2ncccc2C(=O)N(C)C)n1
S
C[Si]1(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O1
C=C(C)c1cc(=O)c(O)ccc1CC=C(C)C
FC(F)(F)C12C(F)(F)C3(F)C(F)(F)C(F)(C1(F)F)C(F)(F)C(C(F)(F)F)(C3(F)F)C2(F)F
CCCCC#CCCCCCC
CCCCCC1C2CCC(C)CC12
C[Si](C)(C)OCCS(=O)CCO[Si](C)(C)C
Cc1csc(N=C(O)c

[01:03:26] Cannot assign bond directions!


Cc1cccc(C(=O)Oc2cc(Cl)ccc2Cl)c1
CCC(C)CCCC(C)CCCC(C)CCCC(C)C
O=C(/C=C/C=C/c1ccccc1)C(Br)Br
CSCSC(C)=O
COc1ccc(-c2c(C)c3cc(OC)ccc3oc2=O)cc1
Cc1ccc(C(C)C)c(CC(C)CO)c1
COc1ccc([Si](C)(C)C)cc1
CC(C)(C)[Si](C)(C)OCCOC(=O)c1ccccc1O
CC/C=C\C#CCCCCCCCCCCC
O=C1CCCCC1CP(=O)(c1ccccc1)c1ccccc1
Oc1ccc2ccccc2c1Cc1c(O)nc(S)nc1-c1ccccc1
OC1(C(F)(F)F)CCCC(c2ccccc2)C1
CSCSSC
FC(F)(F)C(F)(F)C(F)(F)F
O=C(O)c1cc(O)nc(O)c1
COc1ccc(NCc2nn3c(C(=S)Nc4ccccc4)c(-c4ccc(Br)cc4)c4c3n2CCCC4)cc1
Cn1c(COc2ccc(Cl)cc2)c(C#N)c2ccccc21
CC(C)(c1ccc(O)cc1)c1ccc(O[Si](C)(C)C(C)(C)C)cc1
CC1=C(CCC(C)C=O)C(C)(C)CCC1
CC(C)(O)C(O)C(=O)O
CCC/C=C\C(C)CCCCCCCCOC(C)=O
CCCc1ccc(C#Cc2ccc(C)cc2)cc1
CCOC(=O)C(F)(F)C(F)(F)C(F)(F)F
CSC(=N)N
C[Si](C)(C)O[C@H]1C(F)=CC[C@H]2C(=O)C=CC(=O)[C@H]21
CCC/C=C\C/C=C\C/C=C\CCCCC(=O)OCCCC
CC(Oc1ccc(Cl)cc1Cl)C(O)=NC(C)(C#N)C(C)C
O=C(O)COCCOCCOCC(=O)O
CCCCCCCCCCCCCCCCCCCCCCCCCCC(C)C
[N-]=[N+]=C1C=C([N+](=O)[O-])C=C([N+](=O)[O-])C1=O
CC(=O)[C@]1(O)CC[C@@]2(O)[C@]1(C)[C@H](OC(=O)c1ccc(O)cc1)C[C@@H]1[C@@]3(

[01:03:29] Cannot assign bond directions!


CC(=O)OCC1(O)CC23CCC4c5ccoc5CCC4(C)C2CCC1C3
C=CCc1ccccc1OCC(O)CNC(C)C
CC1CCCC(C)(C)C1C
CCCCCCC12CCC(CC1)CC2
CCOC(=O)C1c2c(nnc3c2C(C)C=N3)N(C)C1C(=O)OCC
Cc1ccc(C(=O)c2ccc(CC(=O)O)n2C)cc1
CCCCCCCCCCCCCCC1CCCCCCC1
CCCCCCCCCOC(=O)C(=O)OCC(C)C
CC(C)CCCC1CCC(C(C)CCCC(C)C)CC1
N#CC1(N)CC2C=CC1C2
CC1=CC(C)C(C=O)C(C)C1
C=C1CC1CCCCCl
CC1=C(C2CC2C)C(C)(C)CCC1
Cc1ccc(-c2nnc(N=C(O)c3ccccc3)s2)cc1
C=Cc1ccccc1CCC(C)=O
C/C=C(\O)C(C)C(C)CC
Cc1c([Si](C)(C)C)cc2cc[nH]c2c1C
CCCC(F)CBr
FC(F)(F)C(OC1CCCC1)C(F)(F)F
CCC/C=C\S(=O)(=O)OCC
C=C1/C(=C\C=C2/CCC[C@]3(C)[C@@H]([C@H](C)CCCC(C)(C)O)CC[C@@H]23)C[C@@H](O)C[C@@H]1O
C[Si](C)(C)N(c1ccccc1-c1nnc(O)n([Si](C)(C)C)c1=O)[Si](C)(C)C
CC12CC3(SSC45CC6(C)SC(C)(SC(C)(S6)S4)S5)SC(C)(S1)SC(C)(S2)S3
N=C(O)CC(=N)O
CC(C)=CC(O)CC(C)c1ccc(C)cc1
Cc1cccc(-c2ccccc2)c1
CC(C=O)Cc1ccc(C(C)C)cc1
CC(Cl)CO
O=C(O)CCC1CCCC1
CC(C)=CCOc1c2ccoc2cc2oc(=O)ccc12
CCCCN1CCPCC1
CC(=O)OC1CCC2(C)C(=CCC3C2CCC2(C)C3CC3OC(CCC(C)CN=C(C)O)=C(C)C32)C1
Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=

[01:03:30] Explicit valence for atom # 0 Br, 3, is greater than permitted
[01:03:30] ERROR: Explicit valence for atom # 0 Br, 3, is greater than permitted



In [73]:
df = df.dropna(subset=['Canonical_SMILES'])
df

Unnamed: 0,Compound,PubChem_Link,InChI,InChIKey,Canonical_SMILES,Spectrabase_Link,1st Dimension Time (s),2nd Dimension Time (s)
0,Acenaphthene,https://pubchem.ncbi.nlm.nih.gov/compound/6734,InChI=1S/C12H10/c1-3-9-4-2-6-11-8-7-10(5-1)12(...,CWRYPZZKDGJXCA-UHFFFAOYSA-N,c1cc2c3c(cccc3c1)CC2,,5633.9200,1.790333
1,"3-Quinolinecarboxylic acid, 7,8-dichloro-1,4-d...",,InChI=1S/C12H9Cl2NO3/c1-2-18-12(17)7-5-15-10-6...,FAEKTUDLKDVPSY-UHFFFAOYSA-N,CCOC(=O)c1c[nH]c2c(Cl)c(Cl)ccc2c1=O,https://spectrabase.com/compound/F2TkDN19xxT?f=,6688.6200,2.008000
3,6-METHOXY-7-HYDROXYBENZOFURAN,,InChI=1S/C10H12O3/c1-12-10-5-7-3-2-4-13-9(7)6-...,GZZZUNTZVMRCLS-UHFFFAOYSA-N,COc1cc2c(cc1O)OCCC2,https://spectrabase.com/compound/5nB8Hya86wY?f=,6422.3200,1.831000
4,Verapamil,https://pubchem.ncbi.nlm.nih.gov/compound/2520,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",SGTNSNPWRIOYBX-UHFFFAOYSA-N,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,,6909.3800,0.915000
5,"1,1,1,8,8,8-HEXACHLORO-3,3,6,6-TETRAMETHYLOCTANE",,"InChI=1S/C12H20Cl6/c1-9(2,7-11(13,14)15)5-6-10...",UHNXATNZQQDGKT-UHFFFAOYSA-N,CC(C)(CCC(C)(C)CC(Cl)(Cl)Cl)CC(Cl)(Cl)Cl,https://spectrabase.com/compound/BvzRAsT35lc,10131.3000,0.846500
...,...,...,...,...,...,...,...,...
11695,"benzene, 1,1',1'',1'''-(1,2-ethenediylidene)te...",https://pubchem.ncbi.nlm.nih.gov/compound/9174...,InChI=1S/C34H36O4/c1-5-35-29-17-9-25(10-18-29)...,ULKJGRGKIGOYGV-UHFFFAOYSA-N,CCOc1ccc(C(=C(c2ccc(OCC)cc2)c2ccc(OCC)cc2)c2cc...,,2998.9125,0.834250
11696,"2',4'-Dihydroxy-2,3-dimethoxychalcone",https://pubchem.ncbi.nlm.nih.gov/compound/5377844,InChI=1S/C17H16O5/c1-21-16-5-3-4-11(17(16)22-2...,JUCNRAJYHMZLOT-RMKNXTFCSA-N,COc1cccc(/C=C/C(=O)c2ccc(O)cc2O)c1OC,,6982.9600,0.872000
11699,"5-O-[(1,1-DIMETHYLETHYL)DIMETHYLSILYL]-2,3-O-(...",,"InChI=1S/C14H26O5Si/c1-13(2,3)20(6,7)16-8-9-10...",VBPYRQYBEKHPKA-GMTAPVOTSA-N,CC1(C)O[C@@H]2[C@@H](CO[Si](C)(C)C(C)(C)C)OC(=...,https://spectrabase.com/compound/B603LoXKTtg?f=,9330.6400,0.778000
11700,"Propanamide, 2,3,3,3-tetrafluoro-2-heptafluoro...",https://pubchem.ncbi.nlm.nih.gov/compound/561513,"InChI=1S/C13H8F11NO2/c14-9(11(17,18)19,8(26)25...",MBYGYAYBAZPQOT-UHFFFAOYSA-N,OC(=NCc1ccccc1)C(F)(OC(F)(F)C(F)(F)C(F)(F)F)C(...,,4106.8760,1.090200


In [74]:
for idx, row in df.iterrows():
    if row['Spectrabase_Link'] == 'NaN': continue
    
    SMILES = row['Canonical_SMILES']

    try:
        if inchi != 'NaN':
            charge = 1
            mz = calculate_mz(SMILES, charge)
            df.loc[idx, 'm_z'] = mz
    except:
        continue


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [81]:
df.to_csv('results/training_set_march20.csv', index=False)