# Converting the InChIKeys of the compounds in the test dataset into IUPAC names (or into plain-English synonyms, if available)

In [1]:
import pandas as pd
import requests
import time

# Raw Data

In [2]:
# Load the raw Tox21 compound table from the CSV file.
df = pd.read_csv(filepath_or_buffer="../raw_data/tox21_compoundData.csv")

In [3]:
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would also print a stray "None".
df.info()

(12707, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12707 entries, 0 to 12706
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             12707 non-null  object 
 1   inchikey       12707 non-null  object 
 2   sdftitle       12707 non-null  object 
 3   order          12707 non-null  int64  
 4   set            12707 non-null  object 
 5   CVfold         8245 non-null   float64
 6   NR.AhR         9051 non-null   float64
 7   NR.AR          10240 non-null  float64
 8   NR.AR.LBD      9434 non-null   float64
 9   NR.Aromatase   7968 non-null   float64
 10  NR.ER          8478 non-null   float64
 11  NR.ER.LBD      9640 non-null   float64
 12  NR.PPAR.gamma  9056 non-null   float64
 13  SR.ARE         7956 non-null   float64
 14  SR.ATAD5       9985 non-null   float64
 15  SR.HSE         9027 non-null   float64
 16  SR.MMP         8101 non-null   float64
 17  SR.p53         9519 non-null   float64

# Retrieve SMILES, IUPAC name, and title for each InChIKey in the UI dataset

## Dataset Without Targets

**All data**

In [4]:
# Keep only the identifier columns and the split label, dropping the other columns.
df_ids = df.loc[:, ["ID", "inchikey", "set"]]
df_ids

Unnamed: 0,ID,inchikey,set
0,NCGC00178831-03,PEJLNXHANOHNSU-UHFFFAOYSA-N,training
1,NCGC00166114-03,SEACYXSIPDVVMV-UHFFFAOYSA-L,training
2,NCGC00263563-01,WQGJEAMPBSZCIF-VEKNOCPUSA-N,training
3,NCGC00013058-02,CNYGFPPAGUCRIC-UHFFFAOYSA-L,training
4,NCGC00167516-01,LYCYLGFSIXIXAB-NUZRHMIVSA-N,training
...,...,...,...
12702,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test
12703,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test
12704,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test
12705,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test


**Test data only**

In [5]:
# Select the rows whose "set" value is "test" and renumber them from 0.
df_ids_test = df_ids.loc[df_ids["set"].eq("test")].reset_index(drop=True)
df_ids_test

Unnamed: 0,ID,inchikey,set
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test
...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test


## Functions that collect the SMILES and names of the compounds from PubChem based on their InChIKeys

In [6]:
def get_smiles(inchikey):
    """Look up a compound's canonical SMILES string on PubChem.

    Sends one GET request to the PubChem PUG REST endpoint for the
    ``CanonicalSMILES`` property of the compound identified by ``inchikey``.

    Args:
        inchikey: InChIKey of the compound; it is interpolated into the URL.

    Returns:
        The SMILES string of the first entry in the response's property
        table, or ``None`` if the request fails, the HTTP status is not 200,
        or the response does not have the expected structure.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/CanonicalSMILES/JSON"
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        record = res.json()["PropertyTable"]["Properties"][0]
        # NOTE(review): PubChem is reported to return this property under the
        # key "ConnectivitySMILES" in newer API versions - confirm against the
        # current PUG REST documentation. The original key is tried first.
        for key in ("CanonicalSMILES", "ConnectivitySMILES"):
            if key in record:
                return record[key]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network errors, invalid JSON and unexpected
        # response shapes yield None. Only these expected failures are caught,
        # so KeyboardInterrupt and genuine programming errors still propagate.
        pass
    return None

In [7]:
def get_iupac_name(inchikey):
    """Look up a compound's IUPAC name on PubChem.

    Sends one GET request to the PubChem PUG REST endpoint for the
    ``IUPACName`` property of the compound identified by ``inchikey``.

    Args:
        inchikey: InChIKey of the compound; it is interpolated into the URL.

    Returns:
        The ``IUPACName`` value of the first entry in the response's property
        table, or ``None`` if the request fails, the HTTP status is not 200,
        or the response does not have the expected structure.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/IUPACName/JSON"
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        data = res.json()
        return data["PropertyTable"]["Properties"][0]["IUPACName"]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network errors, invalid JSON and missing keys
        # yield None. Only these expected failures are caught, so
        # KeyboardInterrupt and genuine programming errors still propagate.
        return None


In [8]:
def get_title(inchikey):
    """Look up a compound's PubChem title.

    Sends one GET request to the PubChem PUG REST endpoint for the ``Title``
    property of the compound identified by ``inchikey``.

    Args:
        inchikey: InChIKey of the compound; it is interpolated into the URL.

    Returns:
        The ``Title`` value of the first entry in the response's property
        table, or ``None`` if the request fails, the HTTP status is not 200,
        or the response does not have the expected structure.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/Title/JSON"
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        data = res.json()
        return data["PropertyTable"]["Properties"][0]["Title"]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network errors, invalid JSON and missing keys
        # yield None. Only these expected failures are caught, so
        # KeyboardInterrupt and genuine programming errors still propagate.
        return None

## Retrieve Data From the PubChem API

**Create empty lists**

In [9]:
# One result list per retrieved property, plus the number of compounds to
# process (used in the progress messages).
smiles_list, iupac_names, titles = [], [], []
total = df_ids_test.shape[0]

**Loop call to API**

In [10]:
# Rebind the result lists at the top of this cell so that re-running it starts
# from scratch. Appending to lists left over from a previous run would make
# them longer than df_ids_test and break the column assignment further down.
smiles_list, iupac_names, titles = [], [], []
total = len(df_ids_test)

for i, inchikey in enumerate(df_ids_test["inchikey"], 1):
    # Three separate PubChem lookups per compound; each helper returns None
    # when its lookup fails, so every list gets exactly one entry per row.
    smiles = get_smiles(inchikey)
    iupac = get_iupac_name(inchikey)
    title = get_title(inchikey)

    smiles_list.append(smiles)
    iupac_names.append(iupac)
    titles.append(title)

    # Short pause between compounds - presumably to stay within PubChem's
    # request-rate limits; confirm against their usage policy.
    time.sleep(0.2)
    if i % 50 == 0:
        print(f"Processed {i} of {total} compounds")

Processed 50 of 647 compounds
Processed 100 of 647 compounds
Processed 150 of 647 compounds
Processed 200 of 647 compounds
Processed 250 of 647 compounds
Processed 300 of 647 compounds
Processed 350 of 647 compounds
Processed 400 of 647 compounds
Processed 450 of 647 compounds
Processed 500 of 647 compounds
Processed 550 of 647 compounds
Processed 600 of 647 compounds


**Add lists to DataFrame**

In [11]:
# Attach the fetched values as new columns, in the same order as before:
# smiles, iupac_name, title. Each list lines up with the rows by position.
for column, values in (
    ("smiles", smiles_list),
    ("iupac_name", iupac_names),
    ("title", titles),
):
    df_ids_test[column] = values
df_ids_test

Unnamed: 0,ID,inchikey,set,smiles,iupac_name,title
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test,CC12CC=C3C(C1CCC2C(=O)CN4CCN(CC4)C5=NC(=NC(=C5...,"(Z)-but-2-enedioic acid;(8S,10S,13S,14S,17S)-1...",U-74389G maleate
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test,C1C2C(C(C(O2)N3C4=NC=NC(=C4N=C3Br)N)O)OP(=O)(O...,sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-o...,
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test,C1CN2C(=NC3=CC=CC=C3C2=O)C4=C1C5=CC=CC=C5N4,"3,13,21-triazapentacyclo[11.8.0.02,10.04,9.015...",Rutecarpine
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test,C1=CC(=CC=C1C(C2=CC=C(C=C2)F)OCCCC3=CN=CN3)F.Cl,5-[3-[bis(4-fluorophenyl)methoxy]propyl]-1H-im...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test,CC1=CC=C(C=C1)S(=O)(=O)NC(CC2=CC=CC=C2)C(=O)CCl,N-[(2S)-4-chloro-3-oxo-1-phenylbutan-2-yl]-4-m...,N-Tosyl-L-phenylalanyl chloromethyl ketone
...,...,...,...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test,C(CO)CS,3-sulfanylpropan-1-ol,3-Mercapto-1-propanol
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test,C1N(CN(CN1CCO)CCO)CCO,"2-[3,5-bis(2-hydroxyethyl)-1,3,5-triazinan-1-y...",Triazinetriethanol
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test,CN1C(=NN=N1)S,1-methyltetrazole-5-thiol,
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test,C1CC1(C(CC2=CC=CC=C2Cl)(CN3C(=S)N=CN3)O)Cl,2-[2-(1-chlorocyclopropyl)-3-(2-chlorophenyl)-...,Prothioconazole


In [12]:
# Show one complete IUPAC name (row label 1); the table display truncates long strings.
df_ids_test.loc[1, "iupac_name"]

'sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-oxido-2-oxo-4a,6,7,7a-tetrahydro-4H-furo[3,2-d][1,3,2]dioxaphosphinin-7-ol'

**Save compound names to disk**

In [16]:
# Write the table with the fetched names to disk, omitting the row index.
df_ids_test.to_csv(path_or_buf="../data/test_compound_names.csv", index=False)

**Load compound names from disk**

In [17]:
# Read the saved table back from disk and display it.
test_compound_names = pd.read_csv(filepath_or_buffer="../data/test_compound_names.csv")
test_compound_names

Unnamed: 0,ID,inchikey,set,smiles,iupac_name,title
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test,CC12CC=C3C(C1CCC2C(=O)CN4CCN(CC4)C5=NC(=NC(=C5...,"(Z)-but-2-enedioic acid;(8S,10S,13S,14S,17S)-1...",U-74389G maleate
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test,C1C2C(C(C(O2)N3C4=NC=NC(=C4N=C3Br)N)O)OP(=O)(O...,sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-o...,
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test,C1CN2C(=NC3=CC=CC=C3C2=O)C4=C1C5=CC=CC=C5N4,"3,13,21-triazapentacyclo[11.8.0.02,10.04,9.015...",Rutecarpine
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test,C1=CC(=CC=C1C(C2=CC=C(C=C2)F)OCCCC3=CN=CN3)F.Cl,5-[3-[bis(4-fluorophenyl)methoxy]propyl]-1H-im...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test,CC1=CC=C(C=C1)S(=O)(=O)NC(CC2=CC=CC=C2)C(=O)CCl,N-[(2S)-4-chloro-3-oxo-1-phenylbutan-2-yl]-4-m...,N-Tosyl-L-phenylalanyl chloromethyl ketone
...,...,...,...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test,C(CO)CS,3-sulfanylpropan-1-ol,3-Mercapto-1-propanol
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test,C1N(CN(CN1CCO)CCO)CCO,"2-[3,5-bis(2-hydroxyethyl)-1,3,5-triazinan-1-y...",Triazinetriethanol
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test,CN1C(=NN=N1)S,1-methyltetrazole-5-thiol,
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test,C1CC1(C(CC2=CC=CC=C2Cl)(CN3C(=S)N=CN3)O)Cl,2-[2-(1-chlorocyclopropyl)-3-(2-chlorophenyl)-...,Prothioconazole


In [None]:
# Load the gzip-compressed feature table for the test compounds and report its
# dimensions (rows, columns).
path_features_test = "../raw_data/tox21_dense_test.csv.gz"
X_test = pd.read_csv(filepath_or_buffer=path_features_test)
X_test.shape

In [None]:
# Preview the first five rows of the feature table.
X_test.head(n=5)

In [None]:
# Give the "Unnamed: 0" column the name "ID" so it can serve as the merge key
# (presumably it holds the compound IDs - verify against the preview above).
X_test = X_test.rename({"Unnamed: 0": "ID"}, axis="columns")

In [None]:
# Join the name columns with the feature table on "ID"; the inner join keeps
# only IDs present in both frames.
X_test_with_names = pd.merge(df_ids_test, X_test, on="ID", how="inner")

### Dataframe for prediction in the backend

In [None]:
# Display the merged frame: compound identifiers and names joined with the
# test features on "ID".
X_test_with_names

### Dictionary of names for the frontend

In [None]:
# Build a mapping keyed by compound ID; each value is a dict holding that
# row's 'smiles', 'iupac_name' and 'title'.
names_by_id = X_test_with_names.set_index('ID')
names_by_id = names_by_id[['smiles', 'iupac_name', 'title']]
compound_names_dict = names_by_id.to_dict(orient='index')

In [None]:
# Display the ID-keyed mapping; each value holds the 'smiles', 'iupac_name'
# and 'title' entries for one compound.
compound_names_dict