### Converting the InChIKeys of the compounds in the test dataset into IUPAC names (or into plain-English synonyms, if available)

Imports:

In [1]:
import pandas as pd
import requests
import time

In [2]:
# Load the compound table; the ID, inchikey and set columns are used below.
df = pd.read_csv('../raw_data/tox21_compoundData.csv')

In [3]:
# Keep only the identifier columns: sample ID, InChIKey and the split label.
df_ids = df.loc[:, ['ID', 'inchikey', 'set']]

In [4]:
# Select the rows labelled 'test' and renumber them from 0.
df_ids_test = df_ids.loc[df_ids['set'].eq('test')].reset_index(drop=True)
df_ids_test

Unnamed: 0,ID,inchikey,set
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test
...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test


### Functions that collect the names of the compounds from PubChem based on their InChIKeys

In [5]:
def get_iupac_name(inchikey):
    """Look up a compound's IUPAC name on PubChem from its InChIKey.

    Sends one GET request (10-second timeout) to the PUG REST ``IUPACName``
    property endpoint and reads the name from the first entry of the
    returned property table.

    Args:
        inchikey: InChIKey string identifying the compound.

    Returns:
        The IUPAC name as a string, or ``None`` when ``inchikey`` is not a
        non-empty string, the response status is not 200, the request
        fails, or the payload does not have the expected structure.
    """
    # A missing key (e.g. NaN read from a CSV) cannot be looked up; skip the request.
    if not isinstance(inchikey, str) or not inchikey:
        return None

    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/IUPACName/JSON'
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        data = res.json()
        return data['PropertyTable']['Properties'][0]['IUPACName']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network failures, undecodable JSON and
        # unexpected payload shapes all mean "no name available".
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a
        # long batch of lookups can still be interrupted.
        return None


In [6]:
def get_title(inchikey):
    """Look up a compound's PubChem title from its InChIKey.

    Sends one GET request (10-second timeout) to the PUG REST ``Title``
    property endpoint and reads the title from the first entry of the
    returned property table.

    Args:
        inchikey: InChIKey string identifying the compound.

    Returns:
        The title as a string, or ``None`` when ``inchikey`` is not a
        non-empty string, the response status is not 200, the request
        fails, or the payload does not have the expected structure.
    """
    # A missing key (e.g. NaN read from a CSV) cannot be looked up; skip the request.
    if not isinstance(inchikey, str) or not inchikey:
        return None

    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/Title/JSON'
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        data = res.json()
        return data['PropertyTable']['Properties'][0]['Title']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network failures, undecodable JSON and
        # unexpected payload shapes all mean "no title available".
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a
        # long batch of lookups can still be interrupted.
        return None

In [7]:
def get_synonyms(inchikey):
    """Look up up to five PubChem synonyms for a compound from its InChIKey.

    Sends one GET request (10-second timeout) to the PUG REST ``synonyms``
    endpoint and takes the first five synonyms listed for the first entry
    of the returned information list.

    Args:
        inchikey: InChIKey string identifying the compound.

    Returns:
        The synonyms joined with ``'; '`` as a single string, or ``None``
        when ``inchikey`` is not a non-empty string, the response status is
        not 200, the request fails, or the payload does not have the
        expected structure.
    """
    # A missing key (e.g. NaN read from a CSV) cannot be looked up; skip the request.
    if not isinstance(inchikey, str) or not inchikey:
        return None

    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/synonyms/JSON'
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None
        data = res.json()
        return '; '.join(data['InformationList']['Information'][0]['Synonym'][:5])
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network failures, undecodable JSON and
        # unexpected payload shapes all mean "no synonyms available".
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a
        # long batch of lookups can still be interrupted.
        return None

In [8]:
# One accumulator list per PubChem field, filled in row order by the fetch loop.
iupac_names, titles, synonyms = [], [], []

# Number of test compounds, used for the progress messages.
total = df_ids_test.shape[0]


In [9]:
# Reset the accumulators so that re-running this cell does not append a second
# copy of every result (the later column assignment would then fail on length).
iupac_names.clear()
titles.clear()
synonyms.clear()

for i, inchikey in enumerate(df_ids_test['inchikey'], 1):
    # Three PubChem requests per compound; a failed lookup comes back as None.
    iupac_names.append(get_iupac_name(inchikey))
    titles.append(get_title(inchikey))
    synonyms.append(get_synonyms(inchikey))

    # Short pause between compounds to limit the request rate to PubChem.
    time.sleep(0.2)
    if i % 50 == 0:
        print(f"Processed {i} of {total} compounds")

Processed 50 of 647 compounds
Processed 100 of 647 compounds
Processed 150 of 647 compounds
Processed 200 of 647 compounds
Processed 250 of 647 compounds
Processed 300 of 647 compounds
Processed 350 of 647 compounds
Processed 400 of 647 compounds
Processed 450 of 647 compounds
Processed 500 of 647 compounds
Processed 550 of 647 compounds
Processed 600 of 647 compounds


In [10]:
# Attach each list of PubChem results as a new column; the lists were filled
# in the same row order as df_ids_test.
for column, values in (
    ('iupac_name', iupac_names),
    ('title', titles),
    ('synonyms', synonyms),
):
    df_ids_test[column] = values

In [11]:
# Preview the test-set frame with the iupac_name, title and synonyms columns added.
df_ids_test

Unnamed: 0,ID,inchikey,set,iupac_name,title,synonyms
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test,"(Z)-but-2-enedioic acid;(8S,10S,13S,14S,17S)-1...",U-74389G maleate,u-74389g; U-74389G maleate; 153190-29-5; U 743...
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test,sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-o...,,
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test,"3,13,21-triazapentacyclo[11.8.0.02,10.04,9.015...",Rutecarpine,Rutaecarpine; 84-26-4; Rutecarpine; Rutaecarpi...
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test,5-[3-[bis(4-fluorophenyl)methoxy]propyl]-1H-im...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test,N-[(2S)-4-chloro-3-oxo-1-phenylbutan-2-yl]-4-m...,N-Tosyl-L-phenylalanyl chloromethyl ketone,402-71-1; TOSYLPHENYLALANYL CHLOROMETHYL KETON...
...,...,...,...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test,3-sulfanylpropan-1-ol,3-Mercapto-1-propanol,"3-Mercapto-1-propanol; 19721-22-3; 1-Propanol,..."
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test,"2-[3,5-bis(2-hydroxyethyl)-1,3,5-triazinan-1-y...",Triazinetriethanol,"Triazinetriethanol; Grotan BK; 1,3,5-Triazine-..."
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test,1-methyltetrazole-5-thiol,,
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test,2-[2-(1-chlorocyclopropyl)-3-(2-chlorophenyl)-...,Prothioconazole,"Prothioconazole; 178928-70-6; 3H-1,2,4-Triazol..."


In [25]:
# Show the full, untruncated IUPAC name stored for the row labelled 1.
df_ids_test.loc[1, 'iupac_name']

'sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-oxido-2-oxo-4a,6,7,7a-tetrahydro-4H-furo[3,2-d][1,3,2]dioxaphosphinin-7-ol'

In [14]:
# Load the test feature table. With read_csv's default compression='infer',
# pandas decompresses the gzip file based on its .gz extension.
path_features_test = "../raw_data/tox21_dense_test.csv.gz" 
X_test = pd.read_csv(path_features_test)
# (rows, columns) of the loaded table
X_test.shape

(647, 802)

In [18]:
# Preview the first rows of the feature table.
X_test.head()

Unnamed: 0.1,Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
0,NCGC00261900-01,26124820.0,12.688,2.226,3.226,37.329,25.44,3.663,24.2,20.222,...,9687.312,42351.907,194.444,-2518.829,-83.11,772.051,10001.075,131.633,145.967,5.499
1,NCGC00260869-01,8333337.0,17.5,2.167,2.923,16.353,10.872,1.193,11.116,9.279,...,1256.41,2621.885,104.011,-475.829,-33.456,219.411,1003.763,76.703,76.043,3.728
2,NCGC00261776-01,4.074,12.464,2.364,3.043,14.681,10.826,2.149,9.98,9.469,...,1072.43,3152.648,93.486,-341.628,-21.327,174.791,638.757,32.885,45.933,3.657
3,NCGC00261380-01,8000005.0,13.827,2.08,2.845,16.778,11.72,0.777,10.139,8.207,...,1408.177,4596.402,127.215,-519.799,-27.729,199.061,813.323,35.712,58.214,3.659
4,NCGC00261842-01,4.838,14.509,2.087,2.88,16.872,10.92,0.413,10.035,7.719,...,1217.075,4343.46,134.802,-816.522,-55.496,192.858,1168.142,33.19,57.065,3.635


In [19]:
# Name the unnamed first column 'ID' so it can serve as the merge key.
X_test = X_test.rename({'Unnamed: 0': 'ID'}, axis='columns')

In [20]:
# Inner-join the name columns onto the features by compound ID; IDs present
# in only one of the two frames are dropped.
X_test_with_names = pd.merge(df_ids_test, X_test, on="ID", how="inner")

In [21]:
# Preview the merged frame (name columns followed by the feature columns).
X_test_with_names

Unnamed: 0,ID,inchikey,set,iupac_name,title,synonyms,AW,AWeight,Arto,BertzCT,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
0,NCGC00261900-01,ABCSSKWSUJMJCP-WQDFMEOSSA-N,test,"(Z)-but-2-enedioic acid;(8S,10S,13S,14S,17S)-1...",U-74389G maleate,u-74389g; U-74389G maleate; 153190-29-5; U 743...,2.612482e+07,12.688,2.226,3.226,...,9687.312,42351.907,194.444,-2518.829,-83.110,772.051,10001.075,131.633,145.967,5.499
1,NCGC00260869-01,DMRMZQATXPQOTP-XIIVPSJUSA-M,test,sodium;(7aR)-6-(6-amino-8-bromopurin-9-yl)-2-o...,,,8.333337e+06,17.500,2.167,2.923,...,1256.410,2621.885,104.011,-475.829,-33.456,219.411,1003.763,76.703,76.043,3.728
2,NCGC00261776-01,ACVGWSKVRYFWRP-UHFFFAOYSA-N,test,"3,13,21-triazapentacyclo[11.8.0.02,10.04,9.015...",Rutecarpine,Rutaecarpine; 84-26-4; Rutecarpine; Rutaecarpi...,4.074000e+00,12.464,2.364,3.043,...,1072.430,3152.648,93.486,-341.628,-21.327,174.791,638.757,32.885,45.933,3.657
3,NCGC00261380-01,LCHACRBDLUKTTM-UHFFFAOYSA-N,test,5-[3-[bis(4-fluorophenyl)methoxy]propyl]-1H-im...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...,3-(1H-Imidazol-4-yl)propyl di(p-fluorophenyl)m...,8.000005e+06,13.827,2.080,2.845,...,1408.177,4596.402,127.215,-519.799,-27.729,199.061,813.323,35.712,58.214,3.659
4,NCGC00261842-01,MQUQNUAYKLCRME-INIZCTEOSA-N,test,N-[(2S)-4-chloro-3-oxo-1-phenylbutan-2-yl]-4-m...,N-Tosyl-L-phenylalanyl chloromethyl ketone,402-71-1; TOSYLPHENYLALANYL CHLOROMETHYL KETON...,4.838000e+00,14.509,2.087,2.880,...,1217.075,4343.460,134.802,-816.522,-55.496,192.858,1168.142,33.190,57.065,3.635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,NCGC00357168-01,SHLSSLVZXJBVHE-UHFFFAOYSA-N,test,3-sulfanylpropan-1-ol,3-Mercapto-1-propanol,"3-Mercapto-1-propanol; 19721-22-3; 1-Propanol,...",2.000000e+00,16.820,1.600,1.157,...,25.161,207.065,25.378,-34.751,-8.135,45.801,62.715,7.705,6.893,1.868
643,NCGC00357283-01,HUHGPYXAVBJSJV-UHFFFAOYSA-N,test,"2-[3,5-bis(2-hydroxyethyl)-1,3,5-triazinan-1-y...",Triazinetriethanol,"Triazinetriethanol; Grotan BK; 1,3,5-Triazine-...",3.714000e+00,13.208,2.000,2.134,...,406.748,2632.001,51.600,-186.503,-25.487,151.500,547.566,24.878,29.311,2.976
644,NCGC00357210-01,XOHZHMUQBFJTNH-UHFFFAOYSA-N,test,1-methyltetrazole-5-thiol,,,1.905000e+00,16.017,2.000,2.295,...,49.621,146.562,34.623,-39.767,-6.189,38.729,44.481,7.121,12.809,1.779
645,NCGC00357118-01,MNHVNIJQQRJYDH-UHFFFAOYSA-N,test,2-[2-(1-chlorocyclopropyl)-3-(2-chlorophenyl)-...,Prothioconazole,"Prothioconazole; 178928-70-6; 3H-1,2,4-Triazol...",4.186000e+00,15.674,2.190,2.851,...,902.831,2855.278,112.538,-443.321,-28.525,142.805,562.539,28.106,55.845,3.282
