In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Fraction of the rows assigned to the calibration split; the remaining rows
# form the production split.
calib_fraction = 0.8

In [4]:
# Base directory used by the save steps further down (commented out in the
# cells shown here, so nothing is written to it).
directory = '/home/dangnd/project/cheminfo_basic/PredictionofSmallMoleculeLipophilicity/'

# The dataset path is relative to the current working directory; it is not
# built from `directory`.
dfl = pd.read_csv('lipophilicity/Lipophilicity.csv')

# One SMILES string per row, pulled out as a 1-D NumPy array.
smiles_array = dfl.loc[:, 'smiles'].values

print(smiles_array.shape)
smiles_array

(4200,)


array(['Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14',
       'COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23',
       'COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl', ...,
       'COc1cccc2[nH]ncc12', 'Clc1ccc2ncccc2c1C(=O)NCC3CCCCC3',
       'CN1C(=O)C=C(CCc2ccc3ccccc3c2)N=C1N'], dtype=object)

In [6]:
# Vocabulary: every distinct character that occurs in any SMILES string, in
# sorted order. Tokens are single characters, so a two-letter element symbol
# such as 'Cl' contributes 'C' and 'l' as separate entries.
array_smiles_unique = np.array(sorted(set("".join(smiles_array))))

array_smiles_unique

array(['#', '%', '(', ')', '+', '-', '.', '/', '0', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', '=', '@', 'B', 'C', 'F', 'H', 'I', 'N',
       'O', 'P', 'S', '[', '\\', ']', 'c', 'e', 'i', 'l', 'n', 'o', 'r',
       's'], dtype='<U1')

In [8]:
# Character -> integer code lookup. Codes follow the sorted vocabulary order
# and start at 1, so 0 is never assigned to a character.
mapping = {
    char: code
    for code, char in enumerate(array_smiles_unique, start=1)
}

mapping

{'#': 1,
 '%': 2,
 '(': 3,
 ')': 4,
 '+': 5,
 '-': 6,
 '.': 7,
 '/': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 '=': 19,
 '@': 20,
 'B': 21,
 'C': 22,
 'F': 23,
 'H': 24,
 'I': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'S': 29,
 '[': 30,
 '\\': 31,
 ']': 32,
 'c': 33,
 'e': 34,
 'i': 35,
 'l': 36,
 'n': 37,
 'o': 38,
 'r': 39,
 's': 40}

In [9]:
# Encode every SMILES string as a list of integer codes, one code per character.
list_embedded_smiles = [
    [mapping[char] for char in smiles]
    for smiles in smiles_array
]

# Copy the source frame so each column keeps its own dtype. Rebuilding the
# frame from `dfl.values` would route a mixed-type frame through a single
# object-dtype array, leaving every column (including numeric ones) as object.
df_lipophilicity_embedded_smiles_shuffled = dfl.copy()
df_lipophilicity_embedded_smiles_shuffled['smiles embedded'] = list_embedded_smiles

# NOTE(review): despite "shuffled" in the name, nothing in this cell reorders
# the rows; they stay in the order read from the CSV. Confirm the file is
# already shuffled before relying on a positional split.

# Saving the frame to disk is currently disabled:
# df_lipophilicity_embedded_smiles_shuffled.to_pickle(directory +
#                 'df_lipophilicity_embedded_smiles_shuffled.pkl')

df_lipophilicity_embedded_smiles_shuffled

Unnamed: 0,CMPD_CHEMBLID,exp,smiles,smiles embedded
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,"[22, 37, 10, 33, 3, 22, 26, 11, 22, 22, 26, 3,..."
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,"[22, 27, 33, 10, 33, 33, 3, 27, 22, 4, 33, 3, ..."
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,"[22, 27, 22, 3, 19, 27, 4, 30, 22, 20, 20, 24,..."
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,"[27, 22, 30, 22, 20, 24, 32, 3, 27, 4, 22, 26,..."
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,"[22, 33, 10, 33, 33, 33, 33, 3, 22, 30, 22, 20..."
...,...,...,...,...
4195,CHEMBL496929,3.85,OCCc1ccc(NC(=O)c2cc3cc(Cl)ccc3[nH]2)cc1,"[27, 22, 22, 33, 10, 33, 33, 33, 3, 26, 22, 3,..."
4196,CHEMBL199147,3.21,CCN(C1CCN(CCC(c2ccc(F)cc2)c3ccc(F)cc3)CC1)C(=O...,"[22, 22, 26, 3, 22, 10, 22, 22, 26, 3, 22, 22,..."
4197,CHEMBL15932,2.1,COc1cccc2[nH]ncc12,"[22, 27, 33, 10, 33, 33, 33, 33, 11, 30, 37, 2..."
4198,CHEMBL558748,2.65,Clc1ccc2ncccc2c1C(=O)NCC3CCCCC3,"[22, 36, 33, 10, 33, 33, 33, 11, 37, 33, 33, 3..."


In [10]:
# Positional split: the first int(calib_fraction * n_rows) rows go to the
# calibration set, all remaining rows to the production set.
calib_num = int(len(df_lipophilicity_embedded_smiles_shuffled) * calib_fraction)
x_calib, x_prod = np.split(
    df_lipophilicity_embedded_smiles_shuffled['smiles embedded'].values,
    [calib_num],
)
# Saving the two arrays to disk is currently disabled:
# np.save(directory + 'x_calib_smiles_embedded.npy', x_calib)
# np.save(directory + 'x_prod_smiles_embedded.npy', x_prod)

In [11]:
# Inspect the calibration split: an array whose elements are integer-code lists.
x_calib

array([list([22, 37, 10, 33, 3, 22, 26, 11, 22, 22, 26, 3, 22, 22, 11, 4, 33, 12, 33, 33, 33, 3, 22, 36, 4, 33, 33, 12, 4, 37, 33, 13, 33, 33, 33, 33, 33, 10, 13]),
       list([22, 27, 33, 10, 33, 33, 3, 27, 22, 4, 33, 3, 33, 33, 10, 26, 22, 3, 19, 27, 4, 22, 29, 22, 22, 3, 19, 27, 4, 27, 4, 29, 3, 19, 27, 4, 3, 19, 27, 4, 26, 11, 22, 3, 22, 4, 22, 22, 33, 12, 33, 33, 33, 33, 33, 11, 12]),
       list([22, 27, 22, 3, 19, 27, 4, 30, 22, 20, 20, 24, 32, 3, 26, 10, 22, 22, 33, 11, 40, 33, 33, 33, 11, 22, 10, 4, 33, 12, 33, 33, 33, 33, 33, 12, 22, 36]),
       ...,
       list([27, 33, 10, 33, 11, 22, 3, 19, 27, 4, 26, 26, 22, 3, 19, 27, 4, 33, 11, 37, 33, 12, 33, 33, 3, 22, 36, 4, 33, 33, 3, 22, 36, 4, 33, 10, 12]),
       list([22, 22, 26, 3, 22, 22, 4, 22, 3, 19, 27, 4, 33, 10, 33, 33, 33, 3, 33, 33, 10, 4, 22, 3, 19, 22, 11, 22, 22, 26, 22, 22, 11, 4, 33, 12, 33, 33, 33, 33, 3, 33, 12, 4, 22, 3, 23, 4, 3, 23, 4, 23]),
       list([22, 22, 30, 22, 20, 24, 32, 3, 22, 4, 30, 22, 20, 20, 

In [12]:
# Inspect the production split: an array whose elements are integer-code lists.
x_prod

array([list([22, 27, 22, 22, 22, 27, 33, 10, 33, 33, 37, 33, 3, 22, 30, 29, 5, 32, 3, 30, 27, 6, 32, 4, 33, 11, 37, 33, 12, 33, 33, 33, 33, 33, 12, 30, 37, 24, 32, 11, 4, 33, 10, 22]),
       list([26, 29, 3, 19, 27, 4, 3, 19, 27, 4, 33, 10, 33, 33, 33, 11, 22, 3, 19, 22, 26, 22, 3, 19, 27, 4, 33, 11, 33, 10, 4, 22, 3, 19, 27, 4, 26, 22, 30, 22, 20, 20, 24, 32, 3, 27, 4, 22, 26, 12, 22, 22, 22, 3, 22, 22, 12, 4, 27, 33, 13, 33, 33, 33, 3, 22, 36, 4, 33, 3, 22, 36, 4, 33, 13]),
       list([27, 19, 22, 10, 27, 22, 3, 19, 22, 33, 11, 33, 33, 33, 33, 33, 10, 11, 4, 33, 12, 33, 33, 33, 33, 33, 12]),
       list([22, 22, 3, 22, 4, 30, 22, 20, 20, 24, 32, 10, 26, 3, 22, 4, 33, 11, 33, 33, 33, 33, 33, 11, 22, 30, 22, 20, 20, 24, 32, 3, 22, 27, 4, 26, 22, 10, 19, 27]),
       list([22, 22, 3, 19, 27, 4, 26, 22, 33, 10, 33, 33, 33, 3, 22, 26, 11, 22, 22, 26, 3, 22, 22, 11, 4, 33, 12, 33, 33, 33, 33, 33, 12, 4, 33, 33, 10]),
       list([22, 27, 33, 10, 33, 33, 33, 3, 22, 26, 11, 22, 3, 19, 27, 

In [13]:
# Shapes of the two splits, shown as a (calibration, production) pair.
x_calib.shape, x_prod.shape

((3360,), (840,))