In [2]:
import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile
import pickle
import os

In [3]:
# DEBUG switches the notebook to a smaller row budget for quick experiments.
# NUM_ROWS is the number of leading rows the train loaders keep; the loaders
# reshape to (-1, 3), so it must stay a multiple of 3.
# NOTE(review): 295_246_830 is presumably the full row count of train.parquet - confirm.
DEBUG = False
NUM_ROWS = 30_000_000 if DEBUG else 295_246_830

In [4]:
# Relative path of the raw training parquet file; the train loaders below read from it.
dataset_path = '../../train.parquet'

In [5]:
def get_binds(dataset_path):
    """Load the ``binds`` column and return it as an ``(n, 3)`` int8 array.

    Only the first ``NUM_ROWS`` values (module-level setting) are kept, and
    every run of three consecutive values becomes one output row.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file to read.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``byte`` (int8) with three columns.

    Raises
    ------
    ValueError
        If the number of kept values is not a multiple of 3.
    """
    frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=['binds'])
    labels = frame['binds'].to_numpy()[:NUM_ROWS]
    return labels.astype('byte').reshape(-1, 3)

# Three-column label array; the columns are later stored as binds_BRD4, binds_HSA and binds_sEH.
binds = get_binds(dataset_path)

In [6]:
def get_unique_BB(dataset_path, col):
    """Build index lookup tables for the distinct values of one column.

    The first ``NUM_ROWS`` rows (module-level setting) of ``col`` are grouped
    into consecutive triples. Each triple is expected to repeat one value;
    ``ERROR`` is printed once for every later position (second, third) that
    disagrees with the first anywhere in the data, and processing continues.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file to read.
    col : str
        Name of the column to index.

    Returns
    -------
    tuple[dict, dict]
        ``(value -> index, index -> value)``, with indices assigned in the
        sorted order produced by ``np.unique``.
    """
    frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])
    values = frame[:NUM_ROWS].to_numpy()[:, 0]
    triples = values.reshape(-1, 3)
    first = triples[:, 0]

    # Sanity check: positions 1 and 2 of every triple should equal position 0.
    for position in (1, 2):
        if np.mean(first == triples[:, position]) != 1:
            print('ERROR')

    vocabulary = list(np.unique(first))
    forward = {value: index for index, value in enumerate(vocabulary)}
    backward = dict(enumerate(vocabulary))
    return forward, backward

# Build the value->index and index->value tables for each of the three
# building-block columns and print how many distinct values each one has.
BBs_dict_1, BBs_dict_reverse_1 = get_unique_BB(dataset_path, 'buildingblock1_smiles')
print(len(BBs_dict_1))
BBs_dict_2, BBs_dict_reverse_2 = get_unique_BB(dataset_path, 'buildingblock2_smiles')
print(len(BBs_dict_2))
BBs_dict_3, BBs_dict_reverse_3 = get_unique_BB(dataset_path, 'buildingblock3_smiles')
print(len(BBs_dict_3))

271
693
872


In [7]:
def get_encoded(dataset_path, col, BBs_dict):
    """Encode one column as int16 indices, one per triple of rows.

    The first ``NUM_ROWS`` rows (module-level setting) of ``col`` are grouped
    into consecutive triples and the first value of each triple is looked up
    in ``BBs_dict``.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file to read.
    col : str
        Name of the column to encode.
    BBs_dict : dict
        Mapping from column value to integer index.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``int16`` array of indices.

    Raises
    ------
    KeyError
        If a value is missing from ``BBs_dict``.
    """
    frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])
    values = frame[:NUM_ROWS][col].to_numpy()
    # Keep only the first entry of every consecutive group of three rows.
    first_of_triple = values.reshape(-1, 3)[:, 0]
    codes = [BBs_dict[value] for value in first_of_triple]
    return np.asarray(codes, dtype=np.int16)

# Encode each building-block column with the lookup table built for that column.
encoded_BBs_1 = get_encoded(dataset_path, 'buildingblock1_smiles', BBs_dict_1)
encoded_BBs_2 = get_encoded(dataset_path, 'buildingblock2_smiles', BBs_dict_2)
encoded_BBs_3 = get_encoded(dataset_path, 'buildingblock3_smiles', BBs_dict_3)

In [8]:
def get_molecule_smiles(dataset_path):
    """Return one ``molecule_smiles`` value per consecutive triple of rows.

    When the module-level ``DEBUG`` flag is set, the column is read from the
    CSV file that sits next to ``dataset_path`` (limited to ``NUM_ROWS``
    rows); otherwise it is read from the parquet file itself. In both cases
    the result is truncated to ``NUM_ROWS`` rows, so its length always matches
    the other train loaders, which apply the same limit.

    Each triple is expected to repeat one value; ``ERROR`` is printed once
    for every later position (second, third) that disagrees with the first
    anywhere in the data, and processing continues.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file; must end in ``parquet``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the first value of every triple.

    Raises
    ------
    ValueError
        If the number of kept rows is not a multiple of 3.
    """
    if DEBUG:
        # dataset_path[:-7] drops the trailing 'parquet', leaving '<name>.' + 'csv'.
        frame = pd.read_csv(f'{dataset_path[:-7]}csv', usecols=['molecule_smiles'], nrows=NUM_ROWS)
    else:
        frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=['molecule_smiles'])

    # Apply the row limit on both paths (a no-op for the CSV branch).
    smiles = frame.molecule_smiles.to_numpy()[:NUM_ROWS]
    triples = np.reshape(smiles, [-1, 3])
    first = triples[:, 0]

    # Sanity check: positions 1 and 2 of every triple should equal position 0.
    for position in (1, 2):
        if np.mean(first == triples[:, position]) != 1:
            print('ERROR')
    return first

# One SMILES string per consecutive triple of rows in the train file.
molecule_smiles = get_molecule_smiles(dataset_path)

In [9]:
# Assemble the compact training table: encoded building blocks, the molecule
# SMILES, and one label column per column of `binds`.
data = {
    'buildingblock1_smiles': encoded_BBs_1,
    'buildingblock2_smiles': encoded_BBs_2,
    'buildingblock3_smiles': encoded_BBs_3,
    'molecule_smiles': molecule_smiles,
    'binds_BRD4': binds[:, 0],
    'binds_HSA': binds[:, 1],
    'binds_sEH': binds[:, 2],
}
df = pd.DataFrame(data)
df.head(2)

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,binds_BRD4,binds_HSA,binds_sEH
0,0,0,0,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,0,0,0
1,0,0,1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,0,0,0


In [11]:
# Create the whole output tree in one call. makedirs(exist_ok=True) is safe to
# re-run and still creates 'train_dicts' when '../train_data/' already exists;
# any other failure (e.g. permissions) is raised instead of silently ignored.
os.makedirs('../train_data/train_dicts', exist_ok=True)
df.to_parquet('../train_data/train.parquet', index = False)

# Persist every lookup table next to the data. The context manager closes each
# file as soon as it is written, so nothing is left buffered in an open handle.
for _name, _table in [
    ('BBs_dict_1', BBs_dict_1),
    ('BBs_dict_2', BBs_dict_2),
    ('BBs_dict_3', BBs_dict_3),
    ('BBs_dict_reverse_1', BBs_dict_reverse_1),
    ('BBs_dict_reverse_2', BBs_dict_reverse_2),
    ('BBs_dict_reverse_3', BBs_dict_reverse_3),
]:
    with open(f'../train_data/train_dicts/{_name}.p', 'bw') as _fh:
        pickle.dump(_table, _fh)

In [12]:
# Relative path of the raw test parquet file; the test-set cells below read from it.
test_path = '../../test.parquet'

In [13]:
# Read both columns in a single pass over the test file instead of opening it
# once per column.
test_columns = pd.read_parquet(test_path, engine = 'pyarrow', columns=['molecule_smiles', 'protein_name'])
molecule_smiles = test_columns.molecule_smiles.to_numpy()
protein_name = test_columns.protein_name.to_numpy()
del test_columns  # only the two arrays are needed from here on

# For every distinct molecule (kept in first-occurrence order) record:
#   molecule_smiles_unique[smiles] -> all row indices where it appears
#   is_BRD4 / is_HSA / is_sEH[smiles] -> whether any of those rows names that protein
# first_unique_molecule_smiles_indices holds the row index of each first occurrence.
first_unique_molecule_smiles_indices = []
molecule_smiles_unique = {}
is_BRD4 = {}
is_HSA = {}
is_sEH = {}
for i,x in enumerate(molecule_smiles):
    if x not in molecule_smiles_unique:
        # First sighting: create the entry and start all three flags at False.
        molecule_smiles_unique[x] = []
        first_unique_molecule_smiles_indices.append(i)
        is_BRD4[x] = False
        is_HSA[x] = False
        is_sEH[x] = False
    molecule_smiles_unique[x].append(i)
    # Flag update is the same for first and repeated sightings.
    if protein_name[i] == 'BRD4':
        is_BRD4[x] = True
    elif protein_name[i] == 'HSA':
        is_HSA[x] = True
    elif protein_name[i] == 'sEH':
        is_sEH[x] = True
# Explicit integer dtype keeps this usable as an index array even when empty.
first_unique_molecule_smiles_indices = np.asarray(first_unique_molecule_smiles_indices, dtype=np.int64)
print(len(is_BRD4))
print(np.sum([is_BRD4[x] for x in is_BRD4]))
print(np.sum([is_HSA[x] for x in is_HSA]))
print(np.sum([is_sEH[x] for x in is_sEH]))

# Sanity check: the rows picked at the first-occurrence indices must all be distinct.
molecule_smiles_unique_arr = molecule_smiles[first_unique_molecule_smiles_indices]
print(len(np.unique(molecule_smiles_unique_arr)) == len(molecule_smiles_unique_arr))

878022
558859
557895
558142
True


In [14]:
# Turn the per-molecule flag dictionaries into arrays ordered like
# molecule_smiles_unique (first-occurrence order), then print how many
# molecules carry each flag.
is_BRD4_arr = np.array([is_BRD4[smiles] for smiles in molecule_smiles_unique])
is_HSA_arr = np.array([is_HSA[smiles] for smiles in molecule_smiles_unique])
is_sEH_arr = np.array([is_sEH[smiles] for smiles in molecule_smiles_unique])

print(is_BRD4_arr.sum())
print(is_HSA_arr.sum())
print(is_sEH_arr.sum())

558859
557895
558142


In [15]:
def get_unique_BB_test(dataset_path, col):
    """Build index lookup tables for the distinct values of one column.

    Every row of ``col`` is considered: no row limit is applied and rows are
    not grouped into triples.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file to read.
    col : str
        Name of the column to index.

    Returns
    -------
    tuple[dict, dict]
        ``(value -> index, index -> value)``, with indices assigned in the
        sorted order produced by ``np.unique``.
    """
    column = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])[col].to_numpy()
    vocabulary = list(np.unique(column))
    forward = {value: index for index, value in enumerate(vocabulary)}
    backward = dict(enumerate(vocabulary))
    return forward, backward

# Build separate lookup tables from the test file for each building-block
# column and print how many distinct values each one has.
BBs_dict_1_test, BBs_dict_reverse_1_test = get_unique_BB_test(test_path, 'buildingblock1_smiles')
print(len(BBs_dict_1_test))
BBs_dict_2_test, BBs_dict_reverse_2_test = get_unique_BB_test(test_path, 'buildingblock2_smiles')
print(len(BBs_dict_2_test))
BBs_dict_3_test, BBs_dict_reverse_3_test = get_unique_BB_test(test_path, 'buildingblock3_smiles')
print(len(BBs_dict_3_test))

341
1140
1389


In [17]:
def get_encoded_test(dataset_path, col, BBs_dict):
    """Encode one column as int16 indices for the de-duplicated test rows.

    Only the rows listed in the module-level
    ``first_unique_molecule_smiles_indices`` array are kept, in that order.

    Parameters
    ----------
    dataset_path : str
        Path of the parquet file to read.
    col : str
        Name of the column to encode.
    BBs_dict : dict
        Mapping from column value to integer index.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``int16`` array, one entry per selected row.

    Raises
    ------
    KeyError
        If a value is missing from ``BBs_dict``.
    """
    frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])
    selected = frame[col].to_numpy()[first_unique_molecule_smiles_indices]
    codes = [BBs_dict[value] for value in selected]
    return np.asarray(codes, dtype=np.int16)

# Encode each building-block column of the test file with its own test-set lookup table.
encoded_BBs_1_test = get_encoded_test(test_path, 'buildingblock1_smiles', BBs_dict_1_test)
encoded_BBs_2_test = get_encoded_test(test_path, 'buildingblock2_smiles', BBs_dict_2_test)
encoded_BBs_3_test = get_encoded_test(test_path, 'buildingblock3_smiles', BBs_dict_3_test)

In [18]:
# Assemble the compact test table: one row per distinct molecule, with encoded
# building blocks and a flag per protein.
data = {
    'buildingblock1_smiles': encoded_BBs_1_test,
    'buildingblock2_smiles': encoded_BBs_2_test,
    'buildingblock3_smiles': encoded_BBs_3_test,
    'molecule_smiles': molecule_smiles_unique_arr,
    'is_BRD4': is_BRD4_arr,
    'is_HSA': is_HSA_arr,
    'is_sEH': is_sEH_arr,
}
df = pd.DataFrame(data)
df.head(2)

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,is_BRD4,is_HSA,is_sEH
0,0,17,17,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,True,True,True
1,0,17,87,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,True,True,True


In [19]:
# Create the whole output tree in one call. makedirs(exist_ok=True) is safe to
# re-run and still creates 'test_dicts' when '../test_data/' already exists;
# any other failure (e.g. permissions) is raised instead of silently ignored.
os.makedirs('../test_data/test_dicts', exist_ok=True)
df.to_parquet('../test_data/test.parquet', index = False)

# Save the *test* lookup tables: the test frame was encoded with the `*_test`
# tables, so these are the ones able to decode it (the train tables assign
# different indices). File names are kept unchanged.
for _name, _table in [
    ('BBs_dict_1', BBs_dict_1_test),
    ('BBs_dict_2', BBs_dict_2_test),
    ('BBs_dict_3', BBs_dict_3_test),
    ('BBs_dict_reverse_1', BBs_dict_reverse_1_test),
    ('BBs_dict_reverse_2', BBs_dict_reverse_2_test),
    ('BBs_dict_reverse_3', BBs_dict_reverse_3_test),
]:
    # The context manager closes each file as soon as it is written.
    with open(f'../test_data/test_dicts/{_name}.p', 'bw') as _fh:
        pickle.dump(_table, _fh)