In [None]:
# Mount Google Drive at /content/drive (Colab-only import). The data directory
# used by this notebook (RAW_DATA) lives under /content/drive/MyDrive.
# force_remount=True is passed so the call also works when a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


In [None]:
import itertools
import pickle
import re

import numpy as np
import pandas as pd
import rdkit
from tqdm import tqdm

# Directory on the mounted Drive that holds the raw datasets and the artifacts
# written by this notebook.
RAW_DATA = '/content/drive/MyDrive/Generative_ML/current_data/raw_data/'

# SMILES tokenizer pattern: bracket atoms, one- and two-letter atoms, bonds,
# branches, ring-closure digits and %NN ring closures.
# Written as a raw string so the regex escapes (\[, \(, \., ...) are not parsed
# as (invalid) Python string escapes, which emit warnings on newer Python
# versions. The compiled pattern is character-for-character the same as before
# (the literal-backslash alternative is spelled `\\` in raw form).
REGEX_PATTERN = r"(\[[^\]]+]|<|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|@@|\?|>|!|~|\*|\$|\%[0-9]{2}|[0-9])"
regex = re.compile(REGEX_PATTERN)

In [None]:
# Load the four source collections of SMILES strings.
# Later cells reference MOSES, GUACAMOL and CHEMBL as well as BINDING_DB, so
# all four must be loaded here for the notebook to run top to bottom.

# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by this project.
with open(RAW_DATA + 'smile_set_bindingDB_07_11.pkl', 'rb') as f:
    BINDING_DB = pickle.load(f)

MOSES = pd.read_csv(RAW_DATA + 'MOSES.csv')['SMILES'].unique().tolist()

# One SMILES per line; trailing whitespace/newline is stripped.
with open(RAW_DATA + 'guacamol_v1_all.smiles', 'r') as f:
    GUACAMOL = [rline.rstrip() for rline in f]

CHEMBL = pd.read_csv(RAW_DATA + 'chembl_33.txt', sep='\t')['canonical_smiles'].unique().tolist()

# Prepare training/validation partitions

In [None]:
# Tokenize every BindingDB SMILES and collect:
#   token_to_freq  - occurrence count per token
#   vocab          - set of distinct tokens
#   block_sizes    - token count of each SMILES, in iteration order
#   max_block_size - largest token count seen
token_to_freq = {}
vocab = set()
block_sizes = []
max_block_size = 0
pbar = tqdm(BINDING_DB, total=len(BINDING_DB))
for smile in pbar:
    tokens = regex.findall(smile.strip())
    n_tokens = len(tokens)
    block_sizes.append(n_tokens)
    if n_tokens > max_block_size:
        max_block_size = n_tokens
    vocab.update(tokens)
    for token in tokens:
        token_to_freq[token] = token_to_freq.get(token, 0) + 1

100%|██████████| 1161892/1161892 [00:45<00:00, 25798.71it/s]


In [None]:
# Number of distinct tokens and the longest tokenized SMILES (in tokens)
# found in the loop over BINDING_DB.
len(vocab), max_block_size

(200, 1503)

In [None]:
# Token frequencies ordered from most to least frequent (dicts keep insertion order).
by_count_desc = sorted(token_to_freq.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(by_count_desc)
sorted_dict

# Combine datasets

In [None]:
# Deduplicate each source into a set, report its size, then take the union.
# `total` is the sum of per-source sizes, so total - len(all_smiles) measures
# how many entries are shared between sources.
origin_to_smiles = {
    'moses': set(MOSES),
    'bindingDB': set(BINDING_DB),
    'ChemBL': set(CHEMBL),
    'GuacaMol': set(GUACAMOL),
}
for origin, smileset in origin_to_smiles.items():
    print(f'{origin} contains {len(smileset)}')
total = sum(len(members) for members in origin_to_smiles.values())
all_smiles = set().union(*origin_to_smiles.values())
len(all_smiles), total

moses contains 1936962
bindingDB contains 1161892
ChemBL contains 2372528
GuacaMol contains 1591378


(5772122, 7062760)

In [None]:
# Invert origin_to_smiles: map each SMILES string to the list of sources that
# contain it (sources appear in origin_to_smiles iteration order).
smiles_to_origin = {}
for origin, smiles in origin_to_smiles.items():
    for smile in smiles:
        if smile not in smiles_to_origin:
            smiles_to_origin[smile] = []
        smiles_to_origin[smile].append(origin)

In [None]:
# Persist the union of all SMILES. The context manager guarantees the file is
# flushed and closed before the next cell runs.
with open(RAW_DATA + 'all_smiles.pkl', 'wb') as f:
    pickle.dump(all_smiles, f)

In [None]:
# Persist both lookup tables (SMILES -> sources, source -> SMILES set).
# Each file is opened in a context manager so it is flushed and closed
# deterministically instead of relying on garbage collection.
with open(RAW_DATA + 'smiles_to_origin.pkl', 'wb') as f:
    pickle.dump(smiles_to_origin, f)
with open(RAW_DATA + 'origin_to_smiles.pkl', 'wb') as f:
    pickle.dump(origin_to_smiles, f)

In [None]:
# Reload the combined SMILES set written by this notebook.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by this project.
with open(RAW_DATA + 'all_smiles.pkl', 'rb') as f:
    all_smiles = pickle.load(f)

In [None]:
# Reproducible shuffle for the train/test split.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.95

# Sort before shuffling: the iteration order of a set of strings can differ
# between interpreter sessions (string hash randomization), so seeding alone
# would not make the split repeatable.
smiles_arr = sorted(all_smiles)
np.random.default_rng(SPLIT_SEED).shuffle(smiles_arr)  # in-place shuffle of the list

train_size = int(TRAIN_FRACTION * len(smiles_arr))
print(train_size, len(smiles_arr) - train_size)


5483515 288607


In [None]:
# The first `train_size` shuffled SMILES form the training split; the
# remainder is the test split.
train_set, test_set = smiles_arr[:train_size], smiles_arr[train_size:]

In [None]:
# Wrap each split in a single-column frame and write it next to the raw data.
# The default index is written as the first CSV column.
train_df = pd.DataFrame(data={"smiles": train_set})
train_df.to_csv(RAW_DATA + 'combined_train.csv')

test_df = pd.DataFrame(data={"smiles": test_set})
test_df.to_csv(RAW_DATA + 'combined_test.csv')

In [None]:
# Pairwise overlap between the source datasets.
# Reuses the per-source sets already built in `origin_to_smiles` (same names,
# same order: moses, bindingDB, ChemBL, GuacaMol) instead of rebuilding four
# multi-million-element sets from the raw lists.
datasets = list(origin_to_smiles.items())
pairs = list(itertools.combinations(datasets, 2))
for (name1, ds1), (name2, ds2) in pairs:
    print(f"{name1} has {len(ds1)} molecules")
    print(f"{name2} has {len(ds2)} molecules")
    print(f"{name1} and {name2} have {len(ds1 & ds2)} molecules in common")


moses has 1936962 molecules
bindingDB has 1161892 molecules
moses and bindingDB have 2567 molecules in common
moses has 1936962 molecules
ChemBL has 2372528 molecules
moses and ChemBL have 86661 molecules in common
moses has 1936962 molecules
GuacaMol has 1591378 molecules
moses and GuacaMol have 71509 molecules in common
bindingDB has 1161892 molecules
ChemBL has 2372528 molecules
bindingDB and ChemBL have 105799 molecules in common
bindingDB has 1161892 molecules
GuacaMol has 1591378 molecules
bindingDB and GuacaMol have 63253 molecules in common
ChemBL has 2372528 molecules
GuacaMol has 1591378 molecules
ChemBL and GuacaMol have 1092369 molecules in common


In [None]:
# Build one row per unique molecule with a membership flag per source dataset.
# A flag is True when the molecule is in that source's set and NaN otherwise.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending with `all_mols.loc[i] = ...` re-grows the frame on every row,
# which is prohibitively slow for millions of molecules.
ALL_MOLS_COLUMNS = ['smiles', 'isInBindingDB', 'isInMOSES', 'isInChemBL', 'isInGuacaMol']
# Keys of `origin_to_smiles`, listed in the same order as the flag columns above.
FLAG_ORIGINS = ('bindingDB', 'moses', 'ChemBL', 'GuacaMol')

rows = []
seen = set()
for dataset, name in [(BINDING_DB, 'bindingDB'), (MOSES, 'MOSES'), (CHEMBL, 'ChemBL'), (GUACAMOL, 'GuacaMol')]:
    pbar = tqdm(dataset, total=len(dataset))
    pbar.set_description(f"Dataset {name}")
    for molecule in pbar:
        # Emit each molecule only once, at its first occurrence across datasets.
        if molecule in seen:
            continue
        seen.add(molecule)
        flags = [True if molecule in origin_to_smiles[origin] else np.nan for origin in FLAG_ORIGINS]
        rows.append([molecule, *flags])

# dtype=object keeps the mixed True/NaN flags stored as-is rather than letting
# pandas infer a per-column dtype.
all_mols = pd.DataFrame(rows, columns=ALL_MOLS_COLUMNS, dtype=object)

In [None]:
# Preview the first rows of the per-molecule membership table.
all_mols.head()

Unnamed: 0,smiles,isInBindingDB,isInMOSES,isInChemBL,isInGuacaMol
0,CC[C@H](C)Cc1cn(nn1)[C@@H](CCCN=C(N)N)C(=O)NCC...,True,,,
