In [1]:
# Mount Google Drive at /content/drive; every path constant used by this
# notebook is rooted under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm
import pickle
import numpy as np

# Root of the project data on the mounted Drive (editable through the Colab form).
BASE_PATH = '/content/drive/MyDrive/Generative_ML/current_data/' #@param {type:"string"}

# One directory per pipeline stage, all derived from BASE_PATH.
# Each value ends with '/' so file names can be appended directly.
PRETRAINING_PATH = f"{BASE_PATH}1. Pretraining/"
GENERATION_PATH = f"{BASE_PATH}2. Generation/"
SAMPLING_PATH = f"{BASE_PATH}3. Sampling/"
DIFFDOCK_PATH = f"{BASE_PATH}4. DiffDock/"
SCORING_PATH = f"{BASE_PATH}5. Scoring/"
AL_PATH = f"{BASE_PATH}6. ActiveLearning/"
# Directory used for pickled intermediate results.
PICKLES = f"{BASE_PATH}Archive/pickle/"

In [4]:
# Suffix for checkpoint file names; the manual dump cell reassigns it.
suffix = '_even'
# Column-wise accumulator: column name -> list of values, one entry per molecule.
keyToData = {}
# SMILES for which descriptor calculation returned nothing.
invalid_smiles = set()
# SMILES strings to featurise, taken from the 'smiles' column of the CSV.
all_mols = (
    pd.read_csv(f"{PRETRAINING_PATH}descriptors/combined_processed_freq1000_block133_extra_mols.csv")
    ['smiles']
    .to_list()
)

# All RDKit descriptors

In [5]:
# Sanity check before the descriptor pass:
# (columns accumulated so far, invalid SMILES recorded, molecules loaded).
len(keyToData), len(invalid_smiles), len(all_mols)

(0, 0, 20291)

In [6]:
def process_smile(smile):
  """Compute the RDKit descriptor set for a single SMILES string.

  Args:
    smile: SMILES string to parse with ``Chem.MolFromSmiles``.

  Returns:
    ``None`` when ``Chem.MolFromSmiles`` returns ``None`` for the input;
    otherwise the result of ``Descriptors.CalcMolDescriptors`` for the parsed
    molecule (the caller iterates it with ``.items()``), called with
    ``missingVal=None`` and ``silent=False``.
  """
  parsed = Chem.MolFromSmiles(smile)
  if parsed is not None:
    return Descriptors.CalcMolDescriptors(parsed, missingVal=None, silent=False)
  return None

# Compute descriptors for every molecule and accumulate them column-wise in
# keyToData (a 'smiles' column plus one list per descriptor name), so the
# result can be turned into a DataFrame directly. SMILES for which
# process_smile returns None are recorded in invalid_smiles and skipped.
#
# NOTE: this cell appends to keyToData, which is created in an earlier cell.
# Re-run that setup cell before executing this one again, otherwise every
# valid molecule is appended a second time.
for smile in tqdm(all_mols, total=len(all_mols)):
    if smile in invalid_smiles:
        continue
    data = process_smile(smile)
    if data is None:
        invalid_smiles.add(smile)
        continue
    keyToData.setdefault('smiles', []).append(smile)
    for descriptor, value in data.items():
        keyToData.setdefault(descriptor, []).append(value)



100%|██████████| 20291/20291 [04:27<00:00, 75.86it/s]


In [7]:
# After the pass: (number of invalid SMILES, number of columns in keyToData --
# the 'smiles' column plus one per descriptor).
len(invalid_smiles), len(keyToData)

(0, 210)

In [8]:
# Turn the column-wise accumulator into a DataFrame and persist it as a pickle
# in the descriptors directory.
descriptor_frame = pd.DataFrame(keyToData)
descriptor_frame.to_pickle(f"{PRETRAINING_PATH}descriptors/combined_processed_freq1000_block133_extra_mols.pkl")

In [None]:
# (valid SMILES processed, invalid SMILES recorded, columns accumulated).
# Valid SMILES are the entries of keyToData['smiles']; the .get() default keeps
# this cell runnable before any molecule has been processed.
len(keyToData.get('smiles', [])), len(invalid_smiles), len(keyToData)

In [None]:
# Manual checkpoint of the RDKit descriptor pass: pickles the current
# accumulators into the PICKLES directory.
# parity_counter is reset here, so a run of this cell always writes the
# '_even' files; flip it by hand to write the '_odd' set instead.
parity_counter = 0
print('Making a dump')
suffix = '_odd' if parity_counter == 1 else '_even'
# File stem -> object to pickle. The descriptor data is the column-wise
# keyToData dict (column name -> list of values); the valid SMILES are its
# 'smiles' column.
checkpoint_payloads = {
    'smile_to_descriptors': keyToData,
    'invalid_smiles': invalid_smiles,
    'valid_smiles': keyToData.get('smiles', []),
}
for stem, payload in checkpoint_payloads.items():
    # 'with' closes each file even if pickling fails part-way.
    with open(f"{PICKLES}{stem}{suffix}.pkl", 'wb') as handle:
        pickle.dump(payload, handle)
parity_counter += 1
parity_counter %= 2

# MQN descriptors

In [None]:
def combine_train_and_valid(dataset_name):
    """Return the unique SMILES found in a dataset's train and validation splits.

    Reads the ``smiles`` column of ``{dataset_name}_train.csv.gz`` and
    ``{dataset_name}_val.csv.gz`` under ``{PRETRAINING_PATH}datasets/``, takes
    the union of both, and prints how many unique entries it contains.

    Args:
        dataset_name: File-name prefix shared by the two split files.

    Returns:
        A list of the unique SMILES in sorted order. Sorting makes the order
        independent of set iteration order, so repeated runs process molecules
        (and write checkpoints) in the same sequence.
    """
    train_df = pd.read_csv(f"{PRETRAINING_PATH}datasets/{dataset_name}_train.csv.gz")['smiles']
    val_df = pd.read_csv(f"{PRETRAINING_PATH}datasets/{dataset_name}_val.csv.gz")['smiles']
    all_mols = set(train_df) | set(val_df)
    # The reported count covers the train and validation splits combined.
    print(f"There are {len(all_mols)=} molecules in the training set")
    # key=str keeps the ordering well defined even if a non-string value
    # (e.g. a missing entry read as float NaN) is present in the column.
    return sorted(all_mols, key=str)

def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if the dump fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def calculate_mqn_descriptors(molecules, out_fname, checkpoint_every=500_000):
    """Compute the 42 MQN descriptors for each unique, parseable SMILES.

    Results are accumulated column-wise (``smiles``, ``MQN0`` ... ``MQN41``) and
    written as a pickled DataFrame to
    ``{PRETRAINING_PATH}descriptors/{out_fname}_mqns.pkl``.

    While running, intermediate state (the descriptor columns, the set of
    processed SMILES, and the set of invalid SMILES) is pickled periodically.
    Checkpoints alternate between an ``_even`` and an ``_odd`` file set, so the
    set written previously is left untouched while the other one is rewritten.

    Args:
        molecules: Sized sequence of SMILES strings (``len()`` is used for the
            progress bar). Repeated entries are processed once.
        out_fname: File-name prefix for the checkpoints and the final output.
        checkpoint_every: Write a checkpoint each time this many input entries
            have been consumed. ``0`` or ``None`` disables checkpointing.

    Raises:
        AssertionError: If RDKit returns anything other than 42 MQN values.
    """
    parity_counter = 0
    keyToData = {}
    seen = set()
    invalid = set()
    out_prefix = f"{PRETRAINING_PATH}descriptors/{out_fname}"
    for mol_counter, smile in tqdm(enumerate(molecules), total=len(molecules)):
        if checkpoint_every and mol_counter != 0 and mol_counter % checkpoint_every == 0:
            print("Performing dump")
            suffix = '_even' if parity_counter == 0 else '_odd'
            _dump_pickle(keyToData, f"{out_prefix}_mqns{suffix}.pkl")
            _dump_pickle(seen, f"{out_prefix}_seen{suffix}.pkl")
            _dump_pickle(invalid, f"{out_prefix}_invalid{suffix}.pkl")
            parity_counter = (parity_counter + 1) % 2
        # Skip anything already classified, whether invalid or already featurised.
        if smile in invalid or smile in seen:
            continue
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            invalid.add(smile)
            continue
        keyToData.setdefault('smiles', []).append(smile)
        descriptors = rdMolDescriptors.MQNs_(mol)
        assert len(descriptors) == 42, f"Expected 42 descriptors, got {len(descriptors)}"
        for i, descriptor in enumerate(descriptors):
            keyToData.setdefault(f"MQN{i}", []).append(descriptor)
        seen.add(smile)
    pd.DataFrame(keyToData).to_pickle(f"{out_prefix}_mqns.pkl")

In [None]:
# Featurise the combined train + validation molecules of one dataset; the same
# name selects the input split files and prefixes the output files.
MQN_DATASET = "combined_processed_freq1000_block133"
calculate_mqn_descriptors(combine_train_and_valid(MQN_DATASET), MQN_DATASET)

There are len(all_mols)=5550953 molecules in the training set


  9%|▉         | 499850/5550953 [03:15<33:10, 2538.18it/s]

Performing dump


 18%|█▊        | 999791/5550953 [06:31<29:27, 2574.47it/s]

Performing dump


 27%|██▋       | 1499827/5550953 [09:48<26:30, 2546.37it/s]

Performing dump


 36%|███▌      | 1999949/5550953 [13:07<23:02, 2568.22it/s]

Performing dump


 45%|████▌     | 2499776/5550953 [16:27<19:44, 2576.83it/s]

Performing dump


 54%|█████▍    | 2999815/5550953 [19:48<16:29, 2579.20it/s]

Performing dump


 63%|██████▎   | 3499856/5550953 [23:12<13:25, 2545.05it/s]

Performing dump


 72%|███████▏  | 3999882/5550953 [26:37<09:57, 2595.09it/s]

Performing dump


 81%|████████  | 4499926/5550953 [30:03<06:49, 2564.82it/s]

Performing dump


 90%|█████████ | 4999926/5550953 [33:29<03:36, 2548.61it/s]

Performing dump


 99%|█████████▉| 5499740/5550953 [36:56<00:19, 2663.37it/s]

Performing dump


100%|██████████| 5550953/5550953 [37:30<00:00, 2466.52it/s]
