In [None]:
# Attach Google Drive at /content/drive; the data paths used later in this
# notebook point into that mount.
# force_remount=True asks Colab to mount again even if the drive is already mounted.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm
import pickle

# Root folder on the mounted Drive; every input and checkpoint path below is
# built by appending to this string (note the trailing slash).
BASE = '/content/drive/MyDrive/Generative_ML/current_data/'

# Collection of SMILES strings to process. The file is opened in a context
# manager so the handle is closed as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(BASE + 'raw_data/all_smiles.pkl', 'rb') as fh:
    all_mols = pickle.load(fh)

# Descriptor results computed in this session, keyed by SMILES string.
smile_to_descriptors = {}

In [None]:
# Which checkpoint file set ('_even' or '_odd') to resume from.
suffix = '_even'

# The descriptor checkpoint is currently NOT reloaded (line kept for reference):
# smile_to_descriptors = pickle.load(open(BASE + f'pickle/smile_to_descriptors{suffix}.pkl', 'rb'))
# NOTE(review): with smile_to_descriptors left empty, the next dump to the same
# suffix rewrites smile_to_descriptors{suffix}.pkl with only the descriptors
# computed in this session -- confirm earlier results are kept elsewhere.

# Progress collections from the previous run; context managers close each file
# handle right after unpickling.
with open(BASE + f'pickle/invalid_smiles{suffix}.pkl', 'rb') as fh:
    invalid_smiles = pickle.load(fh)
with open(BASE + f'pickle/valid_smiles{suffix}.pkl', 'rb') as fh:
    valid_smiles = pickle.load(fh)

In [None]:
# Sizes of (descriptor map, invalid SMILES, valid SMILES) before processing starts.
tuple(map(len, (smile_to_descriptors, invalid_smiles, valid_smiles)))

(0, 1209, 4683119)

In [None]:
def process_smile(smile):
    """Compute RDKit descriptors for one SMILES string.

    Args:
        smile: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        The result of ``Descriptors.CalcMolDescriptors`` for the parsed
        molecule, or ``None`` when RDKit could not parse the string.
    """
    parsed = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None instead of raising.
    return None if parsed is None else Descriptors.CalcMolDescriptors(parsed)

# Number of newly processed SMILES between two checkpoints.
CHECKPOINT_EVERY = 100_000


def _dump_checkpoint(suffix):
    """Pickle the descriptor map and both SMILES sets under BASE/pickle/ with `suffix`."""
    for stem, payload in (
        ('smile_to_descriptors', smile_to_descriptors),
        ('invalid_smiles', invalid_smiles),
        ('valid_smiles', valid_smiles),
    ):
        # The context manager flushes and closes each file before the next one
        # is written, instead of leaving that to garbage collection.
        with open(BASE + f'pickle/{stem}{suffix}.pkl', 'wb') as fh:
            pickle.dump(payload, fh)


# Checkpoints alternate between the '_even' and '_odd' file sets so that an
# interruption in the middle of a dump cannot damage the previous checkpoint.
parity_counter = 0
# Counts SMILES handled since the last dump. Counting processed items (rather
# than testing the position in all_mols) keeps the checkpoint cadence regular on
# resumed runs, where most positions are skipped.
processed_since_dump = 0
pbar = tqdm(all_mols, total=len(all_mols))
for smile in pbar:
    # Already handled in this or a previous session.
    if smile in valid_smiles or smile in invalid_smiles:
        continue
    data = process_smile(smile)
    if data is None:
        invalid_smiles.add(smile)
    else:
        smile_to_descriptors[smile] = data
        # Only SMILES that RDKit could parse are recorded as valid; unparsable
        # ones belong to invalid_smiles alone.
        valid_smiles.add(smile)
    processed_since_dump += 1
    if processed_since_dump >= CHECKPOINT_EVERY:
        print('Making a dump')
        suffix = '_odd' if parity_counter == 1 else '_even'
        _dump_checkpoint(suffix)
        parity_counter = (parity_counter + 1) % 2
        processed_since_dump = 0



  0%|          | 13620/5772122 [00:45<4:41:04, 341.46it/s][17:01:47] Can't kekulize mol.  Unkekulized atoms: 24 25 26 28 29
  0%|          | 27620/5772122 [01:30<4:17:36, 371.66it/s][17:02:32] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
  1%|          | 52103/5772122 [02:54<4:52:36, 325.81it/s][17:03:56] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19
  2%|▏         | 91447/5772122 [05:08<10:04:25, 156.64it/s][17:06:10] Can't kekulize mol.  Unkekulized atoms: 14 15 16 17 25
  2%|▏         | 98277/5772122 [05:30<4:33:57, 345.17it/s][17:06:32] Explicit valence for atom # 7 B, 6, is greater than permitted
  2%|▏         | 103159/5772122 [05:46<3:50:34, 409.76it/s][17:06:48] Explicit valence for atom # 1 C, 7, is greater than permitted
  2%|▏         | 104116/5772122 [05:49<4:03:54, 387.30it/s][17:06:51] Explicit valence for atom # 19 N, 4, is greater than permitted
  2%|▏         | 108848/5772122 [06:04<7:26:34, 211.36it/s][17:07:06] Explicit valence for atom

Making a dump


  7%|▋         | 403408/5772122 [23:09<4:30:58, 330.20it/s][17:24:11] Can't kekulize mol.  Unkekulized atoms: 42 43 44 45 46
  7%|▋         | 423764/5772122 [24:18<3:39:42, 405.71it/s][17:25:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
  7%|▋         | 432002/5772122 [25:00<3:24:05, 436.10it/s][17:26:02] Can't kekulize mol.  Unkekulized atoms: 44 45 46 47 48
  8%|▊         | 444023/5772122 [25:44<5:38:46, 262.12it/s][17:26:45] Can't kekulize mol.  Unkekulized atoms: 64 65 66 67 68 69 70 72 73
  8%|▊         | 475014/5772122 [27:33<6:19:14, 232.79it/s][17:28:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
  9%|▉         | 534804/5772122 [31:19<4:35:11, 317.19it/s][17:32:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 34
  9%|▉         | 547364/5772122 [32:08<4:24:35, 329.11it/s][17:33:11] Can't kekulize mol.  Unkekulized atoms: 1 2 3 26 27
 10%|▉         | 548633/5772122 [32:14<4:09:21, 349.13it/s][17:33:16] Can't kekulize mol.  Unkekulized atoms: 24 2

Making a dump


 11%|█         | 620560/5772122 [37:20<3:29:07, 410.55it/s][17:38:21] Can't kekulize mol.  Unkekulized atoms: 24 25 27
 12%|█▏        | 665932/5772122 [39:58<4:36:45, 307.50it/s][17:41:00] Explicit valence for atom # 9 N, 4, is greater than permitted
 12%|█▏        | 667458/5772122 [40:03<5:02:01, 281.68it/s][17:41:05] Can't kekulize mol.  Unkekulized atoms: 25 26 27 28 36
 13%|█▎        | 743586/5772122 [44:30<3:46:46, 369.57it/s][17:45:32] Can't kekulize mol.  Unkekulized atoms: 1 2 6
 13%|█▎        | 754718/5772122 [45:04<5:01:34, 277.29it/s][17:46:06] Explicit valence for atom # 26 N, 4, is greater than permitted
 13%|█▎        | 778451/5772122 [46:23<3:13:13, 430.72it/s][17:47:25] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
 14%|█▍        | 803232/5772122 [47:48<10:38:02, 129.79it/s][17:48:50] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 9 11 12 13
 14%|█▍        | 831737/5772122 [49:21<4:03:57, 337.52it/s][17:50:23] Explicit valence for atom # 25 N, 4,

Making a dump


 30%|██▉       | 1711971/5772122 [1:41:29<3:02:36, 370.57it/s][18:42:31] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11
 30%|███       | 1736921/5772122 [1:42:56<3:28:38, 322.33it/s][18:43:58] Explicit valence for atom # 22 N, 4, is greater than permitted
 30%|███       | 1740676/5772122 [1:43:08<3:14:19, 345.76it/s][18:44:10] Explicit valence for atom # 27 O, 3, is greater than permitted
 31%|███       | 1787493/5772122 [1:45:54<2:56:13, 376.85it/s][18:46:56] Can't kekulize mol.  Unkekulized atoms: 8 9 10 14 15
 32%|███▏      | 1831270/5772122 [1:48:27<3:01:59, 360.90it/s][18:49:29] Can't kekulize mol.  Unkekulized atoms: 2 3 5 36 38
 33%|███▎      | 1897243/5772122 [1:52:26<4:05:37, 262.93it/s][18:53:28] Explicit valence for atom # 7 B, 6, is greater than permitted
 34%|███▎      | 1938153/5772122 [1:54:43<8:38:28, 123.24it/s][18:55:45] Explicit valence for atom # 9 N, 4, is greater than permitted
 34%|███▍      | 1967696/5772122 [1:56:24<5:17:22, 199.79it/s][18:57:26] Can't kek

Making a dump


 42%|████▏     | 2430517/5772122 [2:24:06<2:47:44, 332.01it/s][19:25:08] Explicit valence for atom # 29 C, 5, is greater than permitted
 43%|████▎     | 2464366/5772122 [2:26:19<2:58:20, 309.11it/s][19:27:20] Explicit valence for atom # 12 N, 4, is greater than permitted
 43%|████▎     | 2466393/5772122 [2:26:28<2:38:09, 348.34it/s][19:27:30] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 33
 43%|████▎     | 2501484/5772122 [2:28:27<2:48:26, 323.62it/s][19:29:28] Explicit valence for atom # 31 N, 4, is greater than permitted
 43%|████▎     | 2510174/5772122 [2:28:54<2:07:37, 425.97it/s][19:29:56] Can't kekulize mol.  Unkekulized atoms: 8 9 10 23 24 25 26 27 28
 44%|████▎     | 2525004/5772122 [2:29:46<7:02:00, 128.24it/s][19:30:48] Can't kekulize mol.  Unkekulized atoms: 24 25 27 28 30
 44%|████▍     | 2548385/5772122 [2:31:09<2:48:40, 318.54it/s][19:32:11] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11
 45%|████▍     | 2588886/5772122 [2:33:36<2:31:50, 349.41it/s][19:34:38] 

Making a dump


 47%|████▋     | 2706128/5772122 [2:41:06<2:45:11, 309.32it/s][19:42:08] Can't kekulize mol.  Unkekulized atoms: 2 4 7
 47%|████▋     | 2706441/5772122 [2:41:08<4:02:45, 210.48it/s][19:42:10] Explicit valence for atom # 3 N, 4, is greater than permitted
 47%|████▋     | 2729452/5772122 [2:42:27<3:05:40, 273.11it/s][19:43:29] Explicit valence for atom # 6 C, 6, is greater than permitted
 48%|████▊     | 2747278/5772122 [2:43:32<1:49:18, 461.21it/s][19:44:34] Explicit valence for atom # 23 N, 4, is greater than permitted
 48%|████▊     | 2751585/5772122 [2:43:47<2:13:35, 376.83it/s][19:44:49] Can't kekulize mol.  Unkekulized atoms: 31 32 34 35 37
 48%|████▊     | 2766356/5772122 [2:44:43<5:34:15, 149.87it/s][19:45:44] Explicit valence for atom # 13 F, 3, is greater than permitted
 48%|████▊     | 2768431/5772122 [2:44:51<2:41:57, 309.11it/s][19:45:53] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 9 10 11
 48%|████▊     | 2769904/5772122 [2:44:57<3:19:03, 251.37it/s][19:45:58] Can't

Making a dump


 53%|█████▎    | 3034811/5772122 [3:01:26<2:07:58, 356.50it/s][20:02:28] Explicit valence for atom # 21 F, 2, is greater than permitted
 53%|█████▎    | 3053011/5772122 [3:02:27<2:49:43, 267.01it/s][20:03:28] Explicit valence for atom # 14 N, 4, is greater than permitted
 54%|█████▎    | 3089713/5772122 [3:04:27<4:34:56, 162.61it/s][20:05:28] Explicit valence for atom # 3 N, 4, is greater than permitted
 54%|█████▍    | 3113388/5772122 [3:05:52<1:49:10, 405.90it/s][20:06:53] Explicit valence for atom # 20 N, 4, is greater than permitted
 54%|█████▍    | 3125911/5772122 [3:06:34<4:33:46, 161.10it/s][20:07:35] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
 55%|█████▍    | 3151625/5772122 [3:07:58<3:02:57, 238.72it/s][20:09:00] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15 16 17 18 19
 55%|█████▍    | 3155336/5772122 [3:08:12<1:51:43, 390.34it/s][20:09:14] Explicit valence for atom # 33 N, 4, is greater than permitted
 55%|█████▍    | 3162724/5772122 [3:08:37<2:28:36, 29

Making a dump


 75%|███████▍  | 4302005/5772122 [4:16:11<1:25:52, 285.35it/s][21:17:13] Can't kekulize mol.  Unkekulized atoms: 1 4 6
 75%|███████▍  | 4318465/5772122 [4:17:10<53:58, 448.81it/s]  [21:18:11] Can't kekulize mol.  Unkekulized atoms: 13 14 15 17 18 20 21
 75%|███████▌  | 4333647/5772122 [4:18:08<1:18:42, 304.59it/s][21:19:10] Can't kekulize mol.  Unkekulized atoms: 1 4 6
 76%|███████▌  | 4385768/5772122 [4:21:21<1:34:58, 243.27it/s][21:22:23] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 9 10 11 12 26 27 28
 76%|███████▌  | 4388003/5772122 [4:21:27<57:03, 404.29it/s][21:22:29] Explicit valence for atom # 4 N, 4, is greater than permitted
 77%|███████▋  | 4423669/5772122 [4:23:33<1:28:36, 253.62it/s][21:24:35] Explicit valence for atom # 15 N, 4, is greater than permitted
 77%|███████▋  | 4423787/5772122 [4:23:34<4:11:26, 89.37it/s][21:24:36] Can't kekulize mol.  Unkekulized atoms: 23 24 28
 77%|███████▋  | 4465330/5772122 [4:25:56<48:39, 447.67it/s][21:26:58] Can't kekulize mol.  Unkek

Making a dump


 78%|███████▊  | 4504249/5772122 [4:29:58<1:14:22, 284.14it/s][21:31:00] Can't kekulize mol.  Unkekulized atoms: 1 2 6
 78%|███████▊  | 4508751/5772122 [4:30:15<1:40:18, 209.91it/s][21:31:16] Can't kekulize mol.  Unkekulized atoms: 15 16 18 19 21
 78%|███████▊  | 4521624/5772122 [4:31:00<1:36:30, 215.94it/s][21:32:02] Explicit valence for atom # 19 N, 4, is greater than permitted
 78%|███████▊  | 4530026/5772122 [4:31:28<1:02:02, 333.65it/s][21:32:29] Explicit valence for atom # 27 N, 4, is greater than permitted
 79%|███████▉  | 4571111/5772122 [4:33:47<1:11:58, 278.08it/s][21:34:48] Can't kekulize mol.  Unkekulized atoms: 1 2 6
 79%|███████▉  | 4571964/5772122 [4:33:49<53:04, 376.85it/s][21:34:51] Explicit valence for atom # 20 O, 3, is greater than permitted
 80%|███████▉  | 4598900/5772122 [4:35:22<4:57:33, 65.71it/s][21:36:24] Can't kekulize mol.  Unkekulized atoms: 2 3 19 21 22
 80%|███████▉  | 4614951/5772122 [4:36:13<57:07, 337.61it/s][21:37:15] Can't kekulize mol.  Unkekulized

In [None]:
# Sizes of (valid SMILES, invalid SMILES, descriptor map) after the processing loop.
tuple(map(len, (valid_smiles, invalid_smiles, smile_to_descriptors)))

(5771533, 1485, 1088138)

In [None]:
# Manual checkpoint of the current state. parity_counter is reset first, so this
# dump always goes to the '_even' file set.
parity_counter = 0
print('Making a dump')
suffix = '_odd' if parity_counter == 1 else '_even'
for stem, payload in (
    ('smile_to_descriptors', smile_to_descriptors),
    ('invalid_smiles', invalid_smiles),
    ('valid_smiles', valid_smiles),
):
    # Context manager guarantees each file is flushed and closed once written.
    with open(BASE + f'pickle/{stem}{suffix}.pkl', 'wb') as fh:
        pickle.dump(payload, fh)
parity_counter = (parity_counter + 1) % 2

Making a dump
