In [6]:
import pandas as pd
import numpy as np
import os
import sys
import polars as pl
import json
from joblib import Parallel, delayed
import deepchem
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Configure Polars table rendering for this notebook:
# show up to 20 rows and up to 50 columns when a DataFrame is displayed.
cfg = pl.Config()
cfg.set_tbl_rows(20)
cfg.set_tbl_cols(50)
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, GroupShuffleSplit, ShuffleSplit
import gc
import random

from rdkit import Chem;
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from functools import partial
import mapply


In [None]:
# Load the building-block metadata (a JSON object) into `bbs_meta`.
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/building_blocks.json', 'r') as f:
    bbs_meta = json.load(f)

# Report the number of entries stored under each top-level key.
# A plain loop is used instead of a list comprehension so the cell does not
# build (and display) a throwaway list of None values.
for k, v in bbs_meta.items():
    print(k, '-->', len(v))

In [None]:
# Read the scaffold metadata JSON into `scaffolds` and print how many
# entries it contains.
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/scaffolds.json', 'r') as f:
    scaffolds = json.loads(f.read())
print(len(scaffolds))

In [4]:
# Load the per-row scaffold indices for the training set eagerly.
train_scaffolds = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_scaffold.csv').collect()
# Lazily scan the training CSV, keep only the columns needed downstream with
# compact integer dtypes, and materialise the result with collect().
train_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')\
    .select(
        pl.col('molecule'),
        # Building-block ids are stored as 16-bit unsigned integers.
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        # The three target columns are stored as 8-bit unsigned integers.
        pl.col('BRD4', 'HSA', 'eSH').cast(pl.UInt8),
        # Attached by position, not joined on a key.
        # NOTE(review): assumes train_scaffold.csv has the same row order and
        # row count as train_v2.csv -- confirm.
        scaffold_idx = train_scaffolds['scaffold_idx'],
    )\
    .collect()
# Estimated in-memory size of the frame, in megabytes.
print(train_df.estimated_size('mb'))
train_df

8601.91998577118


molecule,bb1,bb2,bb3,BRD4,HSA,eSH,scaffold_idx
str,u16,u16,u16,u8,u8,u8,i64
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,196,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,253,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,1219,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,604,0,0,0,543172
"""C#CCOc1ccc(CNc…",1640,1653,121,0,0,0,2571428


In [2]:
# Load the per-row scaffold indices for the test set eagerly.
test_scaffolds = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_scaffold.csv').collect()
# Lazily scan the test CSV, select the needed columns, then keep a single row
# per distinct molecule (the first value of every other column within each
# molecule group) and order the result by `id`.
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v2.csv')\
    .select(
        pl.col('id','molecule'),
        # Building-block ids are stored as 16-bit unsigned integers.
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('protein'),
        # Attached by position, not joined on a key.
        # NOTE(review): assumes test_scaffold.csv has the same row order and
        # row count as test_v2.csv -- confirm.
        scaffold_idx = test_scaffolds['scaffold_idx']
    ).group_by('molecule').first().collect().sort('id')
# Estimated in-memory size of the de-duplicated frame, in megabytes.
print(test_df.estimated_size('mb'))
test_df

84.21934127807617


molecule,id,bb1,bb2,bb3,protein,scaffold_idx
str,i64,u16,u16,u16,str,i64
"""C#CCCC[C@H](Nc…",295246830,1989,409,409,"""BRD4""",2217250
"""C#CCCC[C@H](Nc…",295246833,1989,409,1012,"""BRD4""",602641
"""C#CCCC[C@H](Nc…",295246836,1989,409,1722,"""BRD4""",4502748
"""C#CCCC[C@H](Nc…",295246839,1989,409,1078,"""BRD4""",3936208
"""C#CCCC[C@H](Nc…",295246842,1989,409,605,"""BRD4""",4550856
"""C#CCCC[C@H](Nc…",295246845,1989,409,521,"""BRD4""",4414349
"""C#CCCC[C@H](Nc…",295246848,1989,409,41,"""BRD4""",5367715
"""C#CCCC[C@H](Nc…",295246851,1989,409,1826,"""BRD4""",1422452
"""C#CCCC[C@H](Nc…",295246854,1989,409,1970,"""BRD4""",4752663
"""C#CCCC[C@H](Nc…",295246857,1989,409,598,"""BRD4""",5758930


In [3]:
# Generate ECFPs
def get_ecfp(smile, radius=3, bits=2048):
    """Compute a bit-packed Morgan (ECFP-style) fingerprint for one molecule.

    Args:
        smile: SMILES string of the molecule, or None.
        radius: Morgan radius forwarded to RDKit.
        bits: Length of the folded bit vector (``nBits``) before packing.

    Returns:
        A 1-D ``uint8`` array produced by ``np.packbits`` (8 fingerprint bits
        per byte, so ``ceil(bits / 8)`` bytes), or None when ``smile`` is None.

    Raises:
        ValueError: If RDKit cannot parse ``smile``.
    """
    if smile is None:
        return None
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        raise ValueError(f"RDKit could not parse SMILES: {smile!r}")
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
    fp = np.array(bit_vect.ToList())
    # Pack the 0/1 flags into bytes to cut memory use by a factor of eight.
    fp = np.packbits(fp, axis=None)
    return fp


def get_maccs(smile):
    """Compute a bit-packed MACCS-keys fingerprint for one molecule.

    Args:
        smile: SMILES string of the molecule, or None.

    Returns:
        A 1-D ``uint8`` array produced by ``np.packbits`` over the MACCS bit
        list (the final byte is zero-padded when the bit count is not a
        multiple of 8), or None when ``smile`` is None.

    Raises:
        ValueError: If RDKit cannot parse ``smile``.
    """
    if smile is None:
        return None
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        raise ValueError(f"RDKit could not parse SMILES: {smile!r}")
    fp = np.array(MACCSkeys.GenMACCSKeys(mol).ToList())
    # Sanity check: the fingerprint must be binary before it is bit-packed.
    assert len(np.unique(fp)) <= 2
    fp = np.packbits(fp, axis=None)
    return fp


def get_ecfp_maccs(smile, radius=3, bits=2048):
    """Compute MACCS keys followed by a Morgan fingerprint, bit-packed together.

    The two bit lists are concatenated (MACCS first, then Morgan) *before*
    packing, so the boundary between them is generally not byte-aligned in
    the returned array.

    Args:
        smile: SMILES string of the molecule, or None.
        radius: Morgan radius forwarded to RDKit.
        bits: Length of the folded Morgan bit vector (``nBits``).

    Returns:
        A 1-D ``uint8`` array produced by ``np.packbits`` over the combined
        bit list, or None when ``smile`` is None.

    Raises:
        ValueError: If RDKit cannot parse ``smile``.
    """
    if smile is None:
        return None
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        raise ValueError(f"RDKit could not parse SMILES: {smile!r}")
    maccs = MACCSkeys.GenMACCSKeys(mol).ToList()
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bits).ToList()
    # Plain list concatenation: MACCS bits come first, Morgan bits second.
    ret = maccs + ecfp
    ret = np.packbits(ret, axis=None)
    return ret


In [10]:
# Sanity check: packed Morgan fingerprint of the first training molecule.
get_ecfp(train_df[0, 'molecule'])

array([ 64,   0,   0,   0,   0,  64,   0,   0,   0,   0, 137,   8,   2,
         0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,  64,   1,   0,   0,   0,  64,   0,   0,   2,
         0,  48,   0,   0,   0,  64,   0,   0,  32,   0,   0,   0,   0,
         0, 128,   0,  34,   0,  32,   0,   0,   0,   0,   1,   0,   0,
         1,   0,   0,   2,   0,   0,  16,   0,   0,   0,   0,   1,   8,
         0,  98,   0,  32,   1,   0,  32,   8,  33,   0, 128,   2,   0,
       128,   0,   0,   2,   4,   8,   0,   0,   0,   9,   0,   1,   0,
         0,   0,   2,   0,   0,   1,   0,   0,   0,   1,  16,   2, 129,
         0,   0,   0,   0,   2,  64,   0,   0,   0,   0,  16,   8,   0,
        16,  16,   0,   0,   0,   0, 129,  64,   0,   0,   0,   0,   0,
         1, 128,  72,   0,   2, 128,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  32,   0,   1,   0,   4,   0, 130,   4,
         0,   1,  32,   8,   0,  16,   0,   0, 128,   0,   0,   

In [4]:
# Sanity check: packed MACCS fingerprint of the first training molecule.
# NOTE: needs the cell that builds `train_df` to have run in the current
# kernel; the recorded output below is a NameError from running without it.
get_maccs(train_df[0, 'molecule'])

NameError: name 'train_df' is not defined

In [12]:
# Sanity check: length in bytes of the combined MACCS + Morgan fingerprint.
get_ecfp_maccs(train_df[0, 'molecule']).shape

(277,)

In [None]:
# def mapply(func, col, backend = 'loky', n_jobs = 48):
#     ret = Parallel(n_jobs=n_jobs, backend=backend)(
#     delayed(func)(e) for e in tqdm(col))
#     return ret

In [None]:
# test_ecfp6s = mapply(partial(get_ecfp, radius=3, bits=2048), test_df['molecule'])
# test_ecfp6s[:3]

In [7]:
# Initialise mapply so that pandas objects expose `.mapply(...)`, the parallel
# counterpart of `.apply(...)` used by the fingerprint cells in this notebook.
# NOTE(review): the parameter meanings below follow the mapply documentation --
# confirm against the installed version.
mapply.init(
    n_workers=-1,  # presumably "use all available CPUs"
    progressbar=True,  # show a tqdm progress bar while chunks are processed
    chunk_size = 1024,  # presumably the minimum number of rows per chunk
    max_chunks_per_worker = 32,  # presumably caps the number of chunks handed to each worker
)

In [9]:
# Display the de-duplicated test frame again for reference.
test_df

molecule,id,bb1,bb2,bb3,protein,scaffold_idx
str,i64,u16,u16,u16,str,i64
"""C#CCCC[C@H](Nc…",295246830,1989,409,409,"""BRD4""",2217250
"""C#CCCC[C@H](Nc…",295246833,1989,409,1012,"""BRD4""",602641
"""C#CCCC[C@H](Nc…",295246836,1989,409,1722,"""BRD4""",4502748
"""C#CCCC[C@H](Nc…",295246839,1989,409,1078,"""BRD4""",3936208
"""C#CCCC[C@H](Nc…",295246842,1989,409,605,"""BRD4""",4550856
"""C#CCCC[C@H](Nc…",295246845,1989,409,521,"""BRD4""",4414349
"""C#CCCC[C@H](Nc…",295246848,1989,409,41,"""BRD4""",5367715
"""C#CCCC[C@H](Nc…",295246851,1989,409,1826,"""BRD4""",1422452
"""C#CCCC[C@H](Nc…",295246854,1989,409,1970,"""BRD4""",4752663
"""C#CCCC[C@H](Nc…",295246857,1989,409,598,"""BRD4""",5758930


In [8]:
# Compute packed ECFP6 fingerprints (radius 3, 2048 bits) for every test
# molecule in parallel via mapply, then preview the first three results.
test_mols = test_df.get_column('molecule').to_pandas()
print('Start!')
test_ecfp6s = test_mols.mapply(
    partial(get_ecfp, radius=3, bits=2048),
)
test_ecfp6s.iloc[:3]

Start!


100%|████████████████████████████████████████| 857/857 [00:27<00:00, 30.98it/s]


0    [64, 0, 0, 64, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0,...
1    [64, 0, 0, 64, 0, 1, 0, 0, 0, 0, 128, 16, 0, 0...
2    [64, 16, 0, 64, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0...
Name: molecule, dtype: object

In [None]:
# Show the CPU count mapply reports via its `sensible_cpu_count` helper.
mapply.parallel.sensible_cpu_count()

In [None]:
# Compute packed ECFP6 fingerprints (radius 3, 2048 bits) for every training
# molecule in parallel via mapply, then preview the first three results.
train_ecfp6s = train_df.get_column('molecule').to_pandas().mapply(
    partial(get_ecfp, radius=3, bits=2048),
)
train_ecfp6s.iloc[:3]

In [None]:
# # 13 hours for this single process code
# for i, row in tqdm(enumerate(train_df.iter_rows(named=True)), total = len(train_df)):
#     train_ecfp6s[i] = get_ecfp(row['molecule'], radius=3, bits=2048)

In [None]:
# Number of training fingerprints computed (one per row of train_df).
len(train_ecfp6s)

In [None]:
# Stack the per-molecule packed fingerprints into one 2-D array
# (one row per molecule) and show its shape.
train_ecfp6s_np = np.stack(train_ecfp6s, axis = 0)
train_ecfp6s_np.shape

In [None]:
# Create the output directory for the feature files if it does not exist yet.
# os.makedirs(..., exist_ok=True) is the pure-Python equivalent of `mkdir -p`
# and does not depend on a shell being available.
os.makedirs('/home/dangnh36/datasets/competitions/leash_belka/processed/features/', exist_ok=True)

In [None]:
# Persist the training ECFP6 matrix as a raw memory-mapped file.
# The file has no header: readers must be given the same dtype and shape
# (train_ecfp6s_np.dtype / train_ecfp6s_np.shape) to map it back.
# NOTE(review): this file uses the `.memmap` extension while the other feature
# files written in this notebook use `.mmap` -- confirm what downstream expects.
fp = np.memmap('/home/dangnh36/datasets/competitions/leash_belka/processed/features/train_ecfp6_2048.memmap', dtype=train_ecfp6s_np.dtype, mode='w+', shape=train_ecfp6s_np.shape)
fp[:] = train_ecfp6s_np[:]
# Explicitly write pending changes to disk rather than relying on the memmap
# object being garbage-collected.
fp.flush()

In [None]:
# Drop the in-memory training fingerprints (already written to disk) and ask
# the garbage collector to reclaim the memory before the next stage.
del train_ecfp6s, train_ecfp6s_np
gc.collect()

In [10]:
# Stack the per-molecule packed fingerprints into one 2-D array
# (one row per molecule) and show its shape.
test_ecfp6s_np = np.stack(test_ecfp6s, axis = 0)
test_ecfp6s_np.shape

(878022, 256)

In [12]:
# Persist the test ECFP6 matrix as a raw memory-mapped file.
# The file has no header: readers must be given the same dtype and shape
# (test_ecfp6s_np.dtype / test_ecfp6s_np.shape) to map it back.
fp = np.memmap('/home/dangnh36/datasets/competitions/leash_belka/processed/features/test_ecfp6_2048.mmap',
               dtype=test_ecfp6s_np.dtype,
               mode='w+',
               shape=test_ecfp6s_np.shape)
fp[:] = test_ecfp6s_np[:]
# Explicitly write pending changes to disk rather than relying on the memmap
# object being garbage-collected.
fp.flush()

In [13]:
# Compute packed MACCS fingerprints for every test molecule in parallel via
# mapply, then preview the first three results.
test_mols = test_df['molecule'].to_pandas()
print('Start!')
# get_maccs takes no extra arguments, so it is passed directly; wrapping it in
# functools.partial with nothing bound added no value.
test_maccs = test_mols.mapply(get_maccs)
test_maccs[:3]

Start!


100%|████████████████████████████████████████| 857/857 [01:02<00:00, 13.65it/s]


0    [2, 0, 64, 64, 32, 8, 6, 0, 68, 5, 128, 59, 48...
1    [2, 0, 64, 64, 34, 8, 6, 0, 68, 21, 148, 59, 1...
2    [2, 0, 64, 64, 32, 8, 6, 0, 100, 37, 160, 59, ...
Name: molecule, dtype: object

In [14]:
# Stack the per-molecule packed MACCS fingerprints into one 2-D array
# (one row per molecule) and report its shape.
test_maccs_np = np.stack(test_maccs, axis = 0)
print(test_maccs_np.shape)
# Persist the matrix as a raw memory-mapped file. The file has no header:
# readers must be given the same dtype and the shape printed above.
fp = np.memmap('/home/dangnh36/datasets/competitions/leash_belka/processed/features/test_maccs.mmap',
               dtype=test_maccs_np.dtype,
               mode='w+',
               shape=test_maccs_np.shape)
fp[:] = test_maccs_np[:]
# Explicitly write pending changes to disk rather than relying on the memmap
# object being garbage-collected.
fp.flush()

(878022, 21)


In [16]:
# Compute packed MACCS fingerprints for every training molecule in parallel
# via mapply, then preview the first three results.
# get_maccs takes no extra arguments, so it is passed directly; wrapping it in
# functools.partial with nothing bound added no value.
train_maccs = train_df['molecule'].to_pandas().mapply(get_maccs)
train_maccs[:3]

  0%|                                                  | 0/3104 [00:00<?, ?it/s]

0    [2, 0, 64, 64, 2, 8, 12, 2, 68, 21, 180, 58, 2...
1    [2, 0, 64, 64, 0, 10, 4, 0, 68, 5, 161, 26, 10...
2    [2, 0, 64, 64, 0, 8, 4, 0, 68, 4, 160, 26, 104...
Name: molecule, dtype: object

In [17]:
# Stack the per-molecule packed MACCS fingerprints into one 2-D array
# (one row per molecule) and report its shape.
train_maccs_np = np.stack(train_maccs, axis = 0)
print(train_maccs_np.shape)
# Persist the matrix as a raw memory-mapped file. The file has no header:
# readers must be given the same dtype and the shape printed above.
fp = np.memmap('/home/dangnh36/datasets/competitions/leash_belka/processed/features/train_maccs.mmap',
               dtype=train_maccs_np.dtype,
               mode='w+',
               shape=train_maccs_np.shape)
fp[:] = train_maccs_np[:]
# Explicitly write pending changes to disk rather than relying on the memmap
# object being garbage-collected.
fp.flush()

(98415610, 21)
