In [1]:
import pandas as pd
import numpy as np
import os
import sys
import polars as pl
import json
from joblib import Parallel, delayed
import deepchem
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Configure Polars 
cfg = pl.Config()
cfg.set_tbl_rows(20)
cfg.set_tbl_cols(50)
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, GroupShuffleSplit, ShuffleSplit, StratifiedKFold
import gc
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/opt/conda/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped 

In [2]:
PERCENTILES = [0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975, 0.99]

In [5]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/building_blocks.json', 'r') as f:
    bbs_meta = json.load(f)

[print(k, '-->', len(v)) for k, v in bbs_meta.items()]

train_bbs --> 1145
train_bb1s --> 271
train_bb2s --> 693
train_bb3s --> 872
test_bb1s --> 341
test_bb2s --> 1140
test_bb3s --> 1389
test_bbs --> 2110
all_bbs --> 2110


[None, None, None, None, None, None, None, None, None]

In [6]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/scaffolds.json', 'r') as f:
    scaffolds = json.load(f)
print(len(scaffolds))

5971685


In [7]:
train_scaffolds = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_scaffold.csv').collect()
train_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')\
    .select(
        pl.col('molecule'),
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
        scaffold_idx = train_scaffolds['scaffold_idx'],
    )\
    .collect()
print(train_df.estimated_size('mb'))
train_df

8601.91998577118


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx
str,u16,u16,u16,u8,u8,u8,i64
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,196,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,253,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,1219,0,0,0,5301385
"""C#CCOc1ccc(CNc…",1640,1653,604,0,0,0,543172
"""C#CCOc1ccc(CNc…",1640,1653,121,0,0,0,2571428


In [8]:
test_scaffolds = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_scaffold.csv').collect()
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v2.csv')\
    .select(
        pl.col('id','molecule'),
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('protein'),
        scaffold_idx = test_scaffolds['scaffold_idx']
    ).group_by('molecule').first().collect()
print(test_df.estimated_size('mb'))
test_df

84.21934127807617


molecule,id,bb1,bb2,bb3,protein,scaffold_idx
str,i64,u16,u16,u16,str,i64
"""O=C(N[Dy])[C@@…",296074963,689,878,46,"""BRD4""",673725
"""COc1cccc(F)c1C…",296332256,189,777,1260,"""BRD4""",2494349
"""Cn1cc(-c2cnc(N…",296857951,751,259,1252,"""HSA""",3209794
"""Cc1ccc2c(c1)cc…",296835810,1333,401,1777,"""HSA""",3217947
"""O=C(N[Dy])c1cc…",296133662,1060,412,536,"""BRD4""",2709445
"""CCCCOc1cc(OC(F…",296729227,449,1885,157,"""sEH""",5715082
"""COc1cnc(CNc2nc…",296552972,1187,1023,1996,"""BRD4""",5376929
"""Cc1cc(F)ccc1Nc…",295297464,349,1199,1543,"""BRD4""",3975886
"""Cc1cccc(Nc2nc(…",295820338,14,249,420,"""BRD4""",2484639
"""COC(=O)c1c[nH]…",296319134,1022,2093,1332,"""BRD4""",3481448


In [9]:
df = pl.concat([train_df.select(pl.col('molecule', 'bb1', 'bb2', 'bb3', 'scaffold_idx'), ori=pl.lit(0)),
                test_df.select(pl.col('molecule', 'bb1', 'bb2', 'bb3', 'scaffold_idx'), ori= pl.lit(1))])
df = df.with_columns(
    pl.col('molecule').count().over('scaffold_idx').alias('mol_per_scaf')
)

df

molecule,bb1,bb2,bb3,scaffold_idx,ori,mol_per_scaf
str,u16,u16,u16,i64,i32,u32
"""C#CCOc1ccc(CNc…",1640,1653,765,4283326,0,844
"""C#CCOc1ccc(CNc…",1640,1653,205,4486646,0,9444
"""C#CCOc1ccc(CNc…",1640,1653,1653,1015728,0,9030
"""C#CCOc1ccc(CNc…",1640,1653,146,5301385,0,26814
"""C#CCOc1ccc(CNc…",1640,1653,439,5301385,0,26814
"""C#CCOc1ccc(CNc…",1640,1653,196,5301385,0,26814
"""C#CCOc1ccc(CNc…",1640,1653,253,5301385,0,26814
"""C#CCOc1ccc(CNc…",1640,1653,1219,5301385,0,26814
"""C#CCOc1ccc(CNc…",1640,1653,604,543172,0,844
"""C#CCOc1ccc(CNc…",1640,1653,121,2571428,0,864


In [10]:
train_df.columns

['molecule', 'bb1', 'bb2', 'bb3', 'BRD4', 'HSA', 'sEH', 'scaffold_idx']

In [11]:
train_df = train_df.with_columns(mol_per_scaf = df.filter(pl.col('ori') == 0)['mol_per_scaf'])
train_df

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf
str,u16,u16,u16,u8,u8,u8,i64,u32
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326,844
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646,9444
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728,9030
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385,26814
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385,26814
"""C#CCOc1ccc(CNc…",1640,1653,196,0,0,0,5301385,26814
"""C#CCOc1ccc(CNc…",1640,1653,253,0,0,0,5301385,26814
"""C#CCOc1ccc(CNc…",1640,1653,1219,0,0,0,5301385,26814
"""C#CCOc1ccc(CNc…",1640,1653,604,0,0,0,543172,844
"""C#CCOc1ccc(CNc…",1640,1653,121,0,0,0,2571428,864


In [12]:
train_df = train_df.with_columns((pl.col('BRD4') * 4 + pl.col('HSA') * 2 + pl.col('sEH')).alias('label'))
train_df.select(pl.n_unique('*'))

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
98415610,271,693,872,2,2,2,5711873,2388,8


In [13]:
del test_df, train_scaffolds, test_scaffolds
gc.collect()

11

In [None]:
1 / 500, 1/468

In [15]:
def train_test_split(df, test_only_bb_idxs, train_only_bb_idxs = None, random_state = 42):

    # 369_039 is estimated number of overlapped-BB samples for real test set (the remaining 508_983 is completely non-shared)
    stage23_factor = 369_039 / 508_983
    
    # 158_699 is estimated number of samples from Stage 2 (Scaffold split) for real test set
    stage2_factor = 158_699 / 508_983
    
    all_bbs = bbs_meta['all_bbs']
    # subset = 255: UNK
    df = df.with_columns(subset = pl.lit(255).cast(pl.UInt8))

    # Stage 1: BB-split
    # in this stage, we hold out a set of test-only building blocks
    print('##### STAGE 1 #####')
    print('Number of test-only building blocks:', len(test_only_bb_idxs))

    # For pre-defined competition train-test split, & or | is both equivalent
    # But for user-defined custom train-test split on train data, it really matters
    if train_only_bb_idxs is None:
        stage2_df = df.filter(~(pl.col('bb1').is_in(test_only_bb_idxs) | pl.col('bb2').is_in(test_only_bb_idxs) | pl.col('bb3').is_in(test_only_bb_idxs)))
    else:
        stage2_df = df.filter((pl.col('bb1').is_in(train_only_bb_idxs) & pl.col('bb2').is_in(train_only_bb_idxs) & pl.col('bb3').is_in(train_only_bb_idxs)))
    test1_df = df.filter(
        (pl.col('bb1').is_in(test_only_bb_idxs) | pl.col('bb2').is_in(test_only_bb_idxs) | pl.col('bb3').is_in(test_only_bb_idxs))
    )
    # subset = 0: non-share, i.e NONE of the BB is shared between train and test
    # subset = 1: at-least one BB is not shared, e.g A-B-C where A and C is shared, but B is not shared
    test1_df = test1_df.with_columns(
        pl.when((pl.col('bb1').is_in(test_only_bb_idxs) & pl.col('bb2').is_in(test_only_bb_idxs) & pl.col('bb3').is_in(test_only_bb_idxs)))\
        .then(0).otherwise(1).alias('subset'))

    # number of non-shared molecules
    test1_non_shared_df = test1_df.filter(pl.col('subset') == 0)
    num_non_shared = test1_non_shared_df.shape[0]
    stage2_test_size = round(stage2_factor * num_non_shared)
    stage23_test_size = round(stage23_factor * num_non_shared)
    print('Number of non-shared molecules:', test1_non_shared_df.shape)
    print(f'Stage2={stage2_test_size}, Stage3={stage23_test_size}')

    print('Stage 1 Test set statistic:')
    display(test1_df.select(pl.n_unique('*')))
    display(test1_df.describe(percentiles = PERCENTILES))
    print('Stage 1 NON-SHARED Test set statistic:')
    display(test1_non_shared_df.select(pl.n_unique('*')))
    display(test1_non_shared_df.describe(percentiles = PERCENTILES))
    print('Stage 1 Train set statistic:')
    display(stage2_df.select(pl.n_unique('*')))
    display(stage2_df.describe(percentiles = PERCENTILES))
    gc.collect()

    # Stage 2
    # Leave 20% molecules with most regular scaffolds: > 6116 mols/scaffold
    # Then do a Scaffold Split on the remaining 80% molecules
    # Scaffold Split is simply implemented with sklearn's GroupKFold
    print('##### STAGE 2 #####')
    stage2_df = stage2_df.with_columns(index = pl.Series(list(range(len(stage2_df)))))
    lower_stage2_df = stage2_df.filter(pl.col('mol_per_scaf').is_between(1, 6116))
    higher_stage2_df = stage2_df.filter((pl.col('mol_per_scaf') > 6116))
    if isinstance(stage2_test_size, int) and stage2_test_size > 1:
        stage2_test_pct = stage2_test_size / len(lower_stage2_df)
    else:
        raise ValueError
    stage2_n_folds = round(1 / stage2_test_pct)
    print('Number of folds:', stage2_n_folds)
    splitter = GroupKFold(n_splits=stage2_n_folds)
    # splitter = StratifiedGroupKFold(n_splits=stage2_n_folds, shuffle=True, random_state=random_state)
    for i, (tmp_train2_idxs, tmp_test2_idxs) in enumerate(splitter.split(range(len(lower_stage2_df)),
                                                                         lower_stage2_df['label'],
                                                                         lower_stage2_df['scaffold_idx'])):
        print(f"Fold {i}:")
        train2_idxs = lower_stage2_df[tmp_train2_idxs, 'index'].to_list()
        test2_idxs = lower_stage2_df[tmp_test2_idxs, 'index'].to_list()
        print('Fold train/test size:', len(train2_idxs), len(test2_idxs))
        break
    
    train2_df = stage2_df[train2_idxs]
    test2_df = stage2_df[test2_idxs].select(pl.col('*').exclude('index'))
    # subset = 2: scaffold split on lower-80% with overlapped BB (stage 2)
    test2_df = test2_df.with_columns(subset = pl.lit(2))
    stage3_df = pl.concat([train2_df, higher_stage2_df])

    print('Stage 2 Test set statistic:')
    display(test2_df.select(pl.n_unique('*')))
    display(test2_df.describe(percentiles = PERCENTILES))
    print('Stage 2 Train set statistic:')
    display(stage3_df.select(pl.n_unique('*')))
    display(stage3_df.describe(percentiles = PERCENTILES))
    gc.collect()

    # Stage 3: Random split on the remaining
    print('##### STAGE 3 #####')
    stage3_test_size = stage23_test_size - len(test2_df)
    if isinstance(stage3_test_size, int) and stage3_test_size > 1:
        stage3_test_pct = stage3_test_size / len(stage3_df)
    elif isinstance(stage3_test_size, float) and stage3_test_size < 1:
        stage3_test_pct = stage3_test_size
    else:
        raise ValueError

    stage3_n_folds = round(1 / stage3_test_pct)
    print('Number of folds:', stage3_n_folds, stage3_test_size)
    # splitter = KFold(n_splits=stage3_n_folds, shuffle=True, random_state=random_state)
    splitter = StratifiedKFold(n_splits=stage3_n_folds, shuffle=True, random_state=random_state)
    for i, (train3_idxs, test3_idxs) in enumerate(splitter.split(list(range(len(stage3_df))), stage3_df['label'])):
        print(f"Fold {i}:")
        print('Fold train/test size:', len(train3_idxs), len(test3_idxs))
        break
    train3_df = stage3_df[train3_idxs]
    test3_df = stage3_df[test3_idxs].select(pl.col('*').exclude('index'))
    # subset = 3: result of Stage 3: Random Split on the remaining
    test3_df = test3_df.with_columns(subset = pl.lit(3))
    
    print('Stage 3 Test set statistic:')
    display(test3_df.select(pl.n_unique('*')))
    display(test3_df.describe(percentiles = PERCENTILES))
    print('Stage 3 Train set statistic:')
    display(train3_df.select(pl.n_unique('*')))
    display(train3_df.describe(percentiles = PERCENTILES))
    gc.collect()

    test23_df = pl.concat([test2_df, test3_df])
    print('Test 2+3 set statistic:')
    display(test23_df.select(pl.n_unique('*')))
    display(test23_df.describe(percentiles = PERCENTILES))
    
    train_df = train3_df
    test_df = pl.concat([test1_df, test2_df, test3_df])

    print('Final Test set statistic:')
    display(test_df.select(pl.n_unique('*')))
    display(test_df.describe(percentiles = PERCENTILES))
    print('Final Train set statistic:')
    display(train_df.select(pl.n_unique('*')))
    display(train_df.describe(percentiles = PERCENTILES))
    gc.collect()

    train_df = train_df.select(pl.col('*').exclude('subset'))
    return train_df, test_df

In [16]:
test_only_bbs = set(bbs_meta['test_bbs']).difference(set(bbs_meta['train_bbs']))
all_bbs = bbs_meta['all_bbs']
test_only_bb_idxs = [all_bbs.index(e) for e in test_only_bbs]
len(test_only_bb_idxs)

965

In [None]:
train_df, test_df = train_test_split(df, test_only_bb_idxs, stage2_test_size = 158699, test_size = 369_039 , random_state = 42)

# Non-overlapped building blocks

In [20]:
train_bb1s = set(bbs_meta['train_bb1s'])
train_bb2s = set(bbs_meta['train_bb2s'])
train_bb3s = set(bbs_meta['train_bb3s'])
len(train_bb1s), len(train_bb2s), len(train_bb3s) 

(271, 693, 872)

In [21]:
((0.33 ** 3) * 0.65)** 0.33

0.28946110000386416

In [22]:
0.28946110000386416 ** 3

0.02425328803396033

In [23]:
0.33 ** 3

0.035937000000000004

In [24]:
from deepchem.splits.splitters import _generate_scaffold

def get_scaffolds(l):
    return [_generate_scaffold(e) for e in l]

In [25]:
all_bb_scaffolds = get_scaffolds(bbs_meta['all_bbs'])
len(all_bb_scaffolds)

2110

In [26]:
unique_bb_scaffolds = list(set(all_bb_scaffolds))
len(unique_bb_scaffolds)

614

In [None]:
BB_IDX_TO_SCAF_IDX = {i: unique_bb_scaffolds.index(e) for i, e in enumerate(all_bb_scaffolds)}
print(len(set(BB_IDX_TO_SCAF_IDX.keys())), len(set(BB_IDX_TO_SCAF_IDX.values())))
BB_IDX_TO_SCAF_IDX

In [None]:
df = df.with_columns(pl.col('bb1').replace(BB_IDX_TO_SCAF_IDX, default=-1, return_dtype=pl.UInt16).alias('bb1_scaffold_idx'),
                pl.col('bb2').replace(BB_IDX_TO_SCAF_IDX, default=-1, return_dtype= pl.UInt16).alias('bb2_scaffold_idx'),
                pl.col('bb3').replace(BB_IDX_TO_SCAF_IDX, default=-1, return_dtype = pl.UInt16).alias('bb3_scaffold_idx'),
               )
df

In [None]:
df.select((pl.col('bb1_scaffold_idx', 'bb2_scaffold_idx', 'bb3_scaffold_idx') ==  -1).any())

In [None]:
for col in ['bb1_scaffold_idx', 'bb2_scaffold_idx', 'bb3_scaffold_idx']:
    _train = df.filter(pl.col('ori') == 0)[col].unique().to_list()
    _test = df.filter((pl.col('bb1').is_in(test_only_bb_idxs) | pl.col('bb2').is_in(test_only_bb_idxs) | pl.col('bb3').is_in(test_only_bb_idxs)) & (pl.col('ori') == 1))[col].unique().to_list()
    print(len(_train), len(_test), len(set(_test).difference(set(_train))), len(set(_test).intersection(set(_train))))

In [None]:
10 / 62

In [None]:
49 / 222

In [None]:
85 / 270

In [None]:
df = df.with_columns(pl.col('molecule').count().over('bb1_scaffold_idx').alias('bb1_scaffold_freq'),
                    pl.col('molecule').count().over('bb2_scaffold_idx').alias('bb2_scaffold_freq'),
                    pl.col('molecule').count().over('bb3_scaffold_idx').alias('bb3_scaffold_freq')\
                    )
df

In [None]:
tmp = df.filter((pl.col('ori') == 1) & (pl.col('bb1').is_in(test_only_bb_idxs) | pl.col('bb2').is_in(test_only_bb_idxs) | pl.col('bb3').is_in(test_only_bb_idxs))).group_by('bb1_scaffold_idx').first().sort('bb1_scaffold_freq').select('bb1_scaffold_idx', 'bb1_scaffold_freq')
display(tmp)
tmp.to_pandas()['bb1_scaffold_freq'].hist(bins = 100)

In [None]:
tmp = df.filter(pl.col('ori') == 0).group_by('bb1_scaffold_idx').first().sort('bb1_scaffold_freq').select('bb1_scaffold_idx', 'bb1_scaffold_freq')
display(tmp)
tmp.to_pandas()['bb1_scaffold_freq'].hist(bins = 100)

In [None]:
tmp = df.filter(pl.col('ori') == 0).group_by('bb1_scaffold_idx').agg(pl.n_unique('bb1').alias('num_bb1s_per_scaf')).sort('num_bb1s_per_scaf')
display(tmp)
tmp.to_pandas()['num_bb1s_per_scaf'].hist(bins = 100)

In [None]:
tmp = df.filter((pl.col('ori') == 1) & (pl.col('bb1').is_in(test_only_bb_idxs) | pl.col('bb2').is_in(test_only_bb_idxs) | pl.col('bb3').is_in(test_only_bb_idxs))).group_by('bb1_scaffold_idx').agg(pl.n_unique('bb1').alias('num_bb1s_per_scaf')).sort('num_bb1s_per_scaf')
display(tmp)
tmp.to_pandas()['num_bb1s_per_scaf'].hist(bins = 100)

In [None]:
len(train_bb1s)

In [17]:
bb1s = list(set(bbs_meta['train_bb1s'] + bbs_meta['test_bb1s']))
bb2s = list(set(bbs_meta['train_bb2s'] + bbs_meta['test_bb2s']))
bb3s = list(set(bbs_meta['train_bb3s'] + bbs_meta['test_bb3s']))
len(bb1s), len(bb2s), len(bb3s)

(341, 1140, 1390)

In [18]:
pseu_test_bb1s = random.sample(bb1s, round(len(bb1s) * 0.2053))
pseu_test_bb2s = random.sample(bb2s, round(len(bb2s) * 0.3921))
pseu_test_bb3s = random.sample(bb3s, round(len(bb3s) * 0.373))
len(pseu_test_bb1s), len(pseu_test_bb2s), len(pseu_test_bb3s)

(70, 447, 518)

In [27]:
def analyze_set(s1, s2):
    s1 = set(s1)
    s2 = set(s2)
    overlap = s1.intersection(s2)
    diff = s2.difference(s1)
    union = s2.union(s1)
    print('Len:', len(s1), len(diff), len(s2))
    print('Overlap:', len(overlap), len(overlap) / len(union))
    print('Test only:', len(diff), len(union), len(diff) / len(union) )

In [28]:
analyze_set(get_scaffolds(set(bb1s).difference(pseu_test_bb1s)), get_scaffolds(pseu_test_bb1s))

Len: 82 11 32
Overlap: 21 0.22580645161290322
Test only: 11 93 0.11827956989247312


In [29]:
analyze_set(get_scaffolds(set(bb2s).difference(pseu_test_bb2s)), get_scaffolds(pseu_test_bb2s))

Len: 245 111 187
Overlap: 76 0.21348314606741572
Test only: 111 356 0.31179775280898875


In [30]:
analyze_set(get_scaffolds(set(bb3s).difference(pseu_test_bb3s)), get_scaffolds(pseu_test_bb3s))

Len: 291 124 215
Overlap: 91 0.21927710843373494
Test only: 124 415 0.2987951807228916


In [None]:
len(train_bb1s), len(train_bb2s), len(train_bb3s)

In [None]:
train_scaffolds = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_scaffold.csv').collect()
train_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')\
    .select(
        pl.col('molecule'),
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('BRD4', 'HSA', 'eSH').cast(pl.UInt8),
        scaffold_idx = train_scaffolds['scaffold_idx'],
    )\
    .collect()
print(train_df.estimated_size('mb'))
train_df

In [None]:
train_df = train_df.with_columns((pl.col('BRD4') * 4 + pl.col('HSA') * 2 + pl.col('eSH')).alias('label'))
train_df.select(pl.n_unique('*'))

In [None]:
train_df.filter(pl.col('label') > 0)['label'].to_pandas().hist(bins = 7)

In [None]:
train_df.filter(pl.col('label') > 4)['label'].to_pandas().hist(bins = 7)

In [None]:
set(train_bb1s).intersection(set(train_bb2s))

In [None]:
set(train_bb1s).intersection(set(train_bb3s))

In [None]:
len(set(train_bb2s).intersection(set(train_bb3s)))

In [None]:
train_df.filter(pl.col('bb2') == pl.col('bb3'))

In [None]:
train_df.filter((pl.col('bb2') != pl.col('bb3')) & ((pl.col('bb2').cast(pl.String) + '_' + pl.col('bb3').cast(pl.String)) == (pl.col('bb3').cast(pl.String) + '_' + pl.col('bb2').cast(pl.String)) ))

In [None]:
train_df.filter(pl.col('label') > 0)['label'].to_pandas().hist(bins = 7)

In [None]:
train_df['label'].n_unique()

In [None]:
train_df['bb1'].n_unique()

In [None]:
splitter = StratifiedGroupKFold(n_splits=8, shuffle=True, random_state=42)

fold_meta = {}
for i, (train_idxs, test_idxs) in enumerate(splitter.split(list(range(len(train_df))), train_df['label'], train_df['bb1'])):
    fold_meta[i] = {}
    fold_meta[i]['train'] = train_df[train_idxs, 'bb1'].unique().to_list()
    fold_meta[i]['test'] = train_df[test_idxs, 'bb1'].unique().to_list()
    assert len(fold_meta[i]['train'] + fold_meta[i]['test']) == 271
    assert len(set(fold_meta[i]['test']).intersection(set(fold_meta[i]['train']))) == 0
    print(f"Fold {i}: {len(fold_meta[i]['train'])} + {len(fold_meta[i]['test'])}")

bb1_fold_meta = fold_meta

In [None]:
for fold_idx, fold_bb1s in bb1_fold_meta.items():
    test_df = train_df.filter(pl.col('bb1').is_in(fold_bb1s['test']))
    print(fold_idx, test_df.shape)
    print(test_df.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100))
    test_df.filter(pl.col('label') > 0)['label'].to_pandas().hist(bins = 7)
    from matplotlib import pyplot as plt
    plt.show()

In [None]:
bb23_fold_meta = {}
for bb1_fold_idx, fold_bb1s in bb1_fold_meta.items():
    round1_test_df = train_df.filter(pl.col('bb1').is_in(fold_bb1s['test']))
    print('BB1 fold len:', round1_test_df.shape)
    labels = []
    groups = []
    unique_bb23s = [bbs_meta['all_bbs'].index(e) for e in list(train_bb2s.union(train_bb3s))]
    print('Num building blocks 2 + 3:', len(unique_bb23s))
    for bb in unique_bb23s:
        tmp_df = round1_test_df.filter((pl.col('bb2') == bb) | (pl.col('bb3') == bb))
        labels.extend(tmp_df['label'])
        groups.extend([bb] * len(tmp_df))
    print(len(labels), len(groups))
    splitter = StratifiedGroupKFold(n_splits=8, shuffle=True, random_state=42)
    fold_meta = {}
    for i, (train_idxs, test_idxs) in enumerate(splitter.split(labels, labels, groups)):
        fold_meta[i] = {}
        fold_meta[i]['train'] = np.unique(np.array(groups)[train_idxs]).tolist() 
        fold_meta[i]['test'] =  np.unique(np.array(groups)[test_idxs]).tolist()
        assert len(fold_meta[i]['train'] + fold_meta[i]['test']) == len(unique_bb23s)
        assert len(set(fold_meta[i]['test']).intersection(set(fold_meta[i]['train']))) == 0
        print(f"Fold {i}: {len(fold_meta[i]['train'])} + {len(fold_meta[i]['test'])}")
    bb23_fold_meta[bb1_fold_idx] = fold_meta

In [None]:
bb23_fold_meta

In [None]:
global_fold_meta = {}
for bb1_fold_idx, v in bb23_fold_meta.items():
    global_fold_meta[bb1_fold_idx] = {}
    global_fold_meta[bb1_fold_idx]['bb1'] = bb1_fold_meta[bb1_fold_idx]
    global_fold_meta[bb1_fold_idx]['bb23'] = v

global_fold_meta

In [None]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/bb_split_8_8.json', 'w') as f:
    json.dump(global_fold_meta, f)

In [None]:
tmp = train_df.filter((pl.col('bb1').is_in(global_fold_meta[0]['bb1']['test'])) | \
                (pl.col('bb2').is_in(global_fold_meta[0]['bb23'][0]['test'])) | \
                (pl.col('bb3').is_in(global_fold_meta[0]['bb23'][0]['test']))
               )
tmp.shape

In [None]:
train_df.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100)

In [None]:
tmp.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100)

In [None]:
BB1_SPLIT = 8
BB23_SPLIT = 8
for i1 in range(BB1_SPLIT):
    for i23 in range(BB23_SPLIT):
        cur_idx = BB23_SPLIT * i1 + i23
        test_bb1s = global_fold_meta[i1]['bb1']['test']
        test_bb23s = global_fold_meta[i1]['bb23'][i23]['test']
        
        tmp_test = train_df.filter((pl.col('bb1').is_in(test_bb1s)) | \
            (pl.col('bb2').is_in(test_bb23s)) | \
            (pl.col('bb3').is_in(test_bb23s))
        )

        print(f'--------- {cur_idx} ----------')
        print(f'{len(tmp_test)} ({round(len(tmp_test) / len(train_df) * 100, 1)} %)')
        print('building blocks:', len(test_bb1s), len(test_bb23s))
        print(tmp_test.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100))
        print('\n\n')

In [None]:
train_df

In [34]:
BB1_SPLIT = 8
BB23_SPLIT = 8

with open(f'/home/dangnh36/datasets/competitions/leash_belka/processed/meta/bb_split_{BB1_SPLIT}_{BB23_SPLIT}.json', 'r') as f:
    global_fold_meta = json.load(f)

In [32]:
BB1_FOLD = str(4)
BB23_FOLD = str(4)
test_only_bb1_idxs = global_fold_meta[BB1_FOLD]['bb1']['test']
test_only_bb23_idxs = global_fold_meta[BB1_FOLD]['bb23'][BB23_FOLD]['test']

assert len(set(test_only_bb1_idxs).intersection(set(test_only_bb23_idxs))) == 0

test_only_bb_idxs = set(test_only_bb1_idxs).union(set(test_only_bb23_idxs))
len(test_only_bb1_idxs), len(test_only_bb23_idxs), len(test_only_bb_idxs)

(33, 111, 144)

In [None]:
100_000_000 / 8

In [None]:
31830221 / (34 * 116 * 116)

In [None]:
508_983 / (70 * 447 * 518)

In [None]:
tmp = train_df.filter((pl.col('bb1').is_in(test_only_bb1_idxs)) & \
                (pl.col('bb2').is_in(test_only_bb23_idxs)) & \
                (pl.col('bb3').is_in(test_only_bb23_idxs))
               )
print(tmp.shape)
tmp.shape[0] / train_df.shape[0] * 100

In [None]:
tmp.select(pl.n_unique('*')).to_dicts()

In [None]:
tmp.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100).to_dicts()

In [None]:
def get_statistic(df, test_only_bb1_idxs, test_only_bb23_idxs):
    ret = {}
    non_share = train_df.filter((pl.col('bb1').is_in(test_only_bb1_idxs)) & \
                (pl.col('bb2').is_in(test_only_bb23_idxs)) & \
                (pl.col('bb3').is_in(test_only_bb23_idxs))
               )
    ret['_non_share_count'] = non_share.shape[0]
    ret['_non_share_pct'] = non_share.shape[0] / train_df.shape[0] * 100
    ret.update({f'_non_share_unique_{k}': v for k, v in non_share.select(pl.n_unique('*')).to_dicts()[0].items()})
    ret.update({f'_non_share_pct_{k}': v for k, v in non_share.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100).to_dicts()[0].items()})

    one_non_share = train_df.filter((pl.col('bb1').is_in(test_only_bb1_idxs)) | \
                (pl.col('bb2').is_in(test_only_bb23_idxs)) | \
                (pl.col('bb3').is_in(test_only_bb23_idxs))
               )
    ret['count'] = one_non_share.shape[0]
    ret['pct'] = one_non_share.shape[0] / train_df.shape[0] * 100
    ret.update({f'_unique_{k}': v for k, v in one_non_share.select(pl.n_unique('*')).to_dicts()[0].items()})
    ret.update({f'_pct_{k}': v for k, v in one_non_share.select(pl.col('BRD4', 'HSA', 'eSH').mean() * 100).to_dicts()[0].items()})
    
    return ret
    

In [None]:
all_stats = []
for bb1_fold_idx in range(BB1_SPLIT):
    for bb23_fold_idx in range(BB23_SPLIT):
        print(f'\n\n--------------- {bb1_fold_idx} {bb23_fold_idx} ------------------')
        test_only_bb1_idxs = global_fold_meta[str(bb1_fold_idx)]['bb1']['test']
        test_only_bb23_idxs = global_fold_meta[str(bb1_fold_idx)]['bb23'][str(bb23_fold_idx)]['test']
        assert len(set(test_only_bb1_idxs).intersection(set(test_only_bb23_idxs))) == 0
        test_only_bb_idxs = set(test_only_bb1_idxs).union(set(test_only_bb23_idxs))
        print(len(test_only_bb1_idxs), len(test_only_bb23_idxs), len(test_only_bb_idxs))
        stats = get_statistic(train_df, test_only_bb1_idxs, test_only_bb23_idxs)
        global_fold_meta[str(bb1_fold_idx)]['bb23'][str(bb23_fold_idx)]['stats'] = stats
        all_stats.append([bb1_fold_idx, bb23_fold_idx, stats])
        print(stats)

In [None]:
all_stats

In [None]:
sorted(all_stats, key = lambda x: (x[2]['_non_share_pct_eSH'], x[2]['_non_share_unique_molecule']), reverse=True)

## Best is 4-6 -> 1-7

In [35]:
global_fold_meta['4']['bb23']['6']['stats']

KeyError: 'stats'

In [None]:
global_fold_meta['1']['bb23']['7']['stats']

In [36]:
BB1_FOLD = str(4)
BB23_FOLD = str(6)
test_only_bb1_idxs = global_fold_meta[BB1_FOLD]['bb1']['test']
test_only_bb23_idxs = global_fold_meta[BB1_FOLD]['bb23'][BB23_FOLD]['test']
train_only_bb1_idxs = global_fold_meta[BB1_FOLD]['bb1']['train']
train_only_bb23_idxs = global_fold_meta[BB1_FOLD]['bb23'][BB23_FOLD]['train']


assert len(set(test_only_bb1_idxs).intersection(set(test_only_bb23_idxs))) == 0
assert len(set(train_only_bb1_idxs).intersection(set(train_only_bb23_idxs))) == 0

train_only_bb_idxs = set(train_only_bb1_idxs).union(set(train_only_bb23_idxs))
test_only_bb_idxs = set(test_only_bb1_idxs).union(set(test_only_bb23_idxs))

assert len(set(train_only_bb_idxs).intersection(set(test_only_bb_idxs))) == 0
assert len(train_only_bb_idxs.union(test_only_bb_idxs)) == len(bbs_meta['train_bbs'])

len(test_only_bb1_idxs), len(test_only_bb23_idxs), len(test_only_bb_idxs)

(33, 103, 136)

In [37]:
train_df.shape

(98415610, 10)

In [38]:
train_df.head(5)

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label
str,u16,u16,u16,u8,u8,u8,i64,u32,u8
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326,844,0
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646,9444,0
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728,9030,0
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385,26814,0
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385,26814,0


In [39]:
train_df = train_df.with_columns(ori_index = pl.Series(list(range(len(train_df)))))
train_df

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index
str,u16,u16,u16,u8,u8,u8,i64,u32,u8,i64
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326,844,0,0
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646,9444,0,1
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728,9030,0,2
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385,26814,0,3
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385,26814,0,4
"""C#CCOc1ccc(CNc…",1640,1653,196,0,0,0,5301385,26814,0,5
"""C#CCOc1ccc(CNc…",1640,1653,253,0,0,0,5301385,26814,0,6
"""C#CCOc1ccc(CNc…",1640,1653,1219,0,0,0,5301385,26814,0,7
"""C#CCOc1ccc(CNc…",1640,1653,604,0,0,0,543172,844,0,8
"""C#CCOc1ccc(CNc…",1640,1653,121,0,0,0,2571428,864,0,9


In [40]:
final_train_df, final_test_df = train_test_split(train_df, test_only_bb_idxs, train_only_bb_idxs = None, random_state = 42)

##### STAGE 1 #####
Number of test-only building blocks: 136
Number of non-shared molecules: (173831, 12)
Stage2=54200, Stage3=126036
Stage 1 Test set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
31579445,271,693,872,2,2,2,2548438,2388,8,31579445,2


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""31579445""",31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0,31579445.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1119.771436,1069.935063,1070.24005,0.004371,0.003866,0.006899,3042400.0,18437.700256,0.032116,48620000.0,0.994495
"""std""",,587.45225,616.061992,613.196835,0.06597,0.06206,0.082771,1726100.0,65057.518335,0.306994,27679000.0,0.073988
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"""1%""",,36.0,25.0,20.0,0.0,0.0,0.0,68630.0,1.0,0.0,315794.0,1.0
"""2.5%""",,76.0,49.0,46.0,0.0,0.0,0.0,158172.0,2.0,0.0,2248088.0,1.0
"""5%""",,171.0,110.0,113.0,0.0,0.0,0.0,289734.0,3.0,0.0,5509581.0,1.0
"""10%""",,346.0,154.0,206.0,0.0,0.0,0.0,614327.0,9.0,0.0,10203882.0,1.0
"""25%""",,603.0,547.0,543.0,0.0,0.0,0.0,1555552.0,72.0,0.0,26148531.0,1.0


Stage 1 NON-SHARED Test set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
173831,33,92,103,2,2,2,29933,1460,6,173831,1


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""173831""",173831.0,173831.0,173831.0,173831.0,173831.0,173831.0,173831.0,173831.0,173831.0,173831.0,173831.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1193.623629,1046.8883,1144.719872,0.003291,0.003095,0.006196,3102200.0,27788.718215,0.025548,47216000.0,0.0
"""std""",,573.001427,624.412412,611.461461,0.057269,0.055546,0.078469,1731800.0,81885.594977,0.270206,25784000.0,0.0
"""min""","""C#CCOc1cccc(CN…",51.0,24.0,24.0,0.0,0.0,0.0,10.0,1.0,0.0,486.0,0.0
"""1%""",,51.0,24.0,28.0,0.0,0.0,0.0,73471.0,1.0,0.0,125117.0,0.0
"""2.5%""",,51.0,84.0,84.0,0.0,0.0,0.0,153758.0,2.0,0.0,289302.0,0.0
"""5%""",,171.0,120.0,135.0,0.0,0.0,0.0,280549.0,4.0,0.0,5677406.0,0.0
"""10%""",,452.0,150.0,206.0,0.0,0.0,0.0,632475.0,12.0,0.0,12468804.0,0.0
"""25%""",,766.0,489.0,578.0,0.0,0.0,0.0,1621649.0,87.0,0.0,28795947.0,0.0


Stage 1 Train set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
66836165,238,601,769,2,2,2,4366332,2388,8,66836165,1


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""66836165""",66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0,66836165.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.40254,1079.389182,1033.020908,0.004772,0.004284,0.007581,3022700.0,14094.078724,0.035236,49486000.0,255.0
"""std""",,591.520265,609.722111,610.55442,0.068913,0.06531,0.086738,1724700.0,55789.476605,0.321417,28745000.0,0.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,363110.0,255.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63746.0,1.0,0.0,1227687.0,255.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,155521.0,1.0,0.0,2523805.0,255.0
"""5%""",,141.0,90.0,98.0,0.0,0.0,0.0,286393.0,3.0,0.0,4686832.0,255.0
"""10%""",,273.0,158.0,205.0,0.0,0.0,0.0,606739.0,8.0,0.0,9735670.0,255.0
"""25%""",,557.0,586.0,505.0,0.0,0.0,0.0,1541764.0,48.0,0.0,23786674.0,255.0


##### STAGE 2 #####
Number of folds: 1004
Fold 0:
Fold train/test size: 54358658 54197
Stage 2 Test set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
54197,238,601,748,2,2,2,4348,131,5,54197,1


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""54197""",54197.0,54197.0,54197.0,54197.0,54197.0,54197.0,54197.0,54197.0,54197.0,54197.0,54197.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1061.666365,1148.025887,1031.076646,0.004318,0.005074,0.00738,3035500.0,1029.549403,0.034799,50439000.0,2.0
"""std""",,595.176041,600.693238,591.07644,0.065567,0.071052,0.085593,1508600.0,1831.526517,0.313378,29322000.0,0.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0,366076.0,2.0
"""1%""",,21.0,26.0,9.0,0.0,0.0,0.0,58419.0,1.0,0.0,1238994.0,2.0
"""2.5%""",,104.0,46.0,34.0,0.0,0.0,0.0,242757.0,1.0,0.0,2597074.0,2.0
"""5%""",,138.0,100.0,81.0,0.0,0.0,0.0,483967.0,2.0,0.0,4730653.0,2.0
"""10%""",,262.0,249.0,215.0,0.0,0.0,0.0,900559.0,5.0,0.0,10396856.0,2.0
"""25%""",,554.0,718.0,579.0,0.0,0.0,0.0,2018678.0,39.0,0.0,24830186.0,2.0


Stage 2 Train set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
66781968,238,601,769,2,2,2,4361984,2387,8,66781968,1,66781968


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""66781968""",66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0,66781968.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.412876,1079.33348,1033.022486,0.004772,0.004283,0.007581,3022700.0,14104.681262,0.035236,49485000.0,255.0,33418000.0
"""std""",,591.517182,609.726251,610.569976,0.068916,0.065305,0.086739,1724900.0,55810.843695,0.321424,28744000.0,0.0,19294000.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,363110.0,255.0,0.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63753.0,1.0,0.0,1227678.0,255.0,668353.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,155462.0,1.0,0.0,2523747.0,255.0,1670854.0
"""5%""",,141.0,90.0,98.0,0.0,0.0,0.0,286317.0,3.0,0.0,4686813.0,255.0,3341790.0
"""10%""",,273.0,158.0,205.0,0.0,0.0,0.0,606564.0,8.0,0.0,9734204.0,255.0,6683312.0
"""25%""",,557.0,586.0,505.0,0.0,0.0,0.0,1541230.0,48.0,0.0,23786062.0,255.0,16708502.0


##### STAGE 3 #####
Number of folds: 930 71839
Fold 0:
Fold train/test size: 66710159 71809
Stage 3 Test set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
71809,238,601,763,2,2,2,48571,2090,5,71809,1


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""71809""",71809.0,71809.0,71809.0,71809.0,71809.0,71809.0,71809.0,71809.0,71809.0,71809.0,71809.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.212592,1080.311716,1034.187372,0.004763,0.004289,0.007576,3026800.0,14191.580248,0.035205,49627000.0,3.0
"""std""",,591.663628,610.058349,609.364225,0.068848,0.065352,0.086709,1721500.0,55562.495943,0.321083,28744000.0,0.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0,364453.0,3.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63782.0,1.0,0.0,1224562.0,3.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,153533.0,1.0,0.0,2496270.0,3.0
"""5%""",,141.0,91.0,100.0,0.0,0.0,0.0,290571.0,3.0,0.0,4609667.0,3.0
"""10%""",,273.0,158.0,208.0,0.0,0.0,0.0,609847.0,8.0,0.0,9697233.0,3.0
"""25%""",,557.0,586.0,506.0,0.0,0.0,0.0,1549324.0,48.0,0.0,23956748.0,3.0


Stage 3 Train set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
66710159,238,601,769,2,2,2,4359915,2387,8,66710159,1,66710159


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""66710159""",66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.413092,1079.332427,1033.021232,0.004772,0.004283,0.007581,3022700.0,14104.587721,0.035236,49485000.0,255.0,33417000.0
"""std""",,591.517029,609.725897,610.571276,0.068916,0.065305,0.086739,1724900.0,55811.110771,0.321424,28744000.0,0.0,19294000.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,363110.0,255.0,0.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63750.0,1.0,0.0,1227679.0,255.0,668354.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,155462.0,1.0,0.0,2523782.0,255.0,1670883.0
"""5%""",,141.0,90.0,98.0,0.0,0.0,0.0,286317.0,3.0,0.0,4686871.0,255.0,3341846.0
"""10%""",,273.0,158.0,205.0,0.0,0.0,0.0,606564.0,8.0,0.0,9734228.0,255.0,6683334.0
"""25%""",,557.0,586.0,505.0,0.0,0.0,0.0,1541230.0,48.0,0.0,23785911.0,255.0,16708368.0


Test 2+3 set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
126006,238,601,765,2,2,2,52919,2092,5,126006,2


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""126006""",126006.0,126006.0,126006.0,126006.0,126006.0,126006.0,126006.0,126006.0,126006.0,126006.0,126006.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1068.816279,1109.436559,1032.849404,0.004571,0.004627,0.007492,3030600.0,8530.400735,0.03503,49976000.0,2.569886
"""std""",,593.207081,606.972168,601.56613,0.067456,0.067863,0.08623,1633400.0,42464.594824,0.317791,28997000.0,0.495094
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0,364453.0,2.0
"""1%""",,21.0,26.0,9.0,0.0,0.0,0.0,60247.0,1.0,0.0,1234757.0,2.0
"""2.5%""",,76.0,46.0,34.0,0.0,0.0,0.0,170662.0,1.0,0.0,2533940.0,2.0
"""5%""",,138.0,92.0,92.0,0.0,0.0,0.0,355565.0,2.0,0.0,4649220.0,2.0
"""10%""",,271.0,207.0,215.0,0.0,0.0,0.0,690948.0,6.0,0.0,9962280.0,2.0
"""25%""",,556.0,626.0,534.0,0.0,0.0,0.0,1729429.0,47.0,0.0,24576558.0,2.0


Final Test set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
31705451,271,693,872,2,2,2,2560137,2388,8,31705451,4


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""31705451""",31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1119.568927,1070.092052,1070.09145,0.004372,0.003869,0.006901,3042400.0,18398.32598,0.032127,48625000.0,1.000756
"""std""",,587.48397,616.031136,613.155552,0.065976,0.062085,0.082785,1725700.0,64986.265685,0.307038,27684000.0,0.127479
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"""1%""",,36.0,25.0,20.0,0.0,0.0,0.0,68587.0,1.0,0.0,317055.0,1.0
"""2.5%""",,76.0,49.0,45.0,0.0,0.0,0.0,158224.0,2.0,0.0,2248638.0,1.0
"""5%""",,171.0,110.0,113.0,0.0,0.0,0.0,290045.0,3.0,0.0,5508469.0,1.0
"""10%""",,346.0,154.0,206.0,0.0,0.0,0.0,614667.0,9.0,0.0,10203600.0,1.0
"""25%""",,603.0,547.0,543.0,0.0,0.0,0.0,1555939.0,72.0,0.0,26140722.0,1.0


Final Train set statistic:


molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
66710159,238,601,769,2,2,2,4359915,2387,8,66710159,1,66710159


statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset,index
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""66710159""",66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.413092,1079.332427,1033.021232,0.004772,0.004283,0.007581,3022700.0,14104.587721,0.035236,49485000.0,255.0,33417000.0
"""std""",,591.517029,609.725897,610.571276,0.068916,0.065305,0.086739,1724900.0,55811.110771,0.321424,28744000.0,0.0,19294000.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,363110.0,255.0,0.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63750.0,1.0,0.0,1227679.0,255.0,668354.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,155462.0,1.0,0.0,2523782.0,255.0,1670883.0
"""5%""",,141.0,90.0,98.0,0.0,0.0,0.0,286317.0,3.0,0.0,4686871.0,255.0,3341846.0
"""10%""",,273.0,158.0,205.0,0.0,0.0,0.0,606564.0,8.0,0.0,9734228.0,255.0,6683334.0
"""25%""",,557.0,586.0,505.0,0.0,0.0,0.0,1541230.0,48.0,0.0,23785911.0,255.0,16708368.0


In [41]:
final_test_df.shape, final_train_df.shape

((31705451, 12), (66710159, 12))

In [43]:
final_test_df.select(pl.col('BRD4', 'HSA', 'sEH').mean() * 100)

BRD4,HSA,sEH
f64,f64,f64
0.43719,0.386946,0.690096


In [44]:
final_test_df.group_by('subset').count().sort('subset')

subset,count
i32,u32
0,173831
1,31405614
2,54197
3,71809


In [2]:
31705451 + 66710159

98415610

In [4]:
(173831 / 98415610) * 100

0.17662950013722417

In [45]:
final_test_df.select(pl.n_unique('*'))

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
31705451,271,693,872,2,2,2,2560137,2388,8,31705451,4


In [46]:
final_test_df.describe(percentiles=PERCENTILES)

statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,subset
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""31705451""",31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0,31705451.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1119.568927,1070.092052,1070.09145,0.004372,0.003869,0.006901,3042400.0,18398.32598,0.032127,48625000.0,1.000756
"""std""",,587.48397,616.031136,613.155552,0.065976,0.062085,0.082785,1725700.0,64986.265685,0.307038,27684000.0,0.127479
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"""1%""",,36.0,25.0,20.0,0.0,0.0,0.0,68587.0,1.0,0.0,317055.0,1.0
"""2.5%""",,76.0,49.0,45.0,0.0,0.0,0.0,158224.0,2.0,0.0,2248638.0,1.0
"""5%""",,171.0,110.0,113.0,0.0,0.0,0.0,290045.0,3.0,0.0,5508469.0,1.0
"""10%""",,346.0,154.0,206.0,0.0,0.0,0.0,614667.0,9.0,0.0,10203600.0,1.0
"""25%""",,603.0,547.0,543.0,0.0,0.0,0.0,1555939.0,72.0,0.0,26140722.0,1.0


In [50]:
final_train_df.select(pl.col('BRD4', 'HSA', 'sEH').mean() * 100)

BRD4,HSA,sEH
f64,f64,f64
0.477215,0.428311,0.758106


In [48]:
final_train_df.select(pl.n_unique('*'))

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,index
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
66710159,238,601,769,2,2,2,4359915,2387,8,66710159,66710159


In [49]:
final_train_df.describe(percentiles=PERCENTILES)

statistic,molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index,index
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""66710159""",66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0,66710159.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,1074.413092,1079.332427,1033.021232,0.004772,0.004283,0.007581,3022700.0,14104.587721,0.035236,49485000.0,33417000.0
"""std""",,591.517029,609.725897,610.571276,0.068916,0.065305,0.086739,1724900.0,55811.110771,0.321424,28744000.0,19294000.0
"""min""","""C#CCOc1ccc(CNc…",7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,363110.0,0.0
"""1%""",,21.0,26.0,15.0,0.0,0.0,0.0,63750.0,1.0,0.0,1227679.0,668354.0
"""2.5%""",,76.0,45.0,44.0,0.0,0.0,0.0,155462.0,1.0,0.0,2523782.0,1670883.0
"""5%""",,141.0,90.0,98.0,0.0,0.0,0.0,286317.0,3.0,0.0,4686871.0,3341846.0
"""10%""",,273.0,158.0,205.0,0.0,0.0,0.0,606564.0,8.0,0.0,9734228.0,6683334.0
"""25%""",,557.0,586.0,505.0,0.0,0.0,0.0,1541230.0,48.0,0.0,23785911.0,16708368.0


In [51]:
len(final_test_df['ori_index'].to_list())

31705451

In [56]:
final_test_df.select('ori_index', 'subset').rename({'ori_index': 'index'}).write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/val.csv')

In [57]:
final_train_df.select('ori_index').rename({'ori_index': 'index'}).write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/train.csv')

In [4]:
tmp = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/train.csv').collect().sort('index')
tmp

index
i64
363110
363111
363112
363113
363114
363115
363116
363118
363119
363120


In [15]:
train_df = train_df.with_columns(ori_index = pl.Series(list(range(len(train_df)))))
train_df

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index
str,u16,u16,u16,u8,u8,u8,i64,u32,u8,i64
"""C#CCOc1ccc(CNc…",1640,1653,765,0,0,0,4283326,844,0,0
"""C#CCOc1ccc(CNc…",1640,1653,205,0,0,0,4486646,9444,0,1
"""C#CCOc1ccc(CNc…",1640,1653,1653,0,0,0,1015728,9030,0,2
"""C#CCOc1ccc(CNc…",1640,1653,146,0,0,0,5301385,26814,0,3
"""C#CCOc1ccc(CNc…",1640,1653,439,0,0,0,5301385,26814,0,4
"""C#CCOc1ccc(CNc…",1640,1653,196,0,0,0,5301385,26814,0,5
"""C#CCOc1ccc(CNc…",1640,1653,253,0,0,0,5301385,26814,0,6
"""C#CCOc1ccc(CNc…",1640,1653,1219,0,0,0,5301385,26814,0,7
"""C#CCOc1ccc(CNc…",1640,1653,604,0,0,0,543172,844,0,8
"""C#CCOc1ccc(CNc…",1640,1653,121,0,0,0,2571428,864,0,9


In [16]:
train_df = train_df[tmp['index']]
train_df

molecule,bb1,bb2,bb3,BRD4,HSA,sEH,scaffold_idx,mol_per_scaf,label,ori_index
str,u16,u16,u16,u8,u8,u8,i64,u32,u8,i64
"""C#CCOc1ccc(CNc…",982,1653,765,0,0,0,4283326,844,0,363110
"""C#CCOc1ccc(CNc…",982,1653,205,0,0,0,4486646,9444,0,363111
"""C#CCOc1ccc(CNc…",982,1653,1653,0,0,0,1015728,9030,0,363112
"""C#CCOc1ccc(CNc…",982,1653,146,0,0,0,5301385,26814,0,363113
"""C#CCOc1ccc(CNc…",982,1653,439,0,0,0,5301385,26814,0,363114
"""C#CCOc1ccc(CNc…",982,1653,196,0,0,0,5301385,26814,0,363115
"""C#CCOc1ccc(CNc…",982,1653,253,0,0,0,5301385,26814,0,363116
"""C#CCOc1ccc(CNc…",982,1653,604,0,0,0,543172,844,0,363118
"""C#CCOc1ccc(CNc…",982,1653,121,0,0,0,2571428,864,0,363119
"""C#CCOc1ccc(CNc…",982,1653,728,0,0,0,5084942,7616,0,363120


In [17]:
tmp = train_df.select(pl.col('label'), pl.col('ori_index').alias('index'))
tmp

label,index
u8,i64
0,363110
0,363111
0,363112
0,363113
0,363114
0,363115
0,363116
0,363118
0,363119
0,363120


In [20]:
tmp.filter(pl.col('label') != 0).shape[0] / tmp.shape[0]

0.015779035993603314

In [22]:
tmp.filter(pl.col('label') != 0).shape

(1052622, 2)

In [23]:
tmp.shape

(66710159, 2)

In [21]:
tmp.write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/train.csv')

In [None]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/fold0.json', 'w') as f:
    json.dump(
        {
            # 'train': final_train_df['ori_index'].to_list(),
            'val': final_test_df['ori_index'].to_list()
        },
        f)

In [None]:
!head /home/dangnh36/datasets/competitions/leash_belka/processed/meta/fold0.json