In [1]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace,Split,ByteLevel
from tokenizers.normalizers import Lowercase, NFKC
import os
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
from tqdm import tqdm
import time
import json
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# Build the lazy query over the processed test CSV, keeping only the molecule
# column, the three building-block columns (cast to UInt16) and the three id
# columns, then materialize it into an in-memory DataFrame.
test_df = (
    pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v3.csv')
    .select(
        pl.col('molecule'),
        pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('id_BRD4', 'id_HSA', 'id_sEH'),
    )
    .collect()
)
# Report the estimated in-memory footprint before displaying the frame.
size_gb = test_df.estimated_size('gb')
print(size_gb, 'GB')
test_df

0.08581358101218939 GB


molecule,bb1,bb2,bb3,id_BRD4,id_HSA,id_sEH
str,u16,u16,u16,i64,i64,i64
"""C#CCCC[C@H](Nc…",1989,409,409,295246830,295246831,295246832
"""C#CCCC[C@H](Nc…",1989,409,1012,295246833,295246834,295246835
"""C#CCCC[C@H](Nc…",1989,409,1722,295246836,295246837,295246838
"""C#CCCC[C@H](Nc…",1989,409,1078,295246839,295246840,295246841
"""C#CCCC[C@H](Nc…",1989,409,605,295246842,295246843,295246844
…,…,…,…,…,…,…
"""Cn1ncc2cc(Nc3n…",141,1699,307,296921711,296921712,296921713
"""[N-]=[N+]=NCCC…",141,1699,1254,296921714,296921715,296921716
"""COC(=O)c1ccnc(…",141,1415,1390,296921717,296921718,296921719
"""COC1CCC(CCNc2n…",141,1415,1556,296921720,296921721,296921722


https://www.kaggle.com/competitions/leash-BELKA/discussion/496576

**Public LB**
- 50% of shared BBs: 184,519 per protein.
- group 1 OR group 2: 11,271 per protein.

*Rounded to nearest 10K: 200K "validation" per protein*

**Private LB**
- 50% of shared BBs: 184,520 per protein
- group 1 OR group 2: 11,322 per protein.
- The non-triazine core group: 166,667 per protein on average.

*Rounded to nearest 10K: 360K "test" per protein*

In [10]:
# Read the building-block metadata JSON and print, for every top-level key,
# how many entries it holds.
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/building_blocks.json') as meta_file:
    bb_meta = json.load(meta_file)
for meta_key in bb_meta:
    print(meta_key, len(bb_meta[meta_key]))

train_bbs 1145
train_bb1s 271
train_bb2s 693
train_bb3s 872
test_bb1s 341
test_bb2s 1140
test_bb3s 1389
test_bbs 2110
all_bbs 2110


In [9]:
# Read the building-block cluster JSON (cluster id -> list of building blocks)
# and print the size of each cluster.
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/meta/building_blocks_cluster.json') as cluster_file:
    bb_clusters = json.load(cluster_file)

for cluster_key in bb_clusters:
    print(cluster_key, len(bb_clusters[cluster_key]))

0 1145
1 859
2 53
3 53


In [11]:
# Add a `mol_group` column filled with the placeholder -1 ("not assigned yet").
test_df = test_df.with_columns(mol_group=pl.lit(-1))
test_df

molecule,bb1,bb2,bb3,id_BRD4,id_HSA,id_sEH,mol_group
str,u16,u16,u16,i64,i64,i64,i32
"""C#CCCC[C@H](Nc…",1989,409,409,295246830,295246831,295246832,-1
"""C#CCCC[C@H](Nc…",1989,409,1012,295246833,295246834,295246835,-1
"""C#CCCC[C@H](Nc…",1989,409,1722,295246836,295246837,295246838,-1
"""C#CCCC[C@H](Nc…",1989,409,1078,295246839,295246840,295246841,-1
"""C#CCCC[C@H](Nc…",1989,409,605,295246842,295246843,295246844,-1
…,…,…,…,…,…,…,…
"""Cn1ncc2cc(Nc3n…",141,1699,307,296921711,296921712,296921713,-1
"""[N-]=[N+]=NCCC…",141,1699,1254,296921714,296921715,296921716,-1
"""COC(=O)c1ccnc(…",141,1415,1390,296921717,296921718,296921719,-1
"""COC1CCC(CCNc2n…",141,1415,1556,296921720,296921721,296921722,-1


In [39]:
# Assign each test molecule to a building-block cluster: a row belongs to a
# cluster when any of its three building blocks (bb1/bb2/bb3) is listed in it.
# Clusters are applied in the dict's iteration order, so a row matching several
# clusters ends up with the last one applied.
for mol_group_id, group_bbs in bb_clusters.items():
    mol_group_id = int(mol_group_id)  # JSON object keys are strings
    in_group = (
        pl.col('bb1').is_in(group_bbs)
        | pl.col('bb2').is_in(group_bbs)
        | pl.col('bb3').is_in(group_bbs)
    )
    test_df = test_df.with_columns(
        pl.when(in_group)
        .then(pl.lit(mol_group_id))
        # Explicit column reference: rows outside this cluster keep their current group.
        .otherwise(pl.col('mol_group'))
        .alias('mol_group')
    )
    # Count as of this iteration; a later cluster could still reassign some of these rows.
    print(mol_group_id, test_df.filter(pl.col('mol_group') == mol_group_id).height)

# Fail loudly if any row still carries the negative placeholder: it matched no
# cluster and would otherwise flow silently into everything derived from mol_group.
n_unassigned = test_df.filter(pl.col('mol_group') < 0).height
assert n_unassigned == 0, f'{n_unassigned} molecules were not assigned to any cluster'

0 369039
1 486390
2 11271
3 11322


In [40]:
# Sanity check: total of the four per-group counts printed by the assignment loop.
sum((369039, 486390, 11271, 11322))

878022

In [1]:
# Share of rows that landed in group 1 (486,390) out of all 878,022 rows.
486_390 / 878_022

0.5539610624790723

In [41]:
# Display the frame to inspect the mol_group values after the assignment loop.
test_df

molecule,bb1,bb2,bb3,id_BRD4,id_HSA,id_sEH,mol_group
str,u16,u16,u16,i64,i64,i64,i32
"""C#CCCC[C@H](Nc…",1989,409,409,295246830,295246831,295246832,2
"""C#CCCC[C@H](Nc…",1989,409,1012,295246833,295246834,295246835,2
"""C#CCCC[C@H](Nc…",1989,409,1722,295246836,295246837,295246838,2
"""C#CCCC[C@H](Nc…",1989,409,1078,295246839,295246840,295246841,2
"""C#CCCC[C@H](Nc…",1989,409,605,295246842,295246843,295246844,2
…,…,…,…,…,…,…,…
"""Cn1ncc2cc(Nc3n…",141,1699,307,296921711,296921712,296921713,0
"""[N-]=[N+]=NCCC…",141,1699,1254,296921714,296921715,296921716,0
"""COC(=O)c1ccnc(…",141,1415,1390,296921717,296921718,296921719,0
"""COC1CCC(CCNc2n…",141,1415,1556,296921720,296921721,296921722,0


In [42]:
# List the distinct mol_group values; a -1 here would mean some rows matched no cluster.
test_df.select(pl.col('mol_group').unique())

mol_group
i32
0
1
2
3


In [43]:
# Derive one group id per (mol_group, protein) pair: mol_group * 3 + offset,
# where the offset is the position of the protein's column in the tuple below
# (BRD4 -> 0, HSA -> 1, sEH -> 2).
test_df = test_df.with_columns(
    [
        (pl.col('mol_group') * 3 + offset).alias(group_col)
        for offset, group_col in enumerate(('group_BRD4', 'group_HSA', 'group_sEH'))
    ]
)
test_df

molecule,bb1,bb2,bb3,id_BRD4,id_HSA,id_sEH,mol_group,group_BRD4,group_HSA,group_sEH
str,u16,u16,u16,i64,i64,i64,i32,i32,i32,i32
"""C#CCCC[C@H](Nc…",1989,409,409,295246830,295246831,295246832,2,6,7,8
"""C#CCCC[C@H](Nc…",1989,409,1012,295246833,295246834,295246835,2,6,7,8
"""C#CCCC[C@H](Nc…",1989,409,1722,295246836,295246837,295246838,2,6,7,8
"""C#CCCC[C@H](Nc…",1989,409,1078,295246839,295246840,295246841,2,6,7,8
"""C#CCCC[C@H](Nc…",1989,409,605,295246842,295246843,295246844,2,6,7,8
…,…,…,…,…,…,…,…,…,…,…
"""Cn1ncc2cc(Nc3n…",141,1699,307,296921711,296921712,296921713,0,0,1,2
"""[N-]=[N+]=NCCC…",141,1699,1254,296921714,296921715,296921716,0,0,1,2
"""COC(=O)c1ccnc(…",141,1415,1390,296921717,296921718,296921719,0,0,1,2
"""COC1CCC(CCNc2n…",141,1415,1556,296921720,296921721,296921722,0,0,1,2


In [46]:
# Show the distinct values of each per-protein group column.
# NOTE(review): unique() appears to be evaluated per column, so a displayed row is
# not necessarily a combination that occurs together in the data - confirm if that matters.
test_df.select(pl.col('group_BRD4', 'group_HSA', 'group_sEH').unique())

group_BRD4,group_HSA,group_sEH
i32,i32,i32
0,1,2
3,4,5
6,7,8
9,10,11


In [47]:
# Persist the current test_df (all columns present at this point) as the v4 test CSV.
test_df.write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv')