In [2]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace,Split,ByteLevel, WhitespaceSplit
from tokenizers.normalizers import Lowercase, NFKC
import os
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
from tqdm import tqdm
import time
import json
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
import gc
from transformers import AutoConfig, AutoTokenizer, AutoModel, DataCollatorWithPadding

# Show how many CPUs the system reports (value appears as the cell output).
multiprocessing.cpu_count()

80

In [4]:
# Build a lazy scan of the training CSV, keep the molecule column plus the
# BRD4 / HSA / sEH columns (cast to UInt8), then collect into a DataFrame.
train_df = (
    pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')
    .select(
        pl.col('molecule'),
        # pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
    )
    .collect()
)
# Report the estimated size of the collected frame in gigabytes.
print(train_df.estimated_size('gb'), 'GB')
train_df

7.1171189760789275 GB


molecule,BRD4,HSA,sEH
str,u8,u8,u8
"""C#CCOc1ccc(CNc…",0,0,0
"""C#CCOc1ccc(CNc…",0,0,0
"""C#CCOc1ccc(CNc…",0,0,0
"""C#CCOc1ccc(CNc…",0,0,0
"""C#CCOc1ccc(CNc…",0,0,0
…,…,…,…
"""[N-]=[N+]=NCCC…",0,0,0
"""[N-]=[N+]=NCCC…",0,0,0
"""[N-]=[N+]=NCCC…",0,0,0
"""[N-]=[N+]=NCCC…",0,0,0


In [5]:
# Build a lazy scan of the test CSV, keep only the molecule column, then
# collect into a DataFrame.
test_df = (
    pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv')
    .select(
        pl.col('molecule'),
        # pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        # pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
    )
    .collect()
)
# Report the estimated size of the collected frame in gigabytes.
print(test_df.estimated_size('gb'), 'GB')
test_df

0.06128192972391844 GB


molecule
str
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
…
"""Cn1ncc2cc(Nc3n…"
"""[N-]=[N+]=NCCC…"
"""COC(=O)c1ccnc(…"
"""COC1CCC(CCNc2n…"


A feature is useless if:

- It is constant in train and/or in test.
- It has low variance? (open question)
- All of its unique values in test lie on the same side (all greater than, or all less than) of its unique values in train (relevant for tree models).

In [26]:
def analyze_constant_features(features):
    """Print the distinct-value summary of every column and list constant ones.

    One line is printed per column with the number of distinct values; when a
    column has at most three distinct values, its value -> count mapping is
    appended to that line. The collected indices are printed at the end.

    Args:
        features: 2-D array supporting ``ndim``, ``shape`` and ``[:, i]``
            column indexing (e.g. a NumPy array).

    Returns:
        list: indices, in ascending order, of the columns that contain
        exactly one distinct value.

    Raises:
        AssertionError: if ``features`` is not 2-dimensional.
    """
    assert features.ndim==2
    _, n_cols = features.shape
    constant_columns = []
    for col_idx in range(n_cols):
        uniques, freqs = np.unique(features[:, col_idx], return_counts=True)
        n_unique = len(uniques)
        if n_unique <= 3:
            # Few enough distinct values to show the full histogram.
            value2count = dict(zip(uniques, freqs))
            print(f'{col_idx}: count={n_unique} {value2count}')
        else:
            print(f'{col_idx}: count={n_unique}')

        if n_unique == 1:
            constant_columns.append(col_idx)
    print('IGNORE COLS:\n', constant_columns)
    return constant_columns


def chunk_analyze_constant_features(features, chunksize = 5_000_000, unpackbits = False):
    """Tally per-column value counts chunk by chunk and list constant columns.

    Rows are processed in consecutive slices of at most ``chunksize`` rows.
    Each slice is optionally expanded with ``np.unpackbits`` along the last
    axis, then the distinct values of every column are counted and added to a
    running total that spans all slices. A marker line is printed after each
    slice and the final list of constant columns is printed at the end.

    Args:
        features: 2-D array supporting ``len``, ``ndim`` and row slicing.
        chunksize: maximum number of rows handled per slice.
        unpackbits: when true, each slice goes through
            ``np.unpackbits(..., axis=-1)`` before counting.

    Returns:
        tuple: ``(ignore_cols, stats)`` where ``stats`` maps each column index
        to a ``{value: total count}`` dict and ``ignore_cols`` lists the
        column indices whose dict has exactly one key.

    Raises:
        AssertionError: if ``features`` is not 2-dimensional.
    """
    assert features.ndim==2

    total_rows = len(features)
    stats = {}
    chunk_idx = 0
    for start in range(0, total_rows, chunksize):
        stop = min(start + chunksize, total_rows)
        block = features[start:stop]
        if unpackbits:
            block = np.unpackbits(block, axis = -1)
        _n_rows, n_cols = block.shape
        for col_idx in tqdm(range(n_cols)):
            uniques, freqs = np.unique(block[:, col_idx], return_counts=True)
            # Running totals for this column, shared across all chunks.
            col_counts = stats.setdefault(col_idx, {})
            for value, freq in zip(uniques, freqs):
                col_counts[value] = col_counts.get(value, 0) + freq

        print(f'---CHUNK {chunk_idx}---')
        chunk_idx += 1

    # A column is constant only if a single value was seen over every chunk.
    ignore_cols = []
    for col_idx, value_counts in stats.items():
        if len(value_counts) == 1:
            ignore_cols.append(col_idx)
    print('IGNORE COLS:\n', ignore_cols)
    return ignore_cols, stats

In [6]:
# Load the saved ECFP6 feature matrix for the training set and show its shape.
# NOTE(review): the 256 columns are presumably bit-packed bytes (the chunked
# analysis in this notebook is run with unpackbits=True) — confirm dtype is uint8.
train_features = np.load('/home/dangnh36/datasets/competitions/leash_belka/processed/features/ecfp6/train.npy')
train_features.shape

(98415610, 256)

In [9]:
# Load the saved ECFP6 feature matrix for the test set and show its shape.
# NOTE(review): the 256 columns are presumably bit-packed bytes (the array is
# passed to np.unpackbits in this notebook) — confirm dtype is uint8.
test_features = np.load('/home/dangnh36/datasets/competitions/leash_belka/processed/features/ecfp6/test.npy')
test_features.shape

(878022, 256)

In [12]:
# Expand each byte into 8 separate 0/1 columns along the last axis.
# NOTE: this rebinds `test_features`, so re-running the cell would unpack the
# already-unpacked array a second time; reload the file before re-running.
test_features = np.unpackbits(test_features, axis = -1)
test_features.shape

(878022, 2048)

In [13]:
# Print the value counts of every test-set column; the returned list of
# constant column indices is shown as the cell output.
analyze_constant_features(test_features)

0: count=2 {0: 857795, 1: 20227}
1: count=2 {0: 629138, 1: 248884}
2: count=2 {0: 833437, 1: 44585}
3: count=2 {0: 859192, 1: 18830}
4: count=2 {0: 870499, 1: 7523}
5: count=2 {0: 858606, 1: 19416}
6: count=2 {0: 871321, 1: 6701}
7: count=2 {0: 860275, 1: 17747}
8: count=2 {0: 849329, 1: 28693}
9: count=2 {0: 858108, 1: 19914}
10: count=2 {0: 859054, 1: 18968}
11: count=2 {0: 853372, 1: 24650}
12: count=2 {0: 862223, 1: 15799}
13: count=2 {0: 809900, 1: 68122}
14: count=2 {0: 814133, 1: 63889}
15: count=2 {0: 865062, 1: 12960}
16: count=2 {0: 855843, 1: 22179}
17: count=2 {0: 870455, 1: 7567}
18: count=2 {0: 861718, 1: 16304}
19: count=2 {0: 865319, 1: 12703}
20: count=2 {0: 867409, 1: 10613}
21: count=2 {0: 866532, 1: 11490}
22: count=2 {0: 868239, 1: 9783}
23: count=2 {0: 848895, 1: 29127}
24: count=2 {0: 860502, 1: 17520}
25: count=2 {0: 863918, 1: 14104}
26: count=2 {0: 864690, 1: 13332}
27: count=2 {0: 852864, 1: 25158}
28: count=2 {0: 867386, 1: 10636}
29: count=2 {0: 795502, 1: 

[650, 807, 1152, 1182, 1380, 1621, 1917, 2007]

In [28]:
# Run the chunked analysis on the training features, 10M rows at a time, with
# each chunk unpacked to bits inside the function.
ignore_cols, stats = chunk_analyze_constant_features(train_features, chunksize = 10_000_000, unpackbits = True)

100%|██████████████████████████████████████| 2048/2048 [12:15<00:00,  2.78it/s]


---CHUNK 0---


100%|██████████████████████████████████████| 2048/2048 [11:05<00:00,  3.08it/s]


---CHUNK 1---


100%|██████████████████████████████████████| 2048/2048 [11:14<00:00,  3.04it/s]


---CHUNK 2---


100%|██████████████████████████████████████| 2048/2048 [10:54<00:00,  3.13it/s]


---CHUNK 3---


100%|██████████████████████████████████████| 2048/2048 [10:59<00:00,  3.11it/s]


---CHUNK 4---


100%|██████████████████████████████████████| 2048/2048 [10:56<00:00,  3.12it/s]


---CHUNK 5---


100%|██████████████████████████████████████| 2048/2048 [11:13<00:00,  3.04it/s]


---CHUNK 6---


100%|██████████████████████████████████████| 2048/2048 [11:19<00:00,  3.01it/s]


---CHUNK 7---


100%|██████████████████████████████████████| 2048/2048 [11:57<00:00,  2.86it/s]


---CHUNK 8---


100%|██████████████████████████████████████| 2048/2048 [09:40<00:00,  3.53it/s]

---CHUNK 9---
IGNORE COLS:
 [39, 378, 397, 450, 650, 807, 1152, 1182, 1184, 1201, 1350, 1380, 1582, 1621, 1855, 1917, 1967, 2007]





In [29]:
# Display the per-column {value: count} totals returned by the chunked analysis.
stats

{0: {0: 94737221, 1: 3678389},
 1: {0: 52052609, 1: 46363001},
 2: {0: 90763612, 1: 7651998},
 3: {0: 94105554, 1: 4310056},
 4: {0: 97571841, 1: 843769},
 5: {0: 97100146, 1: 1315464},
 6: {0: 97349111, 1: 1066499},
 7: {0: 97295216, 1: 1120394},
 8: {0: 94707113, 1: 3708497},
 9: {0: 95776174, 1: 2639436},
 10: {0: 95142831, 1: 3272779},
 11: {0: 96542381, 1: 1873229},
 12: {0: 96633948, 1: 1781662},
 13: {0: 93165017, 1: 5250593},
 14: {0: 89197772, 1: 9217838},
 15: {0: 96619257, 1: 1796353},
 16: {0: 95171431, 1: 3244179},
 17: {0: 97758367, 1: 657243},
 18: {0: 96182294, 1: 2233316},
 19: {0: 96259740, 1: 2155870},
 20: {0: 97474757, 1: 940853},
 21: {0: 96480707, 1: 1934903},
 22: {0: 96230010, 1: 2185600},
 23: {0: 96588162, 1: 1827448},
 24: {0: 96543714, 1: 1871896},
 25: {0: 97475140, 1: 940470},
 26: {0: 97055966, 1: 1359644},
 27: {0: 95936037, 1: 2479573},
 28: {0: 96852101, 1: 1563509},
 29: {0: 88433502, 1: 9982108},
 30: {0: 96231765, 1: 2183845},
 31: {0: 97455280, 1:

In [30]:
# De-duplicate the constant-column indices pasted from the test-set and
# train-set analysis outputs. The resulting list follows set iteration order,
# so it is not sorted.
list(set([650, 807, 1152, 1182, 1380, 1621, 1917, 2007, 39, 378, 397, 450, 650, 807, 1152, 1182, 1184, 1201, 1350, 1380, 1582, 1621, 1855, 1917, 1967, 2007]))

[1152,
 1184,
 450,
 1380,
 1350,
 807,
 39,
 650,
 397,
 1582,
 1967,
 1201,
 1621,
 2007,
 378,
 1917,
 1182,
 1855]

In [None]:
# Column indices to keep: all 2048 bit columns minus the constant ones.
# `set.difference()` called with no arguments just returns a copy of the full
# set, so the indices to drop have to be passed in explicitly.
# NOTE(review): assumes `ignore_cols` from the chunked train analysis is still
# in the kernel namespace; its printed value already contains every index
# reported constant for the test set — confirm this is the intended drop list.
sorted(set(range(2048)).difference(ignore_cols))

In [2]:
# Print the combined constant-column indices in ascending order.
print(sorted([
    1152, 1184, 450, 1380, 1350, 807, 39, 650, 397,
    1582, 1967, 1201, 1621, 2007, 378, 1917, 1182, 1855,
]))

[39, 378, 397, 450, 650, 807, 1152, 1182, 1184, 1201, 1350, 1380, 1582, 1621, 1855, 1917, 1967, 2007]
