In [1]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace,Split,ByteLevel, WhitespaceSplit
from tokenizers.normalizers import Lowercase, NFKC
import os
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
from tqdm import tqdm
import time
import json
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
import gc
from transformers import AutoConfig, AutoTokenizer, AutoModel, DataCollatorWithPadding
import mapply
from collections import Counter
from rdkit import Chem
from rdkit.Chem import AllChem
from functools import partial

# Display how many CPUs this machine reports (rendered as the cell output).
multiprocessing.cpu_count()

80

In [2]:
# Scan the training CSV and collect only the SMILES column ('molecule');
# building-block and target columns are deliberately not selected here.
train_df = (
    pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')
    .select(pl.col('molecule'))
    .collect()
)
# Report the in-memory footprint before displaying the frame.
print(train_df.estimated_size('gb'), 'GB')
train_df

6.842148938216269 GB


molecule
str
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
…
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"


In [3]:
# Scan the test CSV and collect only the SMILES column ('molecule');
# building-block and target columns are deliberately not selected here.
test_df = (
    pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv')
    .select(pl.col('molecule'))
    .collect()
)
# Report the in-memory footprint before displaying the frame.
print(test_df.estimated_size('gb'), 'GB')
test_df

0.06128192972391844 GB


molecule
str
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
…
"""Cn1ncc2cc(Nc3n…"
"""[N-]=[N+]=NCCC…"
"""COC(=O)c1ccnc(…"
"""COC1CCC(CCNc2n…"


In [4]:
# Load the character-level SMILES tokenizer saved on disk.
tokenizer = AutoTokenizer.from_pretrained(
    '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/'
)
# Dump the tokenizer repr, vocabulary size, full vocabulary and special-token map,
# one labelled line each.
print(
    f'TOKENIZER: {tokenizer}',
    f'VOCAB SIZE: {tokenizer.vocab_size}',
    f'VOCAB: {tokenizer.get_vocab()}',
    f'SPECIAL TOKENS: {tokenizer.special_tokens_map}',
    sep='\n',
)

TOKENIZER: PreTrainedTokenizerFast(name_or_path='/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/', vocab_size=44, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False,

In [47]:
from datasets import load_from_disk

# Open the saved Hugging Face training dataset; keep_in_memory=False asks the
# loader not to copy it into RAM.
ds = load_from_disk('/home/dangnh36/datasets/competitions/leash_belka/processed/hf/datasets/train/', keep_in_memory=False)

Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

In [57]:
# Take the 'smiles' column of the first 123456 dataset rows as the comparison sample.
# (Earlier alternative, kept for reference: train_df[:123456, 'molecule'].to_list())
smiles_list = ds[:123456]['smiles']

In [58]:
# Sanity check: number of SMILES strings in the sample (shown as the cell output).
len(smiles_list)

123456

In [59]:
# Tokenize every sampled SMILES string with a literal '[CLS]' prefix, padding the
# batch to its longest sequence and returning PyTorch tensors.
ret = tokenizer(
    [f'[CLS]{smiles}' for smiles in smiles_list],
    # --- input interpretation ---
    add_special_tokens=True,
    is_split_into_words=False,
    # --- padding / truncation ---
    padding='longest',
    pad_to_multiple_of=None,
    truncation=False,
    max_length=2048,  # NOTE(review): presumably has no effect while truncation=False - confirm
    # --- returned fields ---
    return_tensors='pt',
    return_attention_mask=True,
    return_length=True,
    return_token_type_ids=False,
    return_special_tokens_mask=False,
    verbose=True,
)
ret

{'input_ids': tensor([[ 2,  8, 24,  ...,  0,  0,  0],
        [ 2,  8, 24,  ...,  0,  0,  0],
        [ 2,  8, 24,  ...,  0,  0,  0],
        ...,
        [ 2,  8, 24,  ...,  0,  0,  0],
        [ 2,  8, 24,  ...,  0,  0,  0],
        [ 2,  8, 24,  ...,  0,  0,  0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'length': tensor([115, 115, 115,  ..., 115, 115, 115])}

In [60]:
# Memory-map the precomputed feature file and copy only the first 123456 rows
# into a regular in-memory array.
features_v2 = np.array(
    np.load(
        '/home/dangnh36/datasets/competitions/leash_belka/processed/features/smiles_char/train.npy',
        mmap_mode='r',
    )[:123456]
)
print(features_v2.shape)
features_v2

(123456, 139)


array([[ 2, 18,  2, ...,  0,  0,  0],
       [ 2, 18,  2, ...,  0,  0,  0],
       [ 2, 18,  2, ...,  0,  0,  0],
       ...,
       [ 2, 18,  2, ...,  0,  0,  0],
       [ 2, 18,  2, ...,  0,  0,  0],
       [ 2, 18,  2, ...,  0,  0,  0]], dtype=uint8)

In [61]:
# "v1" vocabulary: the token -> id mapping of the loaded Hugging Face tokenizer.
vocab_v1 = tokenizer.get_vocab()
print(vocab_v1)

{'O': 10, '+': 27, '(': 30, '9': 42, 'c': 19, '/': 25, '1': 34, 'C': 8, 'S': 12, '4': 37, '[BOS]': 5, '6': 39, 'H': 11, '[EOS]': 6, '#': 24, '[SEP]': 4, '-': 26, '[': 28, '=': 23, '7': 40, '8': 41, 'B': 15, 'n': 20, '[CLS]': 2, '[Dy]': 43, 'Br': 7, '@': 33, 'N': 9, 'i': 21, 'I': 16, '5': 38, ']': 29, '@@': 32, '[PAD]': 0, ')': 31, '[MASK]': 3, 'Cl': 14, 'F': 13, '.': 22, '2': 35, '3': 36, '[UNK]': 1, 'o': 18, 's': 17}


In [62]:
# "v2" vocabulary: ids are simply the positions in this ordered token list
# ([PAD] must stay first so that padding is id 0, [UNK] last).
vocab_v2 = {
    token: index
    for index, token in enumerate([
        '[PAD]',
        # element symbols
        'Br', 'C', 'N', 'O', 'H', 'S', 'F', 'Cl', 'B', 'I',
        # aromatic / lower-case symbols
        's', 'o', 'c', 'n', 'i',
        # bonds, charges and punctuation
        '.', '=', '#', '/', '-', '+', '[', ']', '(', ')',
        # chirality markers
        '@@', '@',
        # ring-closure digits
        '1', '2', '3', '4', '5', '6', '7', '8', '9',
        # remaining special tokens
        '[Dy]', '[UNK]',
    ])
}
print(vocab_v2)

{'[PAD]': 0, 'Br': 1, 'C': 2, 'N': 3, 'O': 4, 'H': 5, 'S': 6, 'F': 7, 'Cl': 8, 'B': 9, 'I': 10, 's': 11, 'o': 12, 'c': 13, 'n': 14, 'i': 15, '.': 16, '=': 17, '#': 18, '/': 19, '-': 20, '+': 21, '[': 22, ']': 23, '(': 24, ')': 25, '@@': 26, '@': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, '[Dy]': 37, '[UNK]': 38}


In [63]:
# Map each v1 token id to the v2 id of the same token string; tokens that have
# no v2 entry are mapped to the sentinel -999.
v1_to_v2 = {
    old_id: vocab_v2.get(token, -999)
    for token, old_id in vocab_v1.items()
}
v1_to_v2

{10: 4,
 27: 21,
 30: 24,
 42: 36,
 19: 13,
 25: 19,
 34: 28,
 8: 2,
 12: 6,
 37: 31,
 5: -999,
 39: 33,
 11: 5,
 6: -999,
 24: 18,
 4: -999,
 26: 20,
 28: 22,
 23: 17,
 40: 34,
 41: 35,
 15: 9,
 20: 14,
 2: -999,
 43: 37,
 7: 1,
 33: 27,
 9: 3,
 21: 15,
 16: 10,
 38: 32,
 29: 23,
 32: 26,
 0: 0,
 31: 25,
 3: -999,
 14: 8,
 13: 7,
 22: 16,
 35: 29,
 36: 30,
 1: 38,
 18: 12,
 17: 11}

In [65]:
# Print the id translation table on a single line (compact, copy-paste friendly form).
print(v1_to_v2)

{10: 4, 27: 21, 30: 24, 42: 36, 19: 13, 25: 19, 34: 28, 8: 2, 12: 6, 37: 31, 5: -999, 39: 33, 11: 5, 6: -999, 24: 18, 4: -999, 26: 20, 28: 22, 23: 17, 40: 34, 41: 35, 15: 9, 20: 14, 2: -999, 43: 37, 7: 1, 33: 27, 9: 3, 21: 15, 16: 10, 38: 32, 29: 23, 32: 26, 0: 0, 31: 25, 3: -999, 14: 8, 13: 7, 22: 16, 35: 29, 36: 30, 1: 38, 18: 12, 17: 11}


In [64]:
def _to_numpy(values):
    """Return ``values`` as a NumPy array, moving torch tensors to host memory first."""
    if hasattr(values, 'cpu'):
        return values.cpu().numpy()
    return np.asarray(values)


def _build_id_lookup(mapping):
    """Turn an ``{old_id: new_id}`` dict into ``(lut, known)`` arrays indexed by old id."""
    assert all(old_id >= 0 for old_id in mapping), 'token ids in mapping must be non-negative'
    size = max(mapping, default=-1) + 1
    lut = np.zeros(size, dtype=np.int64)
    known = np.zeros(size, dtype=bool)
    for old_id, new_id in mapping.items():
        lut[old_id] = new_id
        known[old_id] = True
    return lut, known


def compare(features_v1, features_v2, pads_v1, mapping=None, cls_id=2):
    """Check that two tokenizations of the same rows agree token by token.

    For every row, the v1 ids (after dropping the leading CLS token) are
    translated through ``mapping``; the non-zero translated ids must equal the
    non-zero ids of the matching v2 row, and the positions flagged by the v1
    attention mask must be exactly the positions of those non-zero ids.

    Args:
        features_v1: Sequence of v1 id rows (torch tensors or array-likes),
            each starting with ``cls_id``.
        features_v2: Sequence of v2 id rows, where 0 means padding.
        pads_v1: Sequence of v1 attention-mask rows aligned with ``features_v1``.
        mapping: ``{v1_id: v2_id}`` dict. Defaults to the notebook-level
            ``v1_to_v2`` table, looked up at call time.
        cls_id: Id expected in the first position of every v1 row.

    Raises:
        AssertionError: on the first inconsistency; the message names the row.
    """
    if mapping is None:
        mapping = v1_to_v2
    lut, known = _build_id_lookup(mapping)

    # zip() would silently stop at the shortest input, hiding missing rows.
    try:
        lengths = (len(features_v1), len(features_v2), len(pads_v1))
    except TypeError:  # unsized iterables: nothing to pre-check
        lengths = None
    if lengths is not None:
        assert len(set(lengths)) == 1, (
            f'inputs differ in row count (features_v1, features_v2, pads_v1): {lengths}'
        )
    total = lengths[0] if lengths is not None else None

    rows = zip(features_v1, features_v2, pads_v1)
    for i, (row1, row2, pad1) in enumerate(tqdm(rows, total=total)):
        row1 = _to_numpy(row1)
        row2 = _to_numpy(row2)
        pad1 = _to_numpy(pad1)

        assert row1[0] == cls_id, f'row {i}: first v1 id is {row1[0]}, expected {cls_id}'

        # Drop the CLS slot from both the ids and the mask.
        tokens = row1[1:].astype(np.int64, copy=False)
        mask = pad1[1:]
        assert mask.size > 0 and mask[0], f'row {i}: first token after CLS is not attended'

        # Vectorised table lookup instead of one dict access per token.
        in_range = (tokens >= 0) & (tokens < lut.size)
        assert in_range.all() and known[tokens].all(), (
            f'row {i}: v1 id missing from mapping'
        )
        mapped = lut[tokens]

        kept1 = np.flatnonzero(mapped)
        kept2 = np.flatnonzero(row2)

        # array_equal tolerates differing lengths (returns False), whereas ``==``
        # on mismatched shapes raises ValueError on recent NumPy.
        assert np.array_equal(mapped[kept1], row2[kept2]), (
            f'row {i}: translated v1 tokens differ from v2 tokens'
        )
        assert np.array_equal(np.flatnonzero(mask), kept1), (
            f'row {i}: v1 attention mask does not match non-padding positions'
        )

# Validate the freshly tokenized batch against the precomputed v2 features.
compare(
    features_v1=ret['input_ids'].long(),
    features_v2=features_v2,
    pads_v1=ret['attention_mask'].bool(),
)

123456it [00:12, 9693.81it/s] 
