In [1]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace,Split,ByteLevel, WhitespaceSplit
from tokenizers.normalizers import Lowercase, NFKC
import os
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
from tqdm import tqdm
import time
import json
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
import gc
from transformers import AutoConfig, AutoTokenizer, AutoModel, DataCollatorWithPadding
import mapply
from collections import Counter
from rdkit import Chem
from rdkit.Chem import AllChem
from functools import partial

multiprocessing.cpu_count()

80

In [2]:
train_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv').select(
        pl.col('molecule'),
#         pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
#         pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
    ).collect()
print(train_df.estimated_size('gb'), 'GB')
train_df

6.842148938216269 GB


molecule
str
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
"""C#CCOc1ccc(CNc…"
…
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"
"""[N-]=[N+]=NCCC…"


In [3]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv').select(
        pl.col('molecule'),
#         pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
        # pl.col('BRD4', 'HSA', 'sEH').cast(pl.UInt8),
    ).collect()
print(test_df.estimated_size('gb'), 'GB')
test_df

0.06128192972391844 GB


molecule
str
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
"""C#CCCC[C@H](Nc…"
…
"""Cn1ncc2cc(Nc3n…"
"""[N-]=[N+]=NCCC…"
"""COC(=O)c1ccnc(…"
"""COC1CCC(CCNc2n…"


In [4]:
smiles_list = test_df.sample(2000)['molecule'].to_list()
len(smiles_list)

2000

In [5]:
def extract_features(func, inputs, save_dir, feature_name, subset, method = 'batch', num_workers = -1, joblib_backend = 'loky', batch_size = None, save_formats = ['npy', 'mmap'], hook = None):
    if method == 'batch':
        assert batch_size is not None and batch_size > 0
        num_samples = len(inputs)
        starts = np.arange(0, num_samples, batch_size)
        ends = [min(start + batch_size, num_samples) for start in starts]
        ret = Parallel(n_jobs=num_workers, backend=joblib_backend)(delayed(func)(inputs[starts[i]:ends[i]]) for i in tqdm(range(len(starts))))
        ret = np.concatenate(ret, axis = 0)
        if hook is not None:
            ret = hook(ret)
        print(f'SHAPE={ret.shape} DTYPE={ret.dtype}')

        for fmt in save_formats:
            save_name = f'{subset}.{fmt}'
            save_path = os.path.join(save_dir, feature_name, save_name)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            print(f'Saving {fmt} at {save_path}')
            
            start = time.time()
            if fmt == 'mmap':
                fp = np.memmap(save_path,
                               dtype=ret.dtype, mode='w+',
                               shape=ret.shape)
                fp[:] = ret[:]
            elif fmt == 'npy':
                np.save(save_path, ret)
            else:
                raise ValueError
            end = time.time()
            print('Take:', end - start, 's')

            meta_json_path = os.path.join(save_dir, feature_name, 'meta.json')
            try:
                with open(meta_json_path, 'r') as f:
                    meta = json.load(f)
            except:
                meta = {}
            with open(meta_json_path, 'w') as f:
                meta[save_name] = {
                    'fmt': fmt,
                    'dtype': str(ret.dtype),
                    'shape': list(ret.shape),
                }
                json.dump(meta, f)
    elif method == 'element':
        pass
    
    
def trim_padding(arr, pad_idx=0):
    num_samples, num_cols = arr.shape
    first_non_zero_col_idx = num_cols - 1
    while not np.any(arr[:, first_non_zero_col_idx] != pad_idx):
        first_non_zero_col_idx -= 1
    assert np.any((arr[:, first_non_zero_col_idx] != pad_idx))
    assert np.all(arr[:, first_non_zero_col_idx + 1] == pad_idx)
    return arr[:, :first_non_zero_col_idx + 1]

# SMILES Char Tokenizer

In [14]:
SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/'

VOCAB = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6,
         'Br': 7, 'C': 8, 'N': 9, 'O': 10, 'H': 11, 'S': 12, 'F': 13, 'Cl': 14, 'B': 15, 'I': 16, 's': 17, 'o': 18, 'c': 19, 'n': 20, 'i': 21, '.': 22, '=': 23, '#': 24, '/': 25, '-': 26, '+': 27, '[': 28, ']': 29, '(': 30, ')': 31, '@@': 32, '@': 33, '1': 34, '2': 35, '3': 36, '4': 37, '5': 38, '6': 39, '7': 40, '8': 41, '9': 42, '[Dy]': 43}
len(VOCAB)

44

In [12]:
tokenizer = Tokenizer(BPE(vocab=VOCAB,
                          merges = [],
                          # merges=[('@','@')],
                          unk_token='[UNK]'))
tokenizer.add_tokens(['@@', 'Br', 'Cl', '[Dy]'])
print(len(tokenizer.get_vocab()))
print(sorted(tokenizer.get_vocab().items(), key = lambda x: x[1]))

44
[('[PAD]', 0), ('[UNK]', 1), ('[CLS]', 2), ('[MASK]', 3), ('[SEP]', 4), ('[BOS]', 5), ('[EOS]', 6), ('Br', 7), ('C', 8), ('N', 9), ('O', 10), ('H', 11), ('S', 12), ('F', 13), ('Cl', 14), ('B', 15), ('I', 16), ('s', 17), ('o', 18), ('c', 19), ('n', 20), ('i', 21), ('.', 22), ('=', 23), ('#', 24), ('/', 25), ('-', 26), ('+', 27), ('[', 28), (']', 29), ('(', 30), (')', 31), ('@@', 32), ('@', 33), ('1', 34), ('2', 35), ('3', 36), ('4', 37), ('5', 38), ('6', 39), ('7', 40), ('8', 41), ('9', 42), ('[Dy]', 43)]


In [None]:
# os.makedirs(os.path.dirname(SAVE_PATH))
# tokenizer.save(SAVE_PATH)

In [13]:
tokenizer = PreTrainedTokenizerFast(
    model_max_length = None,
    padding_side = 'right',
    tokenizer_object = tokenizer,
    bos_token = '[BOS]',
    eos_token = '[EOS]',
    unk_token='[UNK]',
    sep_token = '[SEP]',
    pad_token = '[PAD]',
    cls_token = '[CLS]',
    mask_token = '[MASK]'
)
print(tokenizer.vocab)

{'/': 25, ']': 29, '[CLS]': 2, 's': 17, '4': 37, 'B': 15, 'o': 18, '+': 27, 'Br': 7, 'F': 13, '=': 23, '[': 28, '2': 35, 'Cl': 14, '[Dy]': 43, 'N': 9, '#': 24, ')': 31, '5': 38, '[UNK]': 1, 'C': 8, 'O': 10, '@@': 32, '@': 33, '7': 40, '[MASK]': 3, '[EOS]': 6, 'H': 11, '1': 34, '3': 36, 'c': 19, '[PAD]': 0, '[SEP]': 4, '[BOS]': 5, '-': 26, 'I': 16, '.': 22, 'n': 20, '6': 39, 'S': 12, 'i': 21, '8': 41, '9': 42, '(': 30}


In [15]:
tokenizer.save_pretrained(SAVE_DIR)

('/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/tokenizer_config.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/special_tokens_map.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/tokenizer.json')

In [16]:
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
print('TOKENIZER:', tokenizer)
print('VOCAB SIZE:', tokenizer.vocab_size)
print('VOCAB:', tokenizer.get_vocab())
print('SPECIAL TOKENS:', tokenizer.special_tokens_map)

TOKENIZER: PreTrainedTokenizerFast(name_or_path='/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/smiles_char/', vocab_size=44, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False,

In [17]:
# test
for smiles in smiles_list:
    smiles = f'[CLS][BOS][SEP]{smiles}[SEP][EOS][UNK][MASK]'
    if not 'Cl' in smiles:
        continue
    if not 'Br' in smiles:
        continue
        
    decoded = tokenizer.decode(tokenizer.encode(smiles))
    if 1:
        print(smiles)
        print(decoded)
        print('\n')

[CLS][BOS][SEP]O=C(N[Dy])c1nc(Cl)cc(Nc2nc(Nc3ccc(O)cc3C(F)(F)F)nc(Nc3nc(=O)[nH]cc3Br)n2)c1Cl[SEP][EOS][UNK][MASK]
[CLS] [BOS] [SEP] O = C ( N [Dy] ) c 1 n c ( Cl ) c c ( N c 2 n c ( N c 3 c c c ( O ) c c 3 C ( F ) ( F ) F ) n c ( N c 3 n c ( = O ) [ n H ] c c 3 Br ) n 2 ) c 1 Cl [SEP] [EOS] [UNK] [MASK]


[CLS][BOS][SEP]O=C(N[Dy])c1nc(Cl)cc(Nc2nc(Nc3ccc4nccn4c3)nc(Nc3ccc4nccnc4c3Br)n2)c1Cl[SEP][EOS][UNK][MASK]
[CLS] [BOS] [SEP] O = C ( N [Dy] ) c 1 n c ( Cl ) c c ( N c 2 n c ( N c 3 c c c 4 n c c n 4 c 3 ) n c ( N c 3 c c c 4 n c c n c 4 c 3 Br ) n 2 ) c 1 Cl [SEP] [EOS] [UNK] [MASK]


[CLS][BOS][SEP]O=C(N[Dy])[C@H](Cc1ccccc1F)Nc1nc(Nc2cc(F)c(F)cc2Br)nc(Nc2ccc(Cl)nc2Cl)n1[SEP][EOS][UNK][MASK]
[CLS] [BOS] [SEP] O = C ( N [Dy] ) [ C @ H ] ( C c 1 c c c c c 1 F ) N c 1 n c ( N c 2 c c ( F ) c ( F ) c c 2 Br ) n c ( N c 2 c c c ( Cl ) n c 2 Cl ) n 1 [SEP] [EOS] [UNK] [MASK]


[CLS][BOS][SEP]C=C(C)COCCNc1nc(Nc2nc(NCc3ccc(F)cc3)ccc2[N+](=O)[O-])nc(Nc2c(C(=O)N[Dy])cc(Cl)c(Br)c2F)n1[SEP][EOS][

# AIS Tokenizer

In [18]:
import atomInSmiles

In [19]:
SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/ais_train/'

In [20]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/features/ais/train_counter.json', 'r') as f:
    train_counter = json.load(f)
    print(len(train_counter))

with open('/home/dangnh36/datasets/competitions/leash_belka/processed/features/ais/test_counter.json', 'r') as f:
    test_counter = json.load(f)
    print(len(test_counter))

208
222


In [21]:
test_counter = Counter(test_counter)
test_counter.most_common()

[('(', 6532149),
 (')', 6532149),
 ('[cH;R;CC]', 5568170),
 ('[c;R;CCC]', 2554097),
 ('1', 2478596),
 ('2', 2464676),
 ('[n;R;CC]', 2135243),
 ('3', 2097610),
 ('=', 2014722),
 ('[O;!R;C]', 1687485),
 ('[CH2;R;CC]', 1663878),
 ('[C;!R;CNO]', 1470627),
 ('[c;R;CCN]', 1323266),
 ('[NH;!R;CC]', 1311475),
 ('[CH2;R;CN]', 1299482),
 ('[c;R;NNN]', 1212854),
 ('4', 1061140),
 ('[cH;R;CN]', 1010361),
 ('[NH;!R;CDy]', 878022),
 ('[Dy;!R;N]', 878022),
 ('[CH3;!R;C]', 850528),
 ('[F;!R;C]', 799229),
 ('[N;R;CCC]', 737228),
 ('-', 600945),
 ('[CH2;!R;CN]', 572601),
 ('[CH2;!R;CC]', 492951),
 ('[c;R;CNN]', 458964),
 ('[c;R;CCO]', 449185),
 ('[O;!R;CC]', 443575),
 ('5', 363928),
 ('[CH2;R;CO]', 323436),
 ('[n;R;CN]', 321708),
 ('[c;R;CCF]', 292119),
 ('[CH;R;CCC]', 261400),
 ('[O;R;CC]', 254098),
 ('[CH3;!R;N]', 207532),
 ('[CH3;!R;O]', 201802),
 ('[c;R;CCS]', 195090),
 ('[s;R;CC]', 183072),
 ('[Cl;!R;C]', 175542),
 ('[n;R;CCN]', 174961),
 ('[[C@@H];R;CCN]', 172535),
 ('[CH2;!R;CO]', 168762),
 ('[O;

In [22]:
train_counter = Counter(train_counter)
train_counter.most_common()

[('(', 724464228),
 (')', 724464228),
 ('[cH;R;CC]', 557940715),
 ('[n;R;CC]', 399809759),
 ('[c;R;NNN]', 301983031),
 ('2', 299117572),
 ('[NH;!R;CC]', 293918352),
 ('1', 269640426),
 ('3', 256174720),
 ('[c;R;CCC]', 189955444),
 ('=', 184898262),
 ('[c;R;CCN]', 171726338),
 ('[O;!R;C]', 149345761),
 ('[CH2;R;CC]', 140023654),
 ('[CH2;!R;CN]', 120080014),
 ('[C;!R;CNO]', 109895961),
 ('[NH;!R;CDy]', 98415610),
 ('[Dy;!R;N]', 98415610),
 ('[cH;R;CN]', 94036837),
 ('[CH3;!R;C]', 83331324),
 ('[F;!R;C]', 81837675),
 ('4', 76736360),
 ('[CH2;!R;CC]', 73357132),
 ('[CH2;R;CN]', 51502355),
 ('[O;!R;CC]', 51189135),
 ('[c;R;CNN]', 46817969),
 ('[Cl;!R;C]', 46182354),
 ('[c;R;CCF]', 39540214),
 ('[c;R;CCO]', 38649511),
 ('[CH3;!R;O]', 33336407),
 ('[CH2;R;CO]', 32578891),
 ('[N;R;CCC]', 32317315),
 ('[n;R;CN]', 30138788),
 ('[c;R;CCCl]', 29474875),
 ('[s;R;CC]', 24648580),
 ('[CH;R;CCC]', 24596493),
 ('[O;R;CC]', 23425739),
 ('[Br;!R;C]', 20038508),
 ('[c;R;CCS]', 19418285),
 ('[CH2;!R;CO]', 

In [23]:
vocab = {
    '[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6,
}
vocab.update({token: i + 7 for i, (token, count) in enumerate(train_counter.most_common())})
print(len(vocab))
print(vocab)

215
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6, '(': 7, ')': 8, '[cH;R;CC]': 9, '[n;R;CC]': 10, '[c;R;NNN]': 11, '2': 12, '[NH;!R;CC]': 13, '1': 14, '3': 15, '[c;R;CCC]': 16, '=': 17, '[c;R;CCN]': 18, '[O;!R;C]': 19, '[CH2;R;CC]': 20, '[CH2;!R;CN]': 21, '[C;!R;CNO]': 22, '[NH;!R;CDy]': 23, '[Dy;!R;N]': 24, '[cH;R;CN]': 25, '[CH3;!R;C]': 26, '[F;!R;C]': 27, '4': 28, '[CH2;!R;CC]': 29, '[CH2;R;CN]': 30, '[O;!R;CC]': 31, '[c;R;CNN]': 32, '[Cl;!R;C]': 33, '[c;R;CCF]': 34, '[c;R;CCO]': 35, '[CH3;!R;O]': 36, '[CH2;R;CO]': 37, '[N;R;CCC]': 38, '[n;R;CN]': 39, '[c;R;CCCl]': 40, '[s;R;CC]': 41, '[CH;R;CCC]': 42, '[O;R;CC]': 43, '[Br;!R;C]': 44, '[c;R;CCS]': 45, '[CH2;!R;CO]': 46, '[[C@@H];!R;CCN]': 47, '[c;R;BrCC]': 48, '[CH3;!R;N]': 49, '[OH;!R;C]': 50, '[[C@H];!R;CCN]': 51, '[c;R;CNO]': 52, '[O;!R;S]': 53, '[C;!R;COO]': 54, '[cH;R;CS]': 55, '[c;R;CClN]': 56, '#': 57, '-': 58, '[nH;R;CC]': 59, '[n;R;CCN]': 60, '[[O-];!R;N]': 61, '[c;R;NNS]': 62, '[[N+]

In [24]:
tokenizer = Tokenizer(WordLevel(vocab=vocab,
                          unk_token='[UNK]'
                        ))
tokenizer.pre_tokenizer = WhitespaceSplit()

tokenizer = PreTrainedTokenizerFast(
    model_max_length = None,
    padding_side = 'right',
    tokenizer_object = tokenizer,
    bos_token = '[BOS]',
    eos_token = '[EOS]',
    unk_token='[UNK]',
    sep_token = '[SEP]',
    pad_token = '[PAD]',
    cls_token = '[CLS]',
    mask_token = '[MASK]'
)
tokenizer.save_pretrained(SAVE_DIR)

('/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/ais_train/tokenizer_config.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/ais_train/special_tokens_map.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/ais_train/tokenizer.json')

In [25]:
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
print('TOKENIZER:', tokenizer)
print('VOCAB SIZE:', tokenizer.vocab_size)
print('VOCAB:', tokenizer.get_vocab())
print('SPECIAL TOKENS:', tokenizer.special_tokens_map)
print('PRETOKENIZER:', tokenizer._tokenizer.pre_tokenizer)

TOKENIZER: PreTrainedTokenizerFast(name_or_path='/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/ais_train/', vocab_size=215, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, 

In [30]:
# test
for smiles in tqdm(test_df[:1000, 'molecule']):
#     if not 'Cl' in smiles:
#         continue
#     if not 'Br' in smiles:
#         continue
    ais = atomInSmiles.encode(smiles)
    ais = f'[CLS][BOS][SEP]{ais} [EOS]'
    decoded = tokenizer.decode(tokenizer.encode(ais))
    if decoded != ais:
        print(ais, decoded, sep = '\n-->')
    if 0:
        print(smiles)
        print('-->', ais)
        print('-->', decoded)
        print('\n')

  7%|██▉                                    | 74/1000 [00:00<00:02, 361.45it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH;!R;CC] = [CH2;!R;C] ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH;!R;CC] = [CH2;!R;C] ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH;!R;CC] = [CH2;!R;C] ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH;!R;CC] = [CH2;!R;C] ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN

 11%|████▏                                 | 111/1000 [00:00<00:02, 337.11it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCN] ( [N;R;CCC] 3 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [c;R;CCO] ( [O;!R;CC] [CH3;!R;O] ) [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCN] ( [N;R;CCC] 3 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [c;R;CCO] ( [O;!R;CC] [CH3;!R;O] ) [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!

 18%|██████▋                               | 177/1000 [00:00<00:02, 302.44it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [C;R;CCCC] 2 ( [CH2;!R;CO] [O;!R;CC] [CH3;!R;O] ) [CH2;R;CC] [CH2;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [cH;R;CC] [c;R;CClN] ( [Cl;!R;C] ) [n;R;CC] [c;R;NNS] ( [S;!R;CC] [CH3;!R;S] ) [n;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [C;R;CCCC] 2 ( [CH2;!R;CO] [O;!R;CC] [CH3;!R;O] ) [CH2;R;CC] [CH2;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [cH;R;CC] [c;R;CClN] ( [Cl;!R;C] ) [n;R;CC] [c;R;NNS] ( [S;!R;CC] [CH3;!R;S] ) [n;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;C

 24%|█████████                             | 238/1000 [00:00<00:02, 286.89it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCO] ( [O;!R;CC] [CH2;!R;CO] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CC] [c;R;CCN] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCO] ( [O;!R;CC] [CH2;!R;CO] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CC] [c;R;CCN] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCO] ( [O;!R;CC] [CH2;!R;CO] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CC] [c;R;CCN] 3 [c

 30%|███████████▎                          | 299/1000 [00:00<00:02, 291.85it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CN] [n;R;CCC] 3 [cH;R;CN] [cH;R;CN] [n;R;CC] [c;R;CNN] 3 [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [n;R;CC] [cH;R;NN] [n;R;CC] [c;R;CNN] 3 [c;R;CCN] 2 [n;R;CC] [cH;R;NN] [n;R;CCC] 3 [CH2;!R;CN] [CH;!R;CCO] ( [CH3;!R;C] ) [OH;!R;C] ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CN] [n;R;CCC] 3 [cH;R;CN] [cH;R;CN] [n;R;CC] [c;R;CNN] 3 [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [n;R;CC] [cH;R;NN] [n;R;CC] [c;R;CNN] 3 [c;R;CCN] 2 [n;R;CC] [cH;R;NN] [n;R;CCC] 3 [CH2;!R;CN] [CH;!R;CCO] ( [CH3;!R;C] ) [OH;!R;C] ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N

 36%|█████████████▌                        | 358/1000 [00:01<00:02, 276.13it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CNO] 2 [n;R;CN] [n;R;CN] [c;R;NNO] ( [N;R;CCC] 3 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 3 ) [o;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCN] 3 [nH;R;CC] [cH;R;NN] [n;R;CC] [c;R;CCN] 3 [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CNO] 2 [n;R;CN] [n;R;CN] [c;R;NNO] ( [N;R;CCC] 3 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 3 ) [o;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCN] 3 [nH;R;CC] [cH;R;NN] [n;R;CC] [c;R;CCN] 3 [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;

 41%|███████████████▋                      | 414/1000 [00:01<00:02, 268.15it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCS] 2 [cH;R;CC] [cH;R;CC] [c;R;CSS] ( [S;!R;CNOO] ( [NH2;!R;S] ) ( = [O;!R;S] ) = [O;!R;S] ) [s;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CNO] 2 [n;R;CN] [n;R;CN] [c;R;NOS] ( [SH;!R;C] ) [o;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCS] 2 [cH;R;CC] [cH;R;CC] [UNK] ( [S;!R;CNOO] ( [NH2;!R;S] ) ( = [O;!R;S] ) = [O;!R;S] ) [s;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CNO] 2 [n;R;CN] [n;R;CN] [c;R;NOS] ( [SH;!R;C] ) [o;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];

 47%|█████████████████▊                    | 469/1000 [00:01<00:01, 267.30it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CS] [s;R;CC] [c;R;CCS] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] 2 3 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [c;R;BrCC] ( [Br;!R;C] ) [cH;R;CC] [cH;R;CC] [c;R;CCO] 2 [O;!R;CC] [CH;!R;FFO] ( [F;!R;C] ) [F;!R;C] ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CS] [s;R;CC] [c;R;CCS] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] 2 3 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [c;R;BrCC] ( [Br;!R;C] ) [cH;R;CC] [cH;R;CC] [c;R;CCO] 2 [O;!R;CC] [CH;!R;FFO] ( [F;!R;C] ) [F;!R;C] ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [D

 52%|███████████████████▊                  | 523/1000 [00:01<00:01, 257.09it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CN] [n;R;CC] [cH;R;CN] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] [c;R;CCN] 2 [N;R;CCC] 2 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CN] [n;R;CC] [cH;R;CN] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] [c;R;CCN] 2 [N;R;CCC] 2 [CH2;R;CN] [CH2;R;CC] [CH2;R;CC] [CH2;R;CN] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [

 58%|██████████████████████                | 580/1000 [00:02<00:01, 268.07it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCO] 2 ( [OH;!R;C] ) [CH2;R;CC] [CH2;R;CC] [CH2;R;CC] [C;R;CCCC] 2 ( [CH3;!R;C] ) [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [c;R;CCF] ( [F;!R;C] ) [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCO] 2 ( [OH;!R;C] ) [CH2;R;CC] [CH2;R;CC] [CH2;R;CC] [C;R;CCCC] 2 ( [CH3;!R;C] ) [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [c;R;CCF] ( [F;!R;C] ) [cH;R;CC] 2 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R

 64%|████████████████████████▏             | 637/1000 [00:02<00:01, 273.50it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCO] ( [O;!R;CC] [CH2;!R;CO] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CC] [c;R;CCN] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] 3 [c;R;CCC] 2 [C;R;CNO] ( = [O;!R;C] ) [NH;R;CC] [C;R;CNO] 3 = [O;!R;C] ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CC] [c;R;CCC] 2 [cH;R;CC] [cH;R;CC] [c;R;CCO] ( [O;!R;CC] [CH2;!R;CO] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CC] [c;R;CCN] 3 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] 3 ) [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] [c;R;N

 70%|██████████████████████████▍           | 696/1000 [00:02<00:01, 279.43it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;NNS] 2 [n;R;CC] [c;R;CCN] ( - [c;R;CCC] 3 [cH;R;CC] [c;R;CCCl] ( [Cl;!R;C] ) [cH;R;CC] [cH;R;CC] [c;R;CCCl] 3 [Cl;!R;C] ) [cH;R;CS] [s;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] 3 [cH;R;CN] [n;R;CN] [nH;R;CN] [c;R;CCN] 2 3 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [CH2;!R;CC] [[C@H];!R;CCN] ( [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;NNS] 2 [n;R;CC] [c;R;CCN] ( - [c;R;CCC] 3 [cH;R;CC] [c;R;CCCl] ( [Cl;!R;C] ) [cH;R;CC] [cH;R;CC] [c;R;CCCl] 3 [Cl;!R;C] ) [cH;R;CS] [s;R;CC] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [cH;R;CC] [c;R;CCC] 3 [cH;R;CN] [n;R;CN] [nH;R;CN] [c;R;CCN] 2 3 ) [n;R;CC] 1 ) [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] [EOS]
[CLS][BO

 75%|████████████████████████████▌         | 752/1000 [00:02<00:00, 274.94it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CN] [N;R;CCC] 2 [CH2;R;CN] [CH2;R;CC] [C;R;CCCF] ( [CH3;!R;C] ) ( [F;!R;C] ) [CH2;R;CN] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [n;R;CC] [c;R;NNS] ( [S;!R;CC] [CH3;!R;S] ) [n;R;CC] [cH;R;CN] [c;R;CCC] 2 [C;!R;COO] ( = [O;!R;C] ) [O;!R;CC] [CH2;!R;CO] [CH3;!R;C] ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CN] [N;R;CCC] 2 [CH2;R;CN] [CH2;R;CC] [C;R;CCCF] ( [CH3;!R;C] ) ( [F;!R;C] ) [CH2;R;CN] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [n;R;CC] [c;R;NNS] ( [S;!R;CC] [CH3;!R;S] ) [n;R;CC] [cH;R;CN] [c;R;CCC] 2 [C;!R;COO] ( = [O;!R;C] ) [O;!R;CC] [CH2;!R;CO] [CH3;!R;C] ) [n;R;CC] 1 [EOS]
[CLS][BO

 81%|██████████████████████████████▊       | 811/1000 [00:02<00:00, 281.04it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCC] 2 3 [CH2;R;CC] [CH;R;CC] = [CH;R;CC] [CH;R;CCO] ( [CH2;R;CC] 2 ) [O;R;CC] [CH2;R;CO] 3 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCO] 2 ( [OH;!R;C] ) [CH2;R;CC] [CH2;R;CC] [CH;R;CCS] 2 [S;!R;CC] [CH2;!R;CS] [CH3;!R;C] ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCC] 2 3 [CH2;R;CC] [CH;R;CC] = [CH;R;CC] [CH;R;CCO] ( [CH2;R;CC] 2 ) [O;R;CC] [CH2;R;CO] 3 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [C;R;CCCO] 2 ( [OH;!R;C] ) [CH2;R;CC] [CH2;R;CC] [CH;R;CCS] 2 [S;!R;CC] [CH2;!R;CS] [CH3;!R;C] ) [n;R;CC] 1 [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;C

 87%|█████████████████████████████████     | 870/1000 [00:03<00:00, 276.38it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCS] 2 [s;R;CN] [n;R;NS] [n;R;CN] [c;R;CCN] 2 [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [cH;R;CC] [c;R;CCC] ( [C;!R;COO] ( = [O;!R;C] ) [O;!R;CC] [CH3;!R;O] ) [cH;R;CC] [cH;R;CN] [n;R;CC] 2 ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCS] 2 [s;R;CN] [n;R;NS] [n;R;CN] [c;R;CCN] 2 [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [cH;R;CC] [c;R;CCC] ( [C;!R;COO] ( = [O;!R;C] ) [O;!R;CC] [CH3;!R;O] ) [cH;R;CC] [cH;R;CN] [n;R;CC] 2 ) [n;R;CC] 1 [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R

 93%|███████████████████████████████████▏  | 927/1000 [00:03<00:00, 277.91it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCS] 2 [s;R;CN] [n;R;NS] [n;R;CN] [c;R;CCN] 2 [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [cH;R;CC] [c;R;CCO] 2 [O;!R;CC] [CH3;!R;O] ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCS] 2 [s;R;CN] [n;R;NS] [n;R;CN] [c;R;CCN] 2 [CH3;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;CCF] ( [F;!R;C] ) [cH;R;CC] [c;R;CCO] 2 [O;!R;CC] [CH3;!R;O] ) [n;R;CC] 1 [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH

 98%|█████████████████████████████████████▍| 985/1000 [00:03<00:00, 281.61it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CN] [NH;!R;CS] [S;!R;CNOO] ( = [O;!R;S] ) ( = [O;!R;S] ) [c;R;CCS] 2 [cH;R;CC] [cH;R;CN] [n;R;CCC] ( [CH3;!R;N] ) [cH;R;CN] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;BrCN] ( [Br;!R;C] ) [n;R;CC] [c;R;CNO] 2 [O;!R;CC] [CH3;!R;O] ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [CH2;!R;CN] [NH;!R;CS] [S;!R;CNOO] ( = [O;!R;S] ) ( = [O;!R;S] ) [c;R;CCS] 2 [cH;R;CC] [cH;R;CN] [n;R;CCC] ( [CH3;!R;N] ) [cH;R;CN] 2 ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [c;R;BrCN] ( [Br;!R;C] ) [n;R;CC] [c;R;CNO] 2 [O;!R;CC] [CH3;!R;O] ) [n;R;CC] 1 [EOS]
[CLS][BOS][SEP][CH;!R;C] # [C;

100%|█████████████████████████████████████| 1000/1000 [00:03<00:00, 281.10it/s]

[CLS][BOS][SEP][CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] [c;R;CCN] ( [CH3;!R;C] ) [nH;R;CC] [c;R;CNO] 2 = [O;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [c;R;CCC] ( [C;!R;CN] # [N;!R;C] ) [cH;R;CN] [n;R;CN] [n;R;CCN] 2 - [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] 1 [EOS]
-->[CLS] [BOS] [SEP] [CH;!R;C] # [C;!R;CC] [CH2;!R;CC] [[C@@H];!R;CCN] ( [CH2;!R;CC] [C;!R;CNO] ( = [O;!R;C] ) [NH;!R;CDy] [Dy;!R;N] ) [NH;!R;CC] [c;R;NNN] 1 [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [CH2;!R;CN] [c;R;CCC] 2 [c;R;CCC] ( [CH3;!R;C] ) [cH;R;CC] [c;R;CCN] ( [CH3;!R;C] ) [nH;R;CC] [c;R;CNO] 2 = [O;!R;C] ) [n;R;CC] [c;R;NNN] ( [NH;!R;CC] [c;R;CNN] 2 [c;R;CCC] ( [C;!R;CN] # [N;!R;C] ) [cH;R;CN] [n;R;CN] [n;R;CCN] 2 - [c;R;CCN] 2 [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] [cH;R;CC] 2 ) [n;R;CC] 1 [EOS




# SELFIES Tokenizer

In [31]:
import selfies

In [33]:
SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/selfies/'

In [34]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/features/selfies/train_counter.json', 'r') as f:
    counter = json.load(f)
    print(len(counter))

40


In [35]:
counter = Counter(counter)
print(counter.most_common())

[('[C]', 2157032471), ('[=C]', 669993382), ('[Ring1]', 564086306), ('[N]', 544912723), ('[Branch1]', 498434052), ('[=N]', 431841232), ('[Ring2]', 279529692), ('[=Branch1]', 261913290), ('[Branch2]', 258020307), ('[O]', 169883795), ('[=O]', 154931879), ('[#Branch1]', 109554744), ('[Dy]', 98415610), ('[F]', 81837675), ('[S]', 74124100), ('[=Branch2]', 62190215), ('[#Branch2]', 58490781), ('[Cl]', 46182354), ('[#C]', 38702239), ('[P]', 30102869), ('[C@@H1]', 28119629), ('[C@H1]', 26992387), ('[NH1]', 20969646), ('[Br]', 19802680), ('[=Ring1]', 15929836), ('[N+1]', 10504678), ('[O-1]', 10504678), ('[#N]', 7278625), ('[I]', 3995499), ('[=Ring2]', 3255416), ('[/C]', 1688688), ('[C@@]', 724607), ('[=N+1]', 363032), ('[=N-1]', 261741), ('[C@]', 237538), ('[/Br]', 235828), ('[B]', 235703), ('[Si]', 235671), ('[=S]', 187322), ('[N-1]', 101291)]


In [36]:
vocab = {
    '[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6,
}
vocab.update({token: i + 7 for i, (token, count) in enumerate(counter.most_common())})
print(len(vocab))
print(vocab)

47
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6, '[C]': 7, '[=C]': 8, '[Ring1]': 9, '[N]': 10, '[Branch1]': 11, '[=N]': 12, '[Ring2]': 13, '[=Branch1]': 14, '[Branch2]': 15, '[O]': 16, '[=O]': 17, '[#Branch1]': 18, '[Dy]': 19, '[F]': 20, '[S]': 21, '[=Branch2]': 22, '[#Branch2]': 23, '[Cl]': 24, '[#C]': 25, '[P]': 26, '[C@@H1]': 27, '[C@H1]': 28, '[NH1]': 29, '[Br]': 30, '[=Ring1]': 31, '[N+1]': 32, '[O-1]': 33, '[#N]': 34, '[I]': 35, '[=Ring2]': 36, '[/C]': 37, '[C@@]': 38, '[=N+1]': 39, '[=N-1]': 40, '[C@]': 41, '[/Br]': 42, '[B]': 43, '[Si]': 44, '[=S]': 45, '[N-1]': 46}


In [37]:
tokenizer = Tokenizer(WordLevel(vocab=vocab,
                          unk_token='[UNK]'
                        ))
tokenizer.pre_tokenizer = WhitespaceSplit()

tokenizer = PreTrainedTokenizerFast(
    model_max_length = None,
    padding_side = 'right',
    tokenizer_object = tokenizer,
    bos_token = '[BOS]',
    eos_token = '[EOS]',
    unk_token='[UNK]',
    sep_token = '[SEP]',
    pad_token = '[PAD]',
    cls_token = '[CLS]',
    mask_token = '[MASK]'
)
tokenizer.save_pretrained(SAVE_DIR)

('/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/selfies/tokenizer_config.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/selfies/special_tokens_map.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/selfies/tokenizer.json')

In [38]:
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
print('TOKENIZER:', tokenizer)
print('VOCAB SIZE:', tokenizer.vocab_size)
print('VOCAB:', tokenizer.get_vocab())
print('SPECIAL TOKENS:', tokenizer.special_tokens_map)
print('PRETOKENIZER:', tokenizer._tokenizer.pre_tokenizer)

TOKENIZER: PreTrainedTokenizerFast(name_or_path='/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/selfies/', vocab_size=47, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, spe

In [39]:
tokenizer.unk_token_id, tokenizer.pad_token_id

(1, 0)

In [40]:
# test
for smiles in tqdm(test_df[:1000, 'molecule']):
#     if not 'Cl' in smiles:
#         continue
#     if not 'Br' in smiles:
#         continue
    s = ' '.join(list(selfies.split_selfies(selfies.encoder(smiles))))
    s = f'[CLS][MASK][UNK][BOS]{s}[EOS]'
    decoded = tokenizer.decode(tokenizer.encode(s))
    if decoded != s:
        print(s, decoded, sep = '\n-->')
    if 1:
        print(smiles)
        print('-->', decoded)
        print('\n')

  6%|██▏                                    | 56/1000 [00:00<00:01, 558.52it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [=C] [C] [=C] [Branch1] [Ring1] [C] [=C] [C] [=C] [Ring1] [Branch2] [=N] [C] [Branch1] [=C] [N] [C] [=C] [C] [=C] [Branch1] [Ring1] [C] [=C] [C] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [Branch2] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [=C] [C] [=C] [Branch1] [Ring1] [C] [=C] [C] [=C] [Ring1] [Branch2] [=N] [C] [Branch1] [=C] [N] [C] [=C] [C] [=C] [Branch1] [Ring1] [C] [=C] [C] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [Branch2] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [=C] [C] [=C] [Branch1] [Ring1] [C] [=C] [C] [=C] [Ring1] [Branch2] [=N] [C] [Branch1] [=C]

 17%|██████▌                               | 172/1000 [00:00<00:01, 543.53it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Branch1] [Branch2] [N] [C] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [=Branch1] [C] [=O] [N] [C] [Ring1] [=Branch1] [=O] [=N] [C] [Branch2] [Ring1] [=Branch1] [N] [C] [=C] [Branch1] [C] [C] [C] [=C] [Branch1] [=Branch1] [N+1] [=Branch1] [C] [=O] [O-1] [C] [=C] [Ring1] [#Branch2] [O] [C] [=N] [Ring2] [Ring1] [#C] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Branch1] [Branch2] [N] [C] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [C] [=Branch1] [C] [=O] [N] [C] [Ring1] [=Branch1] [=O] [=N] [C] [Branch2] [Ring1] [=Branch1] [N] [C] [=C] [Branch1] [C] [C] [C] [=C] [Branch1] [=Branch1] [N+1] [=Branch1] [C] [=O] [O-1] [C] [=C] [Ring1] [#Branch2] [O] [C] [=N] [Ring2] [Ring1] [#C] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(Nc2cccc3c2C(=O)NC3=O)nc(Nc2c(C)cc([N+](=O)[O-])cc2OC)n1)

 28%|██████████▋                           | 281/1000 [00:00<00:01, 527.02it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [Branch1] [C] [O] [C] [C] [C] [C] [Ring1] [=Branch1] [Branch1] [C] [C] [C] [=N] [C] [Branch2] [Ring1] [C] [N] [C] [=Branch1] [C] [=N] [C] [=C] [C] [=C] [C] [Branch1] [Ring1] [C] [N] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [O] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [Branch1] [C] [O] [C] [C] [C] [C] [Ring1] [=Branch1] [Branch1] [C] [C] [C] [=N] [C] [Branch2] [Ring1] [C] [N] [C] [=Branch1] [C] [=N] [C] [=C] [C] [=C] [C] [Branch1] [Ring1] [C] [N] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [O] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(NC(=N)c2cccc(CN)c2)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [Branch1] [C] [O] [C] [C] [C] [C] [

 40%|███████████████                       | 395/1000 [00:00<00:01, 549.50it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=C] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [O] [C] [C] [C] [S] [C] [C] [Ring1] [=Branch1] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=N] [N] [=C] [Branch1] [Branch2] [N] [C] [C] [C] [C] [Ring1] [Branch1] [O] [Ring1] [#Branch2] [=N] [Ring2] [Ring1] [N] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=C] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [O] [C] [C] [C] [S] [C] [C] [Ring1] [=Branch1] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=N] [N] [=C] [Branch1] [Branch2] [N] [C] [C] [C] [C] [Ring1] [Branch1] [O] [Ring1] [#Branch2] [=N] [Ring2] [Ring1] [N] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(NCCOC2CCSCC2)nc(NCc2nnc(N3CCCC3)o2)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [=C] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [O] [C] [C] [C] [S] [C] [C] [Ring1] [=Branch1] [=N] [C] [Branch2]

 51%|███████████████████▎                  | 507/1000 [00:00<00:00, 507.31it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [#Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [C] [C] [=C] [Branch1] [C] [Cl] [S] [C] [=Ring1] [=Branch1] [Cl] [=N] [C] [Branch1] [#C] [N] [C] [=C] [C] [=C] [C] [C] [=N] [NH1] [C] [Ring1] [=Branch2] [=Ring1] [Branch1] [=N] [Ring2] [Ring1] [=Branch2] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [#Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [C] [C] [=C] [Branch1] [C] [Cl] [S] [C] [=Ring1] [=Branch1] [Cl] [=N] [C] [Branch1] [#C] [N] [C] [=C] [C] [=C] [C] [C] [=N] [NH1] [C] [Ring1] [=Branch2] [=Ring1] [Branch1] [=N] [Ring2] [Ring1] [=Branch2] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(NCc2cc(Cl)sc2Cl)nc(Nc2cccc3cn[nH]c23)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [#Branch2] [N] [C] [=N] [C] [Branch1] [=C] [N] [C] [C] [C] [=C] [Branch1] [C] [Cl] [S] [C] [=Ring1] [=Branch1] [Cl] [=N] [C] [Br

 62%|███████████████████████▍              | 616/1000 [00:01<00:00, 521.42it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=N] [N] [=C] [Branch1] [Branch2] [N] [C] [C] [C] [C] [Ring1] [Branch1] [O] [Ring1] [#Branch2] [=N] [C] [Branch1] [S] [N] [C] [=C] [C] [=C] [Branch1] [C] [F] [C] [Branch1] [C] [F] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [O] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=N] [N] [=C] [Branch1] [Branch2] [N] [C] [C] [C] [C] [Ring1] [Branch1] [O] [Ring1] [#Branch2] [=N] [C] [Branch1] [S] [N] [C] [=C] [C] [=C] [Branch1] [C] [F] [C] [Branch1] [C] [F] [=C] [Ring1] [Branch2] [=N] [Ring2] [Ring1] [O] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(NCc2nnc(N3CCCC3)o2)nc(Nc2ccc(F)c(F)c2)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Ring2] [P] [N] [C] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=N]

 72%|███████████████████████████▌          | 725/1000 [00:01<00:00, 528.97it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C] [C] [C@H1] [Branch2] [Branch1] [#Branch2] [N] [C] [=N] [C] [Branch2] [Ring2] [C] [N] [C] [C] [C] [=C] [C] [=C] [Branch2] [Ring1] [Ring1] [O] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [C] [Branch1] [C] [C] [=C] [Ring1] [#Branch1] [C] [=C] [Ring2] [Ring1] [C] [=N] [C] [Branch1] [#Branch2] [N] [C] [=N] [C] [=C] [N] [=N] [Ring1] [=Branch1] [=N] [Ring2] [Ring2] [C] [C] [=Branch1] [C] [=O] [N] [Dy][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C] [C] [C@H1] [Branch2] [Branch1] [#Branch2] [N] [C] [=N] [C] [Branch2] [Ring2] [C] [N] [C] [C] [C] [=C] [C] [=C] [Branch2] [Ring1] [Ring1] [O] [C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [=C] [C] [Branch1] [C] [C] [=C] [Ring1] [#Branch1] [C] [=C] [Ring2] [Ring1] [C] [=N] [C] [Branch1] [#Branch2] [N] [C] [=N] [C] [=C] [N] [=N] [Ring1] [=Branch1] [=N] [Ring2] [Ring2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [EOS]
C#CCCC[C@H](Nc1nc(NCCc2ccc(OCC(=O)Nc3cccc(C)c3)cc2)nc(Nc2nccnn2)n1)C(=O)N[Dy]
--> [CLS] [MASK] [UN

 84%|███████████████████████████████▊      | 838/1000 [00:01<00:00, 540.47it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [=C] [C] [=C] [Branch1] [C] [C] [C] [Branch1] [C] [C] [=N] [Ring1] [Branch2] [=N] [C] [Branch1] [S] [N] [C] [C] [=N] [C] [=C] [C] [=C] [Ring1] [=Branch1] [N] [Branch1] [C] [C] [C] [=N] [Ring2] [Ring1] [O][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [=C] [C] [=C] [Branch1] [C] [C] [C] [Branch1] [C] [C] [=N] [Ring1] [Branch2] [=N] [C] [Branch1] [S] [N] [C] [C] [=N] [C] [=C] [C] [=C] [Ring1] [=Branch1] [N] [Branch1] [C] [C] [C] [=N] [Ring2] [Ring1] [O] [EOS]
C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2ccc(C)c(C)n2)nc(NCc2ncccc2N(C)C)n1
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [P] [N] [C] [C] [=C] [C] [=C] [Branch1] [C] [C] [C] [Branch1] [C] [C] [=N] [Ring1] [Br

 96%|████████████████████████████████████▎ | 957/1000 [00:01<00:00, 567.74it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [N] [C] [C] [C] [O] [C] [C] [Ring1] [#Branch1] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=C] [C] [=C] [Branch1] [Ring1] [O] [C] [C] [Branch1] [Ring1] [C] [#N] [=C] [Ring1] [#Branch2] [=N] [Ring2] [Ring1] [N][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [N] [C] [C] [C] [O] [C] [C] [Ring1] [#Branch1] [=N] [C] [Branch2] [Ring1] [Ring1] [N] [C] [C] [=C] [C] [=C] [Branch1] [Ring1] [O] [C] [C] [Branch1] [Ring1] [C] [#N] [=C] [Ring1] [#Branch2] [=N] [Ring2] [Ring1] [N] [EOS]
C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCN2CCCOCC2)nc(NCc2ccc(OC)c(C#N)c2)n1
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [=N] [N] [C] [C] [N] [C] [C] [C] [O] [C] [C] [Ring1] [#Branch1] [=

100%|█████████████████████████████████████| 1000/1000 [00:01<00:00, 532.28it/s]

[CLS][MASK][UNK][BOS][C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [S] [N] [C] [C] [=C] [C] [=C] [S] [C] [=N] [C] [Ring1] [Branch1] [=C] [Ring1] [=Branch2] [=N] [C] [Branch1] [#C] [N] [C] [=N] [C] [=C] [Branch1] [Ring1] [C] [#N] [C] [=C] [Ring1] [Branch2] [C] [=N] [Ring2] [Ring1] [O][EOS]
-->[CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [S] [N] [C] [C] [=C] [C] [=C] [S] [C] [=N] [C] [Ring1] [Branch1] [=C] [Ring1] [=Branch2] [=N] [C] [Branch1] [#C] [N] [C] [=N] [C] [=C] [Branch1] [Ring1] [C] [#N] [C] [=C] [Ring1] [Branch2] [C] [=N] [Ring2] [Ring1] [O] [EOS]
C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2ccc3scnc3c2)nc(Nc2ncc(C#N)cc2C)n1
--> [CLS] [MASK] [UNK] [BOS] [C] [#C] [C] [C@@H1] [Branch1] [Branch2] [C] [C] [=Branch1] [C] [=O] [N] [Dy] [N] [C] [=N] [C] [Branch1] [S] [N] [C] [C] [=C] [C] [=C] [S] [C] [=N] [C] [Ring1] [Branch1] [=C] [Ring1] [=Branch2] [=N]




In [11]:
import selfies

In [12]:
def smiles_to_selfies(smiles, keep_order=True, max_len = 10e6):
    assert keep_order
    sf = list(selfies.split_selfies(selfies.encoder(smiles)))
    assert len(sf) <= max_len
    sf = ' '.join(sf)
    return sf


def batch_selfies_tokenize(input_batch):
    MAX_LEN = 160
    SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer/selfies/'
    tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
    input_batch = [smiles_to_selfies(smiles, max_len = MAX_LEN) for smiles in input_batch]
    output = tokenizer.batch_encode_plus(input_batch,
                           padding = 'max_length',
                            max_length = MAX_LEN,
                           return_tensors='np',
                           return_token_type_ids = False,
                           return_attention_mask = False,
                           )
    input_ids = output['input_ids'].astype(np.uint8)
    # no UNK
    assert np.all(input_ids != tokenizer.unk_token_id)
    return input_ids

In [13]:
extract_features(func = batch_selfies_tokenize,
                 inputs = test_df['molecule'].to_list(),
                 save_dir = '/home/dangnh36/datasets/competitions/leash_belka/processed/features/',
                 feature_name = 'selfies',
                 subset = 'test',
                 method = 'batch',
                 num_workers = 40, 
                 joblib_backend='loky',
                 batch_size = 50_000,
                 save_formats = ['npy'],
                hook=partial(trim_padding, pad_idx = 0)
                )

100%|█████████████████████████████████████████| 18/18 [00:00<00:00, 949.87it/s]


SHAPE=(878022, 121) DTYPE=uint8
Saving npy at /home/dangnh36/datasets/competitions/leash_belka/processed/features/selfies/test.npy
Take: 2.4946253299713135 s


In [14]:
extract_features(func = batch_selfies_tokenize,
                 inputs = train_df['molecule'].to_list(),
                 save_dir = '/home/dangnh36/datasets/competitions/leash_belka/processed/features/',
                 feature_name = 'selfies',
                 subset = 'train',
                 method = 'batch',
                 num_workers = 40, 
                 joblib_backend='loky',
                 batch_size = 50_000,
                 save_formats = ['npy'],
                hook=partial(trim_padding, pad_idx = 0)
                )

100%|████████████████████████████████████| 1969/1969 [1:05:28<00:00,  2.00s/it]


SHAPE=(98415610, 130) DTYPE=uint8
Saving npy at /home/dangnh36/datasets/competitions/leash_belka/processed/features/selfies/train.npy
Take: 368.6767086982727 s


1

# DeepSMILES Tokenizer

In [47]:
import deepsmiles
from deepsmiles.encode import encode as deepsmiles_encode

In [48]:
VOCAB = {'Br': 1, 'C': 2, 'N': 3, 'O': 4, 'H': 5, 'S': 6, 'F': 7, 'Cl': 8, 'B': 9, 'I': 10, 's': 11, 'o': 12, 'c': 13, 'n': 14, 'i': 15, '.': 16, '=': 17, '#': 18, '/': 19, '-': 20, '+': 21, '[': 22, ']': 23, '(': 24, ')': 25, '@@': 26, '@': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, '[Dy]': 37, '%': 38 , '0': 39}
VOCAB = {k:v + 6 for k, v in VOCAB.items()}
print(VOCAB)

{'Br': 7, 'C': 8, 'N': 9, 'O': 10, 'H': 11, 'S': 12, 'F': 13, 'Cl': 14, 'B': 15, 'I': 16, 's': 17, 'o': 18, 'c': 19, 'n': 20, 'i': 21, '.': 22, '=': 23, '#': 24, '/': 25, '-': 26, '+': 27, '[': 28, ']': 29, '(': 30, ')': 31, '@@': 32, '@': 33, '1': 34, '2': 35, '3': 36, '4': 37, '5': 38, '6': 39, '7': 40, '8': 41, '9': 42, '[Dy]': 43, '%': 44, '0': 45}


In [49]:
SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/deepsmiles/'

VOCAB = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[MASK]': 3, '[SEP]': 4, '[BOS]': 5, '[EOS]': 6, 'Br': 7, 'C': 8, 'N': 9, 'O': 10, 'H': 11, 'S': 12, 'F': 13, 'Cl': 14, 'B': 15, 'I': 16, 's': 17, 'o': 18, 'c': 19, 'n': 20, 'i': 21, '.': 22, '=': 23, '#': 24, '/': 25, '-': 26, '+': 27, '[': 28, ']': 29, '(': 30, ')': 31, '@@': 32, '@': 33, '1': 34, '2': 35, '3': 36, '4': 37, '5': 38, '6': 39, '7': 40, '8': 41, '9': 42, '[Dy]': 43, '%': 44, '0': 45 }
# indeed, if '0' appear then it must be '%10' in BELKA dataset
# for simplicity, we just treat 0 as a normal digit, CNN will do the job to identify %10, %20, %30, %(102),..
len(VOCAB)

46

In [50]:
tokenizer = Tokenizer(BPE(vocab=VOCAB,
                          merges = [],
                          # merges=[('@','@')],
                          unk_token='[UNK]'))
tokenizer.add_tokens(['@@', 'Br', 'Cl', '[Dy]'])
print(len(tokenizer.get_vocab()))
print(sorted(tokenizer.get_vocab().items(), key = lambda x: x[1]))

46
[('[PAD]', 0), ('[UNK]', 1), ('[CLS]', 2), ('[MASK]', 3), ('[SEP]', 4), ('[BOS]', 5), ('[EOS]', 6), ('Br', 7), ('C', 8), ('N', 9), ('O', 10), ('H', 11), ('S', 12), ('F', 13), ('Cl', 14), ('B', 15), ('I', 16), ('s', 17), ('o', 18), ('c', 19), ('n', 20), ('i', 21), ('.', 22), ('=', 23), ('#', 24), ('/', 25), ('-', 26), ('+', 27), ('[', 28), (']', 29), ('(', 30), (')', 31), ('@@', 32), ('@', 33), ('1', 34), ('2', 35), ('3', 36), ('4', 37), ('5', 38), ('6', 39), ('7', 40), ('8', 41), ('9', 42), ('[Dy]', 43), ('%', 44), ('0', 45)]


In [51]:
tokenizer = PreTrainedTokenizerFast(
    model_max_length = None,
    padding_side = 'right',
    tokenizer_object = tokenizer,
    bos_token = '[BOS]',
    eos_token = '[EOS]',
    unk_token='[UNK]',
    sep_token = '[SEP]',
    pad_token = '[PAD]',
    cls_token = '[CLS]',
    mask_token = '[MASK]'
)
print(tokenizer.vocab)

{'4': 37, '6': 39, 'c': 19, 's': 17, '[UNK]': 1, '5': 38, '0': 45, '(': 30, '.': 22, 'C': 8, '[': 28, 'I': 16, '%': 44, '1': 34, '=': 23, '9': 42, '[EOS]': 6, 'F': 13, ')': 31, '/': 25, '[MASK]': 3, '[SEP]': 4, ']': 29, '[PAD]': 0, 'Cl': 14, '[Dy]': 43, '#': 24, 'n': 20, '3': 36, '@': 33, '@@': 32, 'Br': 7, 'S': 12, 'o': 18, 'O': 10, 'N': 9, 'i': 21, '[CLS]': 2, 'B': 15, '-': 26, '+': 27, '[BOS]': 5, '2': 35, '8': 41, 'H': 11, '7': 40}


In [52]:
tokenizer.save_pretrained(SAVE_DIR)

('/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/deepsmiles/tokenizer_config.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/deepsmiles/special_tokens_map.json',
 '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/deepsmiles/tokenizer.json')

In [53]:
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
print('TOKENIZER:', tokenizer)
print('VOCAB SIZE:', tokenizer.vocab_size)
print('VOCAB:', tokenizer.get_vocab())
print('SPECIAL TOKENS:', tokenizer.special_tokens_map)

TOKENIZER: PreTrainedTokenizerFast(name_or_path='/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer_v2/deepsmiles/', vocab_size=46, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, 

In [59]:
# test

converter = deepsmiles.Converter(rings=True, branches=True)

for smiles in smiles_list:
    if not 'Cl' in smiles:
        continue
    if not 'Br' in smiles:
        continue
#     deep_smiles = converter.encode(smiles)
    deep_smiles = deepsmiles_encode(smiles, True, True)
    deep_smiles2 = deepsmiles_encode(Chem.MolToSmiles(Chem.MolFromSmiles(smiles), doRandom=True), True, True)
    deep_smiles = f'[CLS] [SEP][UNK]{deep_smiles}[EOS][MASK]'
    decoded = tokenizer.decode(tokenizer.encode(deep_smiles))
    if 1:
        print(deep_smiles)
        print(deep_smiles2)
        print(decoded)
        print('\n')

[CLS] [SEP][UNK]O=CN[Dy]))cncCl)ccNcncNccccO)cc6CF)F)F)))))))))ncNcnc=O)[nH]cc6Br))))))))n6)))))))c6Cl[EOS][MASK]
[Dy]NC=O)ccCl)cNcncNcnc=O)[nH]cc6Br))))))))ncNcccccc6CF)F)F))))O))))))n6)))))))ccCl)n6
[CLS] [UNK] [SEP] [UNK] O = C N [Dy] ) ) c n c Cl ) c c N c n c N c c c c O ) c c 6 C F ) F ) F ) ) ) ) ) ) ) ) ) n c N c n c = O ) [ n H ] c c 6 Br ) ) ) ) ) ) ) ) n 6 ) ) ) ) ) ) ) c 6 Cl [EOS] [MASK]


[CLS] [SEP][UNK]O=CN[Dy]))cncCl)ccNcncNccccnccn5c9))))))))))ncNccccnccnc6c%10Br))))))))))))n6)))))))c6Cl[EOS][MASK]
ccccnccn6))))cc6))))Br))NcncncNccnccnc5cc9))))))))))n6)))NccCl)cC=O)N[Dy])))ncCl)c6
[CLS] [UNK] [SEP] [UNK] O = C N [Dy] ) ) c n c Cl ) c c N c n c N c c c c n c c n 5 c 9 ) ) ) ) ) ) ) ) ) ) n c N c c c c n c c n c 6 c % 1 0 Br ) ) ) ) ) ) ) ) ) ) ) ) n 6 ) ) ) ) ) ) ) c 6 Cl [EOS] [MASK]


[CLS] [SEP][UNK]O=CN[Dy]))[C@H]Ccccccc6F))))))))NcncNcccF)cF)cc6Br))))))))ncNccccCl)nc6Cl))))))))n6[EOS][MASK]
FccccNcncncN[C@@H]CccF)cccc6)))))))C=O)N[Dy])))))n6)))NccCl)nccc6))Cl)))))

In [7]:
def batch_deepsmiles_tokenize(input_batch):
    MAX_LEN = 200
    SAVE_DIR = '/home/dangnh36/datasets/competitions/leash_belka/processed/tokenizer/deepsmiles/'
    tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
    input_batch = [deepsmiles_encode(smiles, True, True) for smiles in input_batch]
    output = tokenizer.batch_encode_plus(input_batch,
                           padding = 'max_length',
                            max_length = MAX_LEN,
                           return_tensors='np',
                           return_token_type_ids = False,
                           return_attention_mask = False,
                           )
    input_ids = output['input_ids'].astype(np.uint8)
    # no UNK
    assert np.all(input_ids != tokenizer.unk_token_id)
    return input_ids

In [9]:
extract_features(func = batch_deepsmiles_tokenize,
                 inputs = test_df['molecule'].to_list(),
                 save_dir = '/home/dangnh36/datasets/competitions/leash_belka/processed/features/',
                 feature_name = 'deepsmiles',
                 subset = 'test',
                 method = 'batch',
                 num_workers = 40, 
                 joblib_backend='loky',
                 batch_size = 50_000,
                 save_formats = ['npy'],
                hook=partial(trim_padding, pad_idx = 0)
                )

100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 55.43it/s]


SHAPE=(878022, 153) DTYPE=uint8
Saving npy at /home/dangnh36/datasets/competitions/leash_belka/processed/features/deepsmiles/test.npy
Take: 3.4898571968078613 s


In [11]:
extract_features(func = batch_deepsmiles_tokenize,
                 inputs = train_df['molecule'].to_list(),
                 save_dir = '/home/dangnh36/datasets/competitions/leash_belka/processed/features/',
                 feature_name = 'deepsmiles',
                 subset = 'train',
                 method = 'batch',
                 num_workers = 40, 
                 joblib_backend='loky',
                 batch_size = 50_000,
                 save_formats = ['npy'],
                hook=partial(trim_padding, pad_idx = 0)
                )

100%|██████████████████████████████████████| 1969/1969 [11:01<00:00,  2.98it/s]


SHAPE=(98415610, 162) DTYPE=uint8
Saving npy at /home/dangnh36/datasets/competitions/leash_belka/processed/features/deepsmiles/train.npy
Take: 437.04083609580994 s
