In [1]:
%load_ext autoreload
%autoreload 2

In [22]:
import os
from pathlib import Path
import sys

if '..' not in sys.path: sys.path.append('..')

from pydantic import BaseModel
from pydantic_yaml import to_yaml_file, parse_yaml_file_as
from transformers import GPT2Tokenizer

from mllm.config.model import create_mllm_encdec_cfg, create_mllm_ranker_cfg, TokenizerCfg, \
    VocabEncoderCfg, EncoderCfg, MllmRankerCfg
from mllm.tokenization.chunk_tokenizer import calc_max_inp_size, gen_all_tokens, tokenizer_from_config


## Configuration generation
### Setup

In [3]:
cfg_dpath = Path(os.path.abspath('.')).parent / 'mllm' / 'config' / 'cfg_v001'
cfg_dpath

PosixPath('/home/misha/prog/mllm/mllm/config/cfg_v001')

In [11]:
def save_config_to_yaml(cfg: BaseModel, fpath: Path, overwrite: bool = False):
    if not fpath.exists() or overwrite:
        to_yaml_file(fpath, cfg)
    else:
        print(f'File {fpath} already exists')

### Tokenizer config

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', model_max_length=10000)
n_tokens_init = len(tokenizer)
tok_dict = gen_all_tokens(tokenizer)

In [12]:
tkz_cfg = TokenizerCfg(name='gpt2', n_tokens_init=n_tokens_init, model_max_length=10000, custom_tokens=tok_dict)
tkz_cfg

TokenizerCfg(name='gpt2', n_tokens_init=50257, model_max_length=10000, custom_tokens={'doc_begin': CustomToken(name='doc_begin', repr='<|doc_begin|>', special=False, ind=50257), 'doc_end': CustomToken(name='doc_end', repr='<|doc_end|>', special=False, ind=50258), 'doc_id_begin': CustomToken(name='doc_id_begin', repr='<|doc_id_begin|>', special=False, ind=50259), 'doc_id_end': CustomToken(name='doc_id_end', repr='<|doc_id_end|>', special=False, ind=50260), 'doc_offset_begin': CustomToken(name='doc_offset_begin', repr='<|doc_offset_begin|>', special=False, ind=50261), 'doc_offset_end': CustomToken(name='doc_offset_end', repr='<|doc_offset_end|>', special=False, ind=50262), 'doc_title_begin': CustomToken(name='doc_title_begin', repr='<|doc_title_begin|>', special=False, ind=50263), 'doc_title_end': CustomToken(name='doc_title_end', repr='<|doc_title_end|>', special=False, ind=50264), 'doc_body_begin': CustomToken(name='doc_body_begin', repr='<|doc_body_begin|>', special=False, ind=50265),

In [13]:
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
save_config_to_yaml(tkz_cfg, tkz_cfg_fpath)

File /home/misha/prog/mllm/mllm/config/cfg_v001/tokenizer_cfg_01.yaml already exists


In [15]:
print(f'n_tokens_init = {n_tokens_init}')
tkz = tokenizer_from_config(tkz_cfg)
tkz

n_tokens_init = 50257


GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=False,

### MLLM Ranker first layer config

In [16]:
model_cfg = create_mllm_ranker_cfg(
    n_vocab=len(tokenizer), inp_len=100, d_word_wec=256,
    n_levels=1, enc_n_layers=1, dec_n_layers=1,
    n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024,
    pad_idx=-1, dropout_rate=0.0, enc_with_emb_mat=True,
)
model_cfg

MllmRankerCfg(vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=-1, inp_len=100, dropout_rate=0.0), encoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=-1, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True)], decoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=-1, with_graph_mat=False, inp_len=0, dropout_rate=0.0, with_emb_mat=False)])

In [17]:
ranker_cfg_fpath = cfg_dpath / 'ranker_model_cfg_01.yaml'
save_config_to_yaml(model_cfg, ranker_cfg_fpath)

File /home/misha/prog/mllm/mllm/config/cfg_v001/ranker_model_cfg_01.yaml already exists


### MLLM Ranker multi layers config

In [24]:
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
print(tkz.pad_token_id)
print(tkz)

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

In [23]:
n_vocab = len(tkz)
inp_len = 100
d_word_wec = 256,
n_heads, d_k, d_v, d_model, d_inner = 8, 32, 32, 256, 1024
pad_idx = tkz.pad_tok
dropout_rate = 0.0
enc_with_emb_mat = True

cfg_vocab_enc = VocabEncoderCfg(
    n_vocab=n_vocab, d_word_vec=d_word_wec, d_model=d_model, pad_idx=pad_idx, inp_len=inp_len, dropout_rate=dropout_rate,
)
cfgs_enc = [
    EncoderCfg(
        n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v, d_model=d_model, d_inner=d_inner, pad_idx=pad_idx,
        with_graph_mat=enc_with_graph_mat, inp_len=inp_len, dropout_rate=dropout_rate, with_emb_mat=enc_with_emb_mat,
    ),
    EncoderCfg(
        n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v, d_model=d_model, d_inner=d_inner, pad_idx=pad_idx,
        with_graph_mat=enc_with_graph_mat, inp_len=inp_len, dropout_rate=dropout_rate, with_emb_mat=enc_with_emb_mat,
    )
]

cfgs_dec = [
    EncoderCfg(
        n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v, d_model=d_model, d_inner=d_inner, pad_idx=pad_idx,
        with_graph_mat=False, inp_len=0, dropout_rate=dropout_rate, with_emb_mat=False,
    ),
    EncoderCfg(
        n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v, d_model=d_model, d_inner=d_inner, pad_idx=pad_idx,
        with_graph_mat=False, inp_len=0, dropout_rate=dropout_rate, with_emb_mat=False,
    )
]

cfg_mllm_ranker = MllmRankerCfg(
    vocab_encoder=cfg_vocab_enc, encoders=cfgs_enc, decoders=cfgs_dec,
)

cfg_mllm_ranker

AttributeError: 'GPT2Tokenizer' object has no attribute 'pad_tok'