In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import sys

if '..' not in sys.path: sys.path.append('..')

from pydantic import BaseModel
from pydantic_yaml import to_yaml_file, parse_yaml_file_as
from transformers import GPT2Tokenizer

from mllm.config.model import create_mllm_encdec_cfg, create_mllm_ranker_cfg, TokenizerCfg, \
    VocabEncoderCfg, EncoderCfg, MllmRankerCfg
from mllm.tokenization.chunk_tokenizer import calc_max_inp_size, gen_all_tokens, tokenizer_from_config


## Configuration generation
### Setup

In [3]:
# Directory holding the YAML configs: <parent of the working directory>/mllm/config/cfg.
cfg_dpath = Path(os.path.abspath(os.curdir)).parent.joinpath('mllm', 'config', 'cfg')
cfg_dpath

PosixPath('/home/misha/prog/mllm/mllm/config/cfg')

In [4]:
def save_config_to_yaml(cfg: BaseModel, fpath: Path, rewrite: bool = False):
    """Save `cfg` to `fpath` via `to_yaml_file`, keeping an existing file by default.

    Args:
        cfg: Config model to serialize.
        fpath: Destination YAML file path.
        rewrite: When true, `to_yaml_file` is called even if `fpath` already exists.

    If `fpath` exists and `rewrite` is false, nothing is written and a notice
    is printed instead.
    """
    keep_existing = fpath.exists() and not rewrite
    if keep_existing:
        print(f'File {fpath} already exists')
        return
    to_yaml_file(fpath, cfg)

### Tokenizer config

In [5]:
# Load the pretrained 'gpt2' tokenizer with model_max_length set to 10000.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', model_max_length=10000)
# Tokenizer size, taken before gen_all_tokens() is called on it.
n_tokens_init = len(tokenizer)
# Custom-token mapping; it is handed to TokenizerCfg(custom_tokens=...) in a later cell.
# NOTE(review): whether gen_all_tokens() also mutates `tokenizer` is not visible here - confirm.
tok_dict = gen_all_tokens(tokenizer)

In [6]:
# Collect the tokenizer settings into one config object; the bare name below displays it.
tkz_cfg = TokenizerCfg(
    name='gpt2',
    n_tokens_init=n_tokens_init,
    model_max_length=10000,
    custom_tokens=tok_dict,
)
tkz_cfg

TokenizerCfg(name='gpt2', n_tokens_init=50257, model_max_length=10000, custom_tokens={'doc_begin': CustomToken(name='doc_begin', repr='<|doc_begin|>', special=False, ind=50257), 'doc_end': CustomToken(name='doc_end', repr='<|doc_end|>', special=False, ind=50258), 'doc_id_begin': CustomToken(name='doc_id_begin', repr='<|doc_id_begin|>', special=False, ind=50259), 'doc_id_end': CustomToken(name='doc_id_end', repr='<|doc_id_end|>', special=False, ind=50260), 'doc_offset_begin': CustomToken(name='doc_offset_begin', repr='<|doc_offset_begin|>', special=False, ind=50261), 'doc_offset_end': CustomToken(name='doc_offset_end', repr='<|doc_offset_end|>', special=False, ind=50262), 'doc_title_begin': CustomToken(name='doc_title_begin', repr='<|doc_title_begin|>', special=False, ind=50263), 'doc_title_end': CustomToken(name='doc_title_end', repr='<|doc_title_end|>', special=False, ind=50264), 'doc_body_begin': CustomToken(name='doc_body_begin', repr='<|doc_body_begin|>', special=False, ind=50265),

In [7]:
# Store the tokenizer config; with rewrite left at False an existing file is kept.
tkz_cfg_fpath = cfg_dpath.joinpath('tokenizer_cfg_01.yaml')
save_config_to_yaml(cfg=tkz_cfg, fpath=tkz_cfg_fpath, rewrite=False)

File /home/misha/prog/mllm/mllm/config/cfg/tokenizer_cfg_01.yaml already exists


In [8]:
# Report the tokenizer size that was recorded into the tokenizer config.
print('n_tokens_init =', n_tokens_init)

n_tokens_init = 50257


### MLLM Ranker first layer config

In [11]:
# Reload the tokenizer config from its YAML file and build the tokenizer from it.
tkz_cfg_fpath = cfg_dpath.joinpath('tokenizer_cfg_01.yaml')
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
# Sanity check: pad token id first, then the tokenizer description.
print(tkz.pad_token_id, tkz, sep='\n')

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

In [13]:
# Hyperparameters of the single-level ranker config (01).
n_vocab = len(tkz)
seq_len = 100
d_word_wec = 256  # spelling follows the keyword accepted by create_mllm_ranker_cfg
n_heads = 8
d_model = 256
d_inner = 1024
dropout_rate = 0.0
n_levels = 1
enc_n_layers = 1
enc_with_graph_mat = False
enc_with_emb_mat = True
dec_n_layers = 1
# The pad index comes from the custom 'pad' token stored in the tokenizer config.
pad_tok = tkz_cfg.custom_tokens['pad'].ind

ranker_01_kwargs = dict(
    n_vocab=n_vocab,
    inp_len=seq_len,
    d_word_wec=d_word_wec,
    dropout_rate=dropout_rate,
    n_levels=n_levels,
    enc_n_layers=enc_n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_inner=d_inner,
    enc_with_graph_mat=enc_with_graph_mat,
    enc_with_emb_mat=enc_with_emb_mat,
    dec_n_layers=dec_n_layers,
    pad_idx=pad_tok,
)
ranker_cfg_01 = create_mllm_ranker_cfg(**ranker_01_kwargs)
ranker_cfg_01

MllmRankerCfg(vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0), encoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True)], decoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False)])

In [14]:
# Store the single-level ranker config; with rewrite left at False an existing file is kept.
ranker_cfg_01_fpath = cfg_dpath.joinpath('ranker_model_cfg_01.yaml')
save_config_to_yaml(cfg=ranker_cfg_01, fpath=ranker_cfg_01_fpath, rewrite=False)

File /home/misha/prog/mllm/mllm/config/cfg/ranker_model_cfg_01.yaml already exists


### MLLM Encdec and Ranker multi-layer configs

In [10]:
# Reload the tokenizer config from its YAML file and build the tokenizer from it.
tkz_cfg_fpath = cfg_dpath.joinpath('tokenizer_cfg_01.yaml')
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
# Sanity check: pad token id first, then the tokenizer description.
print(tkz.pad_token_id, tkz, sep='\n')

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

In [11]:
# Hyperparameters shared by the two-level encdec and ranker configs (02).
n_vocab = len(tkz)
seq_len = 100
d_word_wec = 256  # spelling follows the keyword accepted by the config factories
n_heads = 8
d_model = 256
d_inner = 1024
dropout_rate = 0.0
n_levels = 2
enc_n_layers = 1
enc_with_graph_mat = False
# A pair of flags; the printed configs show the first encoder with
# with_emb_mat=True and the second with with_emb_mat=False.
enc_with_emb_mat = (True, False)
dec_n_layers = 1
# The pad index comes from the custom 'pad' token stored in the tokenizer config.
pad_tok = tkz_cfg.custom_tokens['pad'].ind

# Both factories take exactly the same arguments, so they are spelled out once.
shared_kwargs_02 = dict(
    n_vocab=n_vocab,
    inp_len=seq_len,
    d_word_wec=d_word_wec,
    dropout_rate=dropout_rate,
    n_levels=n_levels,
    enc_n_layers=enc_n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_inner=d_inner,
    enc_with_graph_mat=enc_with_graph_mat,
    enc_with_emb_mat=enc_with_emb_mat,
    dec_n_layers=dec_n_layers,
    pad_idx=pad_tok,
)

encdec_cfg_02 = create_mllm_encdec_cfg(**shared_kwargs_02)
ranker_cfg_02 = create_mllm_ranker_cfg(**shared_kwargs_02)

print(encdec_cfg_02, ranker_cfg_02, sep='\n')

vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True), EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False)] decoders=[EmbDecoderCfg(d_emb=256, n_layers=1, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0), EmbDecoderCfg(d_emb=256, n_layers=1, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0)]
vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True), EncoderCfg(n_layers=1, n_heads=8, d_k=32, d_v=32, d_model=25

In [13]:
encdec_cfg_02_fpath = cfg_dpath.joinpath('encdec_model_cfg_02.yaml')
ranker_cfg_02_fpath = cfg_dpath.joinpath('ranker_model_cfg_02.yaml')
# Switch to True to write the YAML files even when they already exist.
rewrite = False
for cfg_02, cfg_02_fpath in (
    (encdec_cfg_02, encdec_cfg_02_fpath),
    (ranker_cfg_02, ranker_cfg_02_fpath),
):
    save_config_to_yaml(cfg_02, cfg_02_fpath, rewrite)

File /home/misha/prog/mllm/mllm/config/cfg/encdec_model_cfg_02.yaml already exists
File /home/misha/prog/mllm/mllm/config/cfg/ranker_model_cfg_02.yaml already exists


In [14]:
# Hyperparameters shared by the two-level encdec and ranker configs (03);
# compared with the 02 configs, enc_n_layers and dec_n_layers are 2 instead of 1.
n_vocab = len(tkz)
seq_len = 100
d_word_wec = 256  # spelling follows the keyword accepted by the config factories
n_heads = 8
d_model = 256
d_inner = 1024
dropout_rate = 0.0
n_levels = 2
enc_n_layers = 2
enc_with_graph_mat = False
# A pair of flags; the printed configs show the first encoder with
# with_emb_mat=True and the second with with_emb_mat=False.
enc_with_emb_mat = (True, False)
dec_n_layers = 2
# The pad index comes from the custom 'pad' token stored in the tokenizer config.
pad_tok = tkz_cfg.custom_tokens['pad'].ind

# Both factories take exactly the same arguments, so they are spelled out once.
shared_kwargs_03 = dict(
    n_vocab=n_vocab,
    inp_len=seq_len,
    d_word_wec=d_word_wec,
    dropout_rate=dropout_rate,
    n_levels=n_levels,
    enc_n_layers=enc_n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_inner=d_inner,
    enc_with_graph_mat=enc_with_graph_mat,
    enc_with_emb_mat=enc_with_emb_mat,
    dec_n_layers=dec_n_layers,
    pad_idx=pad_tok,
)

encdec_cfg_03 = create_mllm_encdec_cfg(**shared_kwargs_03)
ranker_cfg_03 = create_mllm_ranker_cfg(**shared_kwargs_03)

print(encdec_cfg_03, ranker_cfg_03, sep='\n')

vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True), EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False)] decoders=[EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0), EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0)]
vocab_encoder=VocabEncoderCfg(n_vocab=50270, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True), EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=25

In [15]:
encdec_cfg_03_fpath = cfg_dpath / 'encdec_model_cfg_03.yaml'
ranker_cfg_03_fpath = cfg_dpath / 'ranker_model_cfg_03.yaml'
# Keep existing files by default so that "Restart & Run All" never silently
# overwrites the saved configs. Missing files are still created. Set this to
# True only when the YAML files have to be regenerated on purpose.
rewrite = False
save_config_to_yaml(encdec_cfg_03, encdec_cfg_03_fpath, rewrite)
save_config_to_yaml(ranker_cfg_03, ranker_cfg_03_fpath, rewrite)