In [26]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2: reload all imported modules before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import os
from pathlib import Path
from pprint import pprint
import sys
from typing import TypeVar

if '..' not in sys.path: sys.path.append('..')

from pydantic import BaseModel
from pydantic_yaml import to_yaml_file, parse_yaml_file_as
from transformers import GPT2Tokenizer, PreTrainedTokenizer

from mllm.config.model import create_mllm_encdec_cfg, create_mllm_ranker_cfg, TokenizerCfg, \
    VocabEncoderCfg, EncoderCfg, MllmRankerCfg, EncdecHgCfg, create_encdec_hg_cfg, CustomToken, HgReductType, \
    create_ranker_hg_cfg, DecRankType
from mllm.tokenization.chunk_tokenizer import calc_max_inp_size, gen_all_tokens, tokenizer_from_config


## Configuration generation
### Setup

In [28]:
# Directory holding the YAML configs: <parent of the current working dir>/mllm/config/cfg.
# The result therefore depends on where the notebook kernel was started.
cfg_dpath = Path(os.getcwd()).parent / 'mllm' / 'config' / 'cfg'
cfg_dpath

PosixPath('/home/misha/prog/mllm/mllm/config/cfg')

In [31]:
TCfg = TypeVar('TCfg', bound=BaseModel)


def save_config_to_yaml(cfg: TCfg, fpath: Path, overwrite: bool = False):
    """Write `cfg` to `fpath` as YAML, or compare it with the file already there.

    Args:
        cfg: Pydantic model instance to serialise.
        fpath: Destination YAML file.
        overwrite: When true, the file is written even if it already exists.

    If `fpath` exists and `overwrite` is false, nothing is written: the existing
    file is parsed as the same class as `cfg` and the result of the equality
    comparison is printed.
    """
    if fpath.exists() and not overwrite:
        # Keep the file on disk; just report whether it matches the in-memory config.
        cfg_on_disk = parse_yaml_file_as(type(cfg), fpath)
        print(f'File {fpath} already exists. Equal:', cfg == cfg_on_disk)
        return
    to_yaml_file(fpath, cfg)

### Tokenizer config

In [5]:
# Start from the pretrained GPT-2 tokenizer.
tkz = GPT2Tokenizer.from_pretrained('gpt2', model_max_length=10000)
# Capture the vocabulary size BEFORE gen_all_tokens runs: the later print cell reports
# a larger len(tkz), so gen_all_tokens appears to extend `tkz` in place — keep this order.
n_tokens_init = len(tkz)
# presumably a name -> CustomToken mapping (it is passed as `custom_tokens` to TokenizerCfg) — verify
tok_dict = gen_all_tokens(tkz, with_train=True)

In [6]:
# Tokenizer description that gets serialised to YAML in the next cell.
tkz_cfg = TokenizerCfg(
    name='gpt2',
    n_tokens_init=n_tokens_init,
    model_max_length=10000,
    custom_tokens=tok_dict,
)
tkz_cfg

TokenizerCfg(name='gpt2', n_tokens_init=50257, model_max_length=10000, custom_tokens={'doc_begin': CustomToken(name='doc_begin', repr='<|doc_begin|>', special=False, ind=50257), 'doc_end': CustomToken(name='doc_end', repr='<|doc_end|>', special=False, ind=50258), 'doc_id_begin': CustomToken(name='doc_id_begin', repr='<|doc_id_begin|>', special=False, ind=50259), 'doc_id_end': CustomToken(name='doc_id_end', repr='<|doc_id_end|>', special=False, ind=50260), 'doc_offset_begin': CustomToken(name='doc_offset_begin', repr='<|doc_offset_begin|>', special=False, ind=50261), 'doc_offset_end': CustomToken(name='doc_offset_end', repr='<|doc_offset_end|>', special=False, ind=50262), 'doc_title_begin': CustomToken(name='doc_title_begin', repr='<|doc_title_begin|>', special=False, ind=50263), 'doc_title_end': CustomToken(name='doc_title_end', repr='<|doc_title_end|>', special=False, ind=50264), 'doc_body_begin': CustomToken(name='doc_body_begin', repr='<|doc_body_begin|>', special=False, ind=50265),

In [7]:
# Persist the tokenizer config. An existing file is kept and only compared with `tkz_cfg`.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
save_config_to_yaml(cfg=tkz_cfg, fpath=tkz_cfg_fpath, overwrite=False)

File /home/misha/prog/mllm/mllm/config/cfg/tokenizer_cfg_01.yaml already exists. Equal: True


In [8]:
# Vocabulary size captured before gen_all_tokens was called vs. the tokenizer's current size.
print(f'n_tokens_init = {n_tokens_init}. Tokens: {len(tkz)}')

n_tokens_init = 50257. Tokens: 50271


### MLLM Encdec and Ranker multi-level, multi-layer configs

#### Transformer with 2 layers

In [36]:
# Rebuild the tokenizer from the YAML config on disk.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
# Single print call, same two output lines: pad token id, then the tokenizer repr.
print(tkz.pad_token_id, tkz, sep='\n')

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

In [39]:
# Hyper-parameters shared by the encoder-decoder and the ranker configs.
n_vocab = len(tkz)
seq_len = 100
d_word_wec = 256
n_heads, d_model, d_inner = 8, 256, 1024
dropout_rate = 0.0
n_levels = 2
enc_n_layers = 2
enc_with_graph_mat = False
# A tuple on purpose, not a typo: the printed config shows with_emb_mat=False for the
# first encoder and with_emb_mat=True for the second.
enc_with_emb_mat = (False, True)
dec_n_layers = 2
pad_tok = tkz_cfg.custom_tokens['pad'].ind
dec_with_vocab_decoder = True

# Keyword arguments common to both factory calls, collected once so they cannot drift apart.
shared_cfg_kwargs = dict(
    n_vocab=n_vocab,
    inp_len=seq_len,
    d_word_wec=d_word_wec,
    dropout_rate=dropout_rate,
    n_levels=n_levels,
    enc_n_layers=enc_n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_inner=d_inner,
    enc_with_graph_mat=enc_with_graph_mat,
    enc_with_emb_mat=enc_with_emb_mat,
    dec_n_layers=dec_n_layers,
    pad_idx=pad_tok,
)

# Only the encoder-decoder config takes the vocab-decoder switch.
encdec_cfg_01 = create_mllm_encdec_cfg(**shared_cfg_kwargs, with_vocab_decoder=dec_with_vocab_decoder)
ranker_cfg_01 = create_mllm_ranker_cfg(**shared_cfg_kwargs)

print(encdec_cfg_01, ranker_cfg_01, sep='\n')

vocab_encoder=VocabEncoderCfg(n_vocab=50271, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False), EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True)] decoders=[EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0), EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0)] with_vocab_decoder=True
vocab_encoder=VocabEncoderCfg(n_vocab=50271, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False), EncoderCfg(n_layers=2, n_heads=8, d

In [40]:
encdec_cfg_01_fpath = cfg_dpath / 'encdec_model_cfg_01.yaml'
ranker_cfg_01_fpath = cfg_dpath / 'ranker_model_cfg_01.yaml'
# Set to True to regenerate both files; with False an existing file is kept and
# only the equality with the in-memory config is reported.
overwrite = False
save_config_to_yaml(cfg=encdec_cfg_01, fpath=encdec_cfg_01_fpath, overwrite=overwrite)
save_config_to_yaml(cfg=ranker_cfg_01, fpath=ranker_cfg_01_fpath, overwrite=overwrite)

File /home/misha/prog/mllm/mllm/config/cfg/encdec_model_cfg_01.yaml already exists. Equal: True
File /home/misha/prog/mllm/mllm/config/cfg/ranker_model_cfg_01.yaml already exists. Equal: True


### Encoder Decoder Hourglass model

In [5]:
# Rebuild the tokenizer from the YAML config on disk for the hourglass configs below.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
# Single print call, same two output lines: pad token id, then the tokenizer repr.
print(tkz.pad_token_id, tkz, sep='\n')

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

#### d_model = 256, d_inner = 1024

In [6]:
# Hourglass encoder-decoder config: d_model=256, d_inner=1024, input length 256.
cfg_encdec_hg = create_encdec_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=256,
    n_heads=8,
    d_inner=1024,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
# NOTE(review): `.dict()` is deprecated in pydantic v2 in favour of `.model_dump()` —
# confirm the pinned pydantic version before switching.
pprint(cfg_encdec_hg.dict())

{'dec_pyr': {'d_inner': 1024,
             'd_k': 32,
             'd_model': 256,
             'd_v': 32,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 8,
             'n_layers': 8,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 1024,
             'd_k': 32,
             'd_model': 256,
             'd_v': 32,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 8,
             'n_layers': 8,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 256,
                               'd_word_vec': 256,
                               'dropout_rate': 0.0,
                               'inp_len': 256,
                               'n_vocab': 50271,
                               'pad_idx': 50267}}}


In [7]:
# Target file for the d_model=256 hourglass config; an existing file is kept and only compared.
model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_01.yaml'
print(f'Save {model_cfg_fpath}')
save_config_to_yaml(cfg=cfg_encdec_hg, fpath=model_cfg_fpath, overwrite=False)

File /home/misha/prog/mllm/mllm/config/cfg/encdec_hg_cfg_01.yaml already exists. Equal: True


In [8]:
# Read the saved YAML back and display the parsed reduct_type; the cell output shows it
# comes back as an HgReductType member.
# Note: `model_cfg_fpath` is whatever the most recently run save cell assigned.
cfg = parse_yaml_file_as(EncdecHgCfg, model_cfg_fpath)
cfg.enc_pyr.reduct_type

<HgReductType.Matmul: 'matmul'>

#### d_model = 512, d_inner = 2048

In [9]:
# Hourglass encoder-decoder config: d_model=512, d_inner=2048, input length 128.
cfg_encdec_hg = create_encdec_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=512,
    n_heads=8,
    d_inner=2048,
    inp_len=128,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
pprint(cfg_encdec_hg.dict())

# An existing file is kept and only compared with the config built above.
model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_02.yaml'
print(f'Save {model_cfg_fpath}')
save_config_to_yaml(cfg=cfg_encdec_hg, fpath=model_cfg_fpath, overwrite=False)

{'dec_pyr': {'d_inner': 2048,
             'd_k': 64,
             'd_model': 512,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 128,
             'n_heads': 8,
             'n_layers': 7,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 2048,
             'd_k': 64,
             'd_model': 512,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 128,
             'n_heads': 8,
             'n_layers': 7,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 512,
                               'd_word_vec': 512,
                               'dropout_rate': 0.0,
                               'inp_len': 128,
                               'n_vocab': 50271,
                               'pad_idx': 50267}}}
Save /home

#### d_model = 768, d_inner = 3072

In [38]:
# Hourglass encoder-decoder config: d_model=768, d_inner=3072, 12 heads, input length 256.
cfg_encdec_hg = create_encdec_hg_cfg(
    n_vocab=len(tkz), pad_idx=tkz.pad_token_id,
    d_model=768, n_heads=12, d_inner=3072, inp_len=256, step=2, dropout_rate=0.0, n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
pprint(cfg_encdec_hg.dict())
model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_03.yaml'
print(f'Save {model_cfg_fpath}')
# Non-destructive by default, matching the other save cells: with False an existing
# file is kept and save_config_to_yaml only reports whether it equals the config above.
# Switch to True deliberately (and switch back) when the file must be regenerated;
# leaving it on makes every "Run All" silently rewrite the YAML.
overwrite = False
# overwrite = True
save_config_to_yaml(cfg_encdec_hg, model_cfg_fpath, overwrite=overwrite)

{'dec_pyr': {'d_inner': 3072,
             'd_k': 64,
             'd_model': 768,
             'd_v': 64,
             'dropout_rate': 0.0,
             'enhance_type': <HgEnhanceType.Matmul: 'matmul'>,
             'inp_len': 256,
             'n_heads': 12,
             'n_layers': 8,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 3072,
             'd_k': 64,
             'd_model': 768,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 12,
             'n_layers': 8,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 768,
                               'd_word_vec': 768,
                               'dropout_rate': 0.0,
                               'inp_len': 256,
                               'n_vocab': 502

### Ranker Hourglass model

#### d_model = 256, d_inner = 1024

In [19]:
# Load the tokenizer config from YAML and rebuild the tokenizer used by the ranker configs below.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)

In [20]:
# Hourglass ranker config: d_model=256, d_inner=1024, input length 256, simple rank decoder.
ranker_hg_cfg = create_ranker_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=256,
    n_heads=8,
    d_inner=1024,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
    dec_type=DecRankType.Simple,
)

# An existing file is kept and only compared with the config built above.
ranker_hg_cfg_fpath = cfg_dpath / 'ranker_hg_cfg_01.yaml'
print(f'Save {ranker_hg_cfg_fpath}')
save_config_to_yaml(cfg=ranker_hg_cfg, fpath=ranker_hg_cfg_fpath, overwrite=False)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_01.yaml
File /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_01.yaml already exists. Equal: True


#### d_model = 512, d_inner = 2048

In [23]:
# NOTE(review): `encdec_hg_cfg_fpath` is not used anywhere in this cell; it is kept
# because a later cell may read it — confirm and drop if nothing does.
encdec_hg_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_02.yaml'

# Hourglass ranker config: d_model=512, d_inner=2048, input length 128, simple rank decoder.
ranker_hg_cfg = create_ranker_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=512,
    n_heads=8,
    d_inner=2048,
    inp_len=128,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
    dec_type=DecRankType.Simple,
)

# An existing file is kept and only compared with the config built above.
ranker_hg_cfg_fpath = cfg_dpath / 'ranker_hg_cfg_02.yaml'
print(f'Save {ranker_hg_cfg_fpath}')
save_config_to_yaml(cfg=ranker_hg_cfg, fpath=ranker_hg_cfg_fpath, overwrite=False)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_02.yaml
File /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_02.yaml already exists. Equal: True
