In [1]:
# Enable IPython's autoreload extension; with mode 2, edits to imported modules
# (e.g. the local `mllm` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
from pprint import pprint
import sys
from typing import TypeVar

if '..' not in sys.path: sys.path.append('..')

from pydantic import BaseModel
from pydantic_yaml import to_yaml_file, parse_yaml_file_as
import torch
from transformers import GPT2Tokenizer, PreTrainedTokenizer, BertModel, BertConfig, AutoTokenizer

from mllm.config.model import create_mllm_encdec_cfg, create_mllm_ranker_cfg, TokenizerCfg, \
    VocabEncoderCfg, EncoderCfg, MllmRankerCfg, EncdecHgCfg, create_encdec_hg_cfg, CustomToken, HgReductType, \
    create_ranker_hg_cfg,PosEncType, create_encdec_bert_cfg, BertEmbType, HgEnhanceType, create_ranker_bert_cfg, \
    create_encdecrnk_bert_cfg, create_encmix_bert_cfg, EncmixOutEmbsType, create_genmix_bert_cfg
from mllm.tokenization.chunk_tokenizer import calc_max_inp_size, gen_all_tokens, tokenizer_from_config




## Configuration generation
### Setup

In [3]:
# Config directory, resolved relative to the parent of the current working directory:
# <cwd>/../mllm/config/cfg
cfg_dpath = Path.cwd().parent.joinpath('mllm', 'config', 'cfg')
cfg_dpath

PosixPath('/home/misha/prog/mllm/mllm/config/cfg')

In [4]:
# Type variable constrained to pydantic models (the config objects handled below).
TCfg = TypeVar('TCfg', bound=BaseModel)


def save_config_to_yaml(cfg: TCfg, fpath: Path, overwrite: bool = False):
    """Write `cfg` to `fpath` as YAML, or compare it with the file that is already there.

    Args:
        cfg: Pydantic config object to serialize.
        fpath: Destination YAML file.
        overwrite: When True, the file is (re)written unconditionally. When False (default)
            and the file exists, it is left untouched: it is parsed back as `type(cfg)` and
            the equality of the parsed and in-memory configs is printed.
    """
    file_present = fpath.exists()
    if file_present and not overwrite:
        # Keep the existing file; only report whether it matches the in-memory config.
        cfg_on_disk = parse_yaml_file_as(type(cfg), fpath)
        print(f'File {fpath} already exists. Equal:', cfg == cfg_on_disk)
        return
    to_yaml_file(fpath, cfg)

### Tokenizer config

In [7]:
# GPT-2 tokenizer with model_max_length set to 10000.
tkz = GPT2Tokenizer.from_pretrained('gpt2', model_max_length=10000)
# Record the vocabulary size before the custom tokens are generated on the next line.
n_tokens_init = len(tkz)
# NOTE(review): presumably registers the project's custom tokens on `tkz` (a later cell
# reports len(tkz) == 50271 vs n_tokens_init == 50257) — confirm in gen_all_tokens.
tok_dict = gen_all_tokens(tkz, with_train=True)

In [8]:
# Bundle the tokenizer name, its initial vocabulary size and the generated custom tokens
# into a tokenizer config object.
tkz_cfg = TokenizerCfg(
    name='gpt2',
    n_tokens_init=n_tokens_init,
    model_max_length=10000,
    custom_tokens=tok_dict,
)
print(f'pad_token_id: {tkz.pad_token_id}')
tkz_cfg

pad_token_id: 50267


TokenizerCfg(name='gpt2', n_tokens_init=50257, model_max_length=10000, custom_tokens={'doc_begin': CustomToken(name='doc_begin', repr='<|doc_begin|>', special=False, ind=50257), 'doc_end': CustomToken(name='doc_end', repr='<|doc_end|>', special=False, ind=50258), 'doc_id_begin': CustomToken(name='doc_id_begin', repr='<|doc_id_begin|>', special=False, ind=50259), 'doc_id_end': CustomToken(name='doc_id_end', repr='<|doc_id_end|>', special=False, ind=50260), 'doc_offset_begin': CustomToken(name='doc_offset_begin', repr='<|doc_offset_begin|>', special=False, ind=50261), 'doc_offset_end': CustomToken(name='doc_offset_end', repr='<|doc_offset_end|>', special=False, ind=50262), 'doc_title_begin': CustomToken(name='doc_title_begin', repr='<|doc_title_begin|>', special=False, ind=50263), 'doc_title_end': CustomToken(name='doc_title_end', repr='<|doc_title_end|>', special=False, ind=50264), 'doc_body_begin': CustomToken(name='doc_body_begin', repr='<|doc_body_begin|>', special=False, ind=50265),

In [7]:
# Persist the tokenizer config. overwrite is left at its default (False), so an existing
# file is kept and only compared with `tkz_cfg`.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
save_config_to_yaml(tkz_cfg, tkz_cfg_fpath)

File /home/misha/prog/mllm/mllm/config/cfg/tokenizer_cfg_01.yaml already exists. Equal: True


In [8]:
# Compare the vocabulary size recorded right after loading the tokenizer with the current one.
print(f'n_tokens_init = {n_tokens_init}. Tokens: {len(tkz)}')

n_tokens_init = 50257. Tokens: 50271


### MLLM Encdec and Ranker multi-level, multi-layer configs

#### Transformer with 2 layers

In [36]:
# Reload the tokenizer config from its YAML file and rebuild the tokenizer from it,
# rebinding `tkz_cfg` and `tkz` instead of reusing the objects created in earlier cells.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
print(tkz.pad_token_id)
print(tkz)

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

In [39]:
# --- Hyperparameters shared by the encoder-decoder and the ranker configs ---
n_vocab = len(tkz)
seq_len = 100
d_word_wec = 256
n_heads = 8
d_model = 256
d_inner = 1024
dropout_rate = 0.0
n_levels = 2
enc_n_layers = 2
enc_with_graph_mat = False
# A tuple, not a scalar: the printed config shows with_emb_mat=False for the first
# encoder and with_emb_mat=True for the second one.
enc_with_emb_mat = (False, True)
dec_n_layers = 2
# Padding index is taken from the custom 'pad' token stored in the tokenizer config.
pad_tok = tkz_cfg.custom_tokens['pad'].ind
dec_with_vocab_decoder = True

# Keyword arguments that both factory functions receive unchanged.
shared_cfg_kwargs = dict(
    n_vocab=n_vocab,
    inp_len=seq_len,
    d_word_wec=d_word_wec,
    dropout_rate=dropout_rate,
    n_levels=n_levels,
    enc_n_layers=enc_n_layers,
    n_heads=n_heads,
    d_model=d_model,
    d_inner=d_inner,
    enc_with_graph_mat=enc_with_graph_mat,
    enc_with_emb_mat=enc_with_emb_mat,
    dec_n_layers=dec_n_layers,
    pad_idx=pad_tok,
)

# The encoder-decoder additionally takes the vocab-decoder switch.
encdec_cfg_01 = create_mllm_encdec_cfg(**shared_cfg_kwargs, with_vocab_decoder=dec_with_vocab_decoder)
ranker_cfg_01 = create_mllm_ranker_cfg(**shared_cfg_kwargs)

print(encdec_cfg_01)
print(ranker_cfg_01)

vocab_encoder=VocabEncoderCfg(n_vocab=50271, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False), EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=True)] decoders=[EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0), EmbDecoderCfg(d_emb=256, n_layers=2, n_heads=8, d_hid=1024, seq_len=100, dp_rate=0.0)] with_vocab_decoder=True
vocab_encoder=VocabEncoderCfg(n_vocab=50271, d_word_vec=256, d_model=256, pad_idx=50267, inp_len=100, dropout_rate=0.0) encoders=[EncoderCfg(n_layers=2, n_heads=8, d_k=32, d_v=32, d_model=256, d_inner=1024, pad_idx=50267, with_graph_mat=False, inp_len=100, dropout_rate=0.0, with_emb_mat=False), EncoderCfg(n_layers=2, n_heads=8, d

In [40]:
encdec_cfg_01_fpath = cfg_dpath / 'encdec_model_cfg_01.yaml'
ranker_cfg_01_fpath = cfg_dpath / 'ranker_model_cfg_01.yaml'
# With overwrite=False existing files are kept and only compared with the configs built above.
overwrite = False
# overwrite = True  # uncomment to regenerate both YAML files
save_config_to_yaml(encdec_cfg_01, encdec_cfg_01_fpath, overwrite)
save_config_to_yaml(ranker_cfg_01, ranker_cfg_01_fpath, overwrite)

File /home/misha/prog/mllm/mllm/config/cfg/encdec_model_cfg_01.yaml already exists. Equal: True
File /home/misha/prog/mllm/mllm/config/cfg/ranker_model_cfg_01.yaml already exists. Equal: True


### Encoder Decoder Hourglass model

In [5]:
# Rebuild the tokenizer from the saved tokenizer config for the hourglass configs below
# (its length and pad_token_id are passed to the config factories).
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)
print(tkz.pad_token_id)
print(tkz)

50267
GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=10000, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|doc_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("<|doc_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("<|doc_id_begin|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("<|doc_id_end|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("<|doc_offset_begin|>", rstrip=False, lstrip=False, single_word=

#### EncdecHg d_model = 256, d_inner = 1024

In [6]:
# Hourglass encoder-decoder config, d_model=256 variant.
cfg_encdec_hg = create_encdec_hg_cfg(
    # vocabulary size and padding index come from the tokenizer
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    # model dimensions
    d_model=256,
    n_heads=8,
    d_inner=1024,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
pprint(cfg_encdec_hg.dict())

{'dec_pyr': {'d_inner': 1024,
             'd_k': 32,
             'd_model': 256,
             'd_v': 32,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 8,
             'n_layers': 8,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 1024,
             'd_k': 32,
             'd_model': 256,
             'd_v': 32,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 8,
             'n_layers': 8,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 256,
                               'd_word_vec': 256,
                               'dropout_rate': 0.0,
                               'inp_len': 256,
                               'n_vocab': 50271,
                               'pad_idx': 50267}}}


In [7]:
model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_01.yaml'
# NOTE(review): this message is printed even when nothing is written — overwrite is left at
# its default (False), so an existing file is only compared with the in-memory config.
print(f'Save {model_cfg_fpath}')
save_config_to_yaml(cfg_encdec_hg, model_cfg_fpath)

File /home/misha/prog/mllm/mllm/config/cfg/encdec_hg_cfg_01.yaml already exists. Equal: True


In [8]:
# Round-trip check: parse the saved YAML back into an EncdecHgCfg and inspect the
# encoder's reduct_type field as it comes back from the file.
cfg = parse_yaml_file_as(EncdecHgCfg, model_cfg_fpath)
cfg.enc_pyr.reduct_type

<HgReductType.Matmul: 'matmul'>

#### EncdecHg d_model = 512, d_inner = 2048

In [9]:
# Hourglass encoder-decoder config, d_model=512 variant (shorter input: inp_len=128).
cfg_encdec_hg = create_encdec_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=512,
    n_heads=8,
    d_inner=2048,
    inp_len=128,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
pprint(cfg_encdec_hg.dict())

# overwrite is not passed, so an already existing file is kept and only compared.
model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_02.yaml'
print(f'Save {model_cfg_fpath}')
save_config_to_yaml(cfg_encdec_hg, model_cfg_fpath)

{'dec_pyr': {'d_inner': 2048,
             'd_k': 64,
             'd_model': 512,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 128,
             'n_heads': 8,
             'n_layers': 7,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 2048,
             'd_k': 64,
             'd_model': 512,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 128,
             'n_heads': 8,
             'n_layers': 7,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 512,
                               'd_word_vec': 512,
                               'dropout_rate': 0.0,
                               'inp_len': 128,
                               'n_vocab': 50271,
                               'pad_idx': 50267}}}
Save /home

#### EncdecHg d_model = 768, d_inner = 3072

In [38]:
# Hourglass encoder-decoder config, d_model=768 variant with 12 attention heads.
cfg_encdec_hg = create_encdec_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=768,
    n_heads=12,
    d_inner=3072,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
)
pprint(cfg_encdec_hg.dict())

model_cfg_fpath = cfg_dpath / 'encdec_hg_cfg_03.yaml'
print(f'Save {model_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep an existing file and
# only compare it with the config built above.
overwrite = True
save_config_to_yaml(cfg_encdec_hg, model_cfg_fpath, overwrite=overwrite)

{'dec_pyr': {'d_inner': 3072,
             'd_k': 64,
             'd_model': 768,
             'd_v': 64,
             'dropout_rate': 0.0,
             'enhance_type': <HgEnhanceType.Matmul: 'matmul'>,
             'inp_len': 256,
             'n_heads': 12,
             'n_layers': 8,
             'n_similar_layers': 1,
             'n_vocab': 50271,
             'step': 2},
 'enc_pyr': {'d_inner': 3072,
             'd_k': 64,
             'd_model': 768,
             'd_v': 64,
             'dropout_rate': 0.0,
             'inp_len': 256,
             'n_heads': 12,
             'n_layers': 8,
             'n_similar_layers': 1,
             'pad_idx': 50267,
             'reduct_type': <HgReductType.Matmul: 'matmul'>,
             'step': 2,
             'vocab_encoder': {'d_model': 768,
                               'd_word_vec': 768,
                               'dropout_rate': 0.0,
                               'inp_len': 256,
                               'n_vocab': 502

### Ranker Hourglass model

#### RankerHg d_model = 256, d_inner = 1024

In [6]:
# Rebuild the tokenizer from the saved tokenizer config; the ranker hourglass configs
# below read its length and pad_token_id.
tkz_cfg_fpath = cfg_dpath / 'tokenizer_cfg_01.yaml'
tkz_cfg = parse_yaml_file_as(TokenizerCfg, tkz_cfg_fpath)
tkz = tokenizer_from_config(tkz_cfg)

In [13]:
# Ranker hourglass config, d_model=256 variant.
d_model = 256
ranker_hg_cfg = create_ranker_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=d_model,
    n_heads=8,
    d_inner=1024,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
    pos_enc_type=PosEncType.Emb,
    # the MLP layers spec is passed as a string; here it holds just the model width
    dec_mlp_layers=str(d_model),
    temperature=0,
)

ranker_hg_cfg_fpath = cfg_dpath / 'ranker_hg_cfg_01.yaml'
print(f'Save {ranker_hg_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(ranker_hg_cfg, ranker_hg_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_01.yaml


#### RankerHg d_model = 512, d_inner = 2048

In [14]:
# Ranker hourglass config, d_model=512 variant.
d_model = 512
ranker_hg_cfg = create_ranker_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=d_model,
    n_heads=8,
    d_inner=2048,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
    pos_enc_type=PosEncType.Emb,
    # the MLP layers spec is passed as a string; here it holds just the model width
    dec_mlp_layers=str(d_model),
    temperature=0,
)

ranker_hg_cfg_fpath = cfg_dpath / 'ranker_hg_cfg_02.yaml'
print(f'Save {ranker_hg_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(ranker_hg_cfg, ranker_hg_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_02.yaml


#### RankerHg d_model = 768, d_inner = 3072

In [15]:
# Ranker hourglass config, d_model=768 variant with 12 attention heads.
d_model = 768
ranker_hg_cfg = create_ranker_hg_cfg(
    n_vocab=len(tkz),
    pad_idx=tkz.pad_token_id,
    d_model=d_model,
    n_heads=12,
    d_inner=3072,
    inp_len=256,
    step=2,
    dropout_rate=0.0,
    n_similar_layers=1,
    reduct_type=HgReductType.Matmul,
    pos_enc_type=PosEncType.Emb,
    # the MLP layers spec is passed as a string; here it holds just the model width
    dec_mlp_layers=str(d_model),
    temperature=0,
)

ranker_hg_cfg_fpath = cfg_dpath / 'ranker_hg_cfg_03.yaml'
print(f'Save {ranker_hg_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(ranker_hg_cfg, ranker_hg_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_hg_cfg_03.yaml


### Encoder Decoder Bert model
#### EncdecBert bert-base-uncased

In [6]:
# Encoder-decoder config built on top of the pretrained 'bert-base-uncased' model.
encdec_bert_cfg = create_encdec_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    emb_type=BertEmbType.Cls,
    inp_len=128,
    # decoder settings
    dec_enhance_type=HgEnhanceType.Matmul,
    dec_n_layers=7,
    dec_n_similar_layers=1,
    dec_dropout_rate=0.0,
    dec_temperature=0,
)

encdec_bert_cfg_fpath = cfg_dpath / 'encdec_bert_cfg_01.yaml'
print(f'Save {encdec_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(encdec_bert_cfg, encdec_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/encdec_bert_cfg_01.yaml


### Ranker Bert model
#### RankerBert bert-base-uncased

In [7]:
# Ranker config built on top of the pretrained 'bert-base-uncased' model.
ranker_bert_cfg = create_ranker_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    emb_type=BertEmbType.Cls,
    inp_len=128,
    dec_mlp_layers='',
)

ranker_bert_cfg_fpath = cfg_dpath / 'ranker_bert_cfg_01.yaml'
print(f'Save {ranker_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(ranker_bert_cfg, ranker_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/ranker_bert_cfg_01.yaml


### Encoder Decoder Ranker Bert model
#### Add custom special token (experimental)

In [16]:
# Load the stock BERT tokenizer for the special-token experiment below.
# NOTE: this rebinds the global `tkz`, which earlier cells bound to the GPT-2 based tokenizer.
pretrained_model_name = 'bert-base-uncased'
tkz = AutoTokenizer.from_pretrained(pretrained_model_name)
tkz


BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [21]:
# Register '[RNK]' as an additional special token (mutates `tkz` in place) and display
# the resulting tokenizer length together with the tokenizer itself.
tkz.add_special_tokens({
    'additional_special_tokens': ['[RNK]']
})
len(tkz), tkz

(30523,
 BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[RNK]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	30522: AddedToken("[RNK]", rstrip=False, lstrip=False, single_word=False,

In [20]:
# Load the pretrained BERT weights in float32 and print the model config
# (before the embedding matrix is resized in the next cell).
model = BertModel.from_pretrained(pretrained_model_name, torch_dtype=torch.float32)
print(model.config)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [23]:
# Resize the token embeddings to the tokenizer length, which now includes '[RNK]';
# the displayed config reports vocab_size 30523 afterwards.
model.resize_token_embeddings(len(tkz))
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30523
}

In [25]:
# Show the registered additional special tokens and their ids.
tkz.additional_special_tokens, tkz.additional_special_tokens_ids

(['[RNK]'], [30522])

In [31]:
# Encode the same text with and without a leading '[RNK]' to check that the new token
# maps to a single id instead of being split into word pieces.
print(tkz('Michael Jackson')['input_ids'], tkz('[RNK] Michael Jackson')['input_ids'])

[101, 2745, 4027, 102] [101, 30522, 2745, 4027, 102]


#### EncdecRankBert bert-base-uncased

In [34]:
# Combined encoder / decoder / ranker config built on top of 'bert-base-uncased'.
encdecrnk_bert_cfg = create_encdecrnk_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    emb_type=BertEmbType.Cls,
    inp_len=128,
    # pyramid decoder settings
    dec_pyr_enhance_type=HgEnhanceType.Matmul,
    dec_pyr_n_layers=7,
    dec_pyr_n_similar_layers=1,
    dec_pyr_dropout_rate=0.0,
    dec_pyr_temperature=0,
    # ranker decoder settings
    dec_rank_mlp_layers='',
)

encdecrnk_bert_cfg_fpath = cfg_dpath / 'encdecrnk_bert_cfg_01.yaml'
print(f'Save {encdecrnk_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(encdecrnk_bert_cfg, encdecrnk_bert_cfg_fpath, overwrite=overwrite)

Save /Users/misha/prog/mllm/mllm/config/cfg/encdecrnk_bert_cfg_01.yaml


In [37]:
import re

# Accepts strings made of one or more uppercase ASCII letters wrapped in square brackets,
# e.g. '[ABC]'; lowercase content, missing brackets and the empty '[]' are rejected.
SPEC_TOK_PAT = re.compile(r'^\[[A-Z]+]$')

candidates = ['tok', 'TOK', '[T', 'TT]', '[tok]', '[ABC]', '[A]', '[]']
for t in candidates:
    # match() returns a (always truthy) Match object on success and None otherwise
    matched = bool(SPEC_TOK_PAT.match(t))
    print(t, matched)

tok False
TOK False
[T False
TT] False
[tok] False
[ABC] True
[A] True
[] False


### Encoder Mixed Bert model
#### EncmixBert bert-base-uncased

In [5]:
# Encoder-mixed config on top of 'bert-base-uncased', without token types for embeddings.
encmix_bert_cfg = create_encmix_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    inp_len=128,
    out_embs_type=EncmixOutEmbsType.Inp,
    token_types_for_embs=False,
)

encmix_bert_cfg_fpath = cfg_dpath / 'encmix_bert_cfg_01_base.yaml'
print(f'Save {encmix_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(encmix_bert_cfg, encmix_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/encmix_bert_cfg_01_base.yaml


#### EncmixBert bert-large-uncased

In [6]:
# Encoder-mixed config on top of 'bert-large-uncased', without token types for embeddings.
encmix_bert_cfg = create_encmix_bert_cfg(
    pretrained_model_name='bert-large-uncased',
    tokenizer_name='',
    inp_len=128,
    out_embs_type=EncmixOutEmbsType.Inp,
    token_types_for_embs=False,
)

encmix_bert_cfg_fpath = cfg_dpath / 'encmix_bert_cfg_02_large.yaml'
print(f'Save {encmix_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(encmix_bert_cfg, encmix_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/encmix_bert_cfg_02_large.yaml


#### EncmixBert bert-base-uncased with 3 token types

In [7]:
# Encoder-mixed config on top of 'bert-base-uncased' with token types for embeddings enabled.
encmix_bert_cfg = create_encmix_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    inp_len=128,
    out_embs_type=EncmixOutEmbsType.Inp,
    token_types_for_embs=True,
)

encmix_bert_cfg_fpath = cfg_dpath / 'encmix_bert_cfg_03_base_tte.yaml'
print(f'Save {encmix_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(encmix_bert_cfg, encmix_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/encmix_bert_cfg_03_base_tte.yaml


In [9]:
# Load a fresh 'bert-base-uncased' tokenizer (rebinds `tkz`) for the tokenization
# experiments in the following cells.
tkz = AutoTokenizer.from_pretrained('bert-base-uncased')
tkz

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [11]:
# Batch-encode two texts of different length with padding, requesting the attention mask,
# special-tokens mask and token type ids so they can be inspected side by side.
tkz(['hey hey how are you', 'meeting'], padding=True, return_attention_mask=True, return_special_tokens_mask=True, return_token_type_ids=True)

{'input_ids': [[101, 4931, 4931, 2129, 2024, 2017, 102], [101, 3116, 102, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0]], 'special_tokens_mask': [[1, 0, 0, 0, 0, 0, 1], [1, 0, 1, 1, 1, 1, 1]]}

In [12]:
# Encode a pair of sequences in one call and decode the ids back to text to see
# how the two sequences are joined with special tokens.
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"
encoded_dict = tkz(sequence_a, sequence_b)
decoded = tkz.decode(encoded_dict["input_ids"])
print(decoded)

[CLS] huggingface is based in nyc [SEP] where is huggingface based? [SEP]


In [15]:
# Print the raw ids next to the token type ids of the encoded pair; in the output the
# type ids are 0 up to and including the first [SEP] (id 102) and 1 afterwards.
print(encoded_dict['input_ids'])
print(encoded_dict['token_type_ids'])

[101, 17662, 12172, 2003, 2241, 1999, 16392, 102, 2073, 2003, 17662, 12172, 2241, 1029, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


### EncoderDecoder Mixed Bert model
#### GenmixBert bert-base-uncased

In [6]:
# Generative mixed config on top of 'bert-base-uncased'.
genmix_bert_cfg = create_genmix_bert_cfg(
    pretrained_model_name='bert-base-uncased',
    tokenizer_name='',
    inp_len=128,
    max_inp_chunks=10,
    max_out_toks=128,
)

genmix_bert_cfg_fpath = cfg_dpath / 'genmix_bert_cfg_01_base_tte.yaml'
print(f'Save {genmix_bert_cfg_fpath}')
# True rewrites the YAML file on every run; set to False to keep and compare an existing file.
overwrite = True
save_config_to_yaml(genmix_bert_cfg, genmix_bert_cfg_fpath, overwrite=overwrite)

Save /home/misha/prog/mllm/mllm/config/cfg/genmix_bert_cfg_01_base_tte.yaml


### EncoderDecoder Attention2 Bert model
#### EncAt2Dec config

In [19]:
import dataclasses
import yaml

from mllm.model.encoder_at2_decoder_bert import EncoderAt2DecoderConfig
from mllm.model.encoder_at2_decoder_cont import EncAt2DecCfg


In [8]:
# Encoder / attention-2 / decoder config on top of 'bert-base-uncased', with all three
# at2 switches turned on.
a2_cfg = EncAt2DecCfg.create(
    inp_len=128,
    pretrained_model_name='bert-base-uncased',
    max_inp_chunks=10,
    max_out_toks=50,
    enc_at2_enabled=True,
    dec_at2_enabled=True,
    last_dec_to_all_enc_at2_enabled=True,
)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


You are using a model of type bert to instantiate a model of type bert-at2-generation. This is not supported for all configurations of models and can yield errors.
You are using a model of type bert to instantiate a model of type bert-at2-generation. This is not supported for all configurations of models and can yield errors.
Some weights of BertGenerationAt2Decoder were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.

In [29]:
# Convert the config dataclass into a plain dict, then replace the nested 'bert' entry
# with its own to_dict() form so the whole structure can be dumped to YAML below.
a2_cfg_dict = dataclasses.asdict(a2_cfg)
a2_cfg_dict['bert'] = a2_cfg_dict['bert'].to_dict()

In [30]:
a2_cfg_fpath = cfg_dpath / 'a2_cfg_01_base.yaml'
print(f'Save {a2_cfg_fpath}')
# Write as UTF-8 explicitly: allow_unicode=True lets the YAML dumper emit non-ASCII
# characters verbatim, while open() without `encoding` uses a platform-dependent default.
with open(a2_cfg_fpath, 'w', encoding='utf-8') as f:
    yaml.dump(a2_cfg_dict, f, default_flow_style=False, allow_unicode=True)

Save /home/misha/prog/mllm/mllm/config/cfg/a2_cfg_01_base.yaml


In [33]:
# Read the YAML back as UTF-8 (open() without `encoding` would use a platform-dependent
# default); only the parsing needs the open file handle.
with open(a2_cfg_fpath, 'r', encoding='utf-8') as f:
    a2_cfg_dict_2 = yaml.safe_load(f)
# Rebuild the nested BERT config object from its plain-dict form after the file is closed.
a2_cfg_dict_2['bert'] = EncoderAt2DecoderConfig.from_dict(a2_cfg_dict_2['bert'])

In [32]:
# Round-trip check: the BERT sub-config reloaded from YAML should serialize to the same
# dict as the one held by the original config object.
a2_cfg.bert.to_dict() == a2_cfg_dict_2['bert'].to_dict()

True