The purpose of this notebook is to 

In [1]:
! export PYTHONPATH=.

In [2]:
from huggingface_upload.scratch_transformer_model.configuration_scratch_transformer import ScratchTransformerConfig
from huggingface_upload.scratch_transformer_model.model_scratch_transformer import ScratchTransformerModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from train import get_ds, get_config
from config import get_weights_file_path
from tokenizers import Tokenizer

cfg = get_config()
cfg['model_folder'] = 'weights'
cfg['tokenizer_file'] = 'vocab/tokenizer{0}.json'
cfg['preload'] = '29'

# train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(cfg)
tokenizer_src = Tokenizer.from_file(str('vocab/tokenizeren.json'))
tokenizer_tgt = Tokenizer.from_file(str('vocab/tokenizerit.json'))

In [17]:
tokenizer_tgt.get_vocab_size()

22463

In [4]:
scratch_transformer_config = ScratchTransformerConfig(
                                                    src_vocab_size=tokenizer_src.get_vocab_size(), 
                                                    tgt_vocab_size=tokenizer_tgt.get_vocab_size(), 
                                                    )
scratch_transformer = ScratchTransformerModel(scratch_transformer_config)

In [5]:
scratch_transformer.config.decoder_start_token_id = tokenizer_tgt.token_to_id("[SOS]")
scratch_transformer.config.pad_token_id = tokenizer_tgt.token_to_id("[PAD]")
scratch_transformer.config.eos_token_id = tokenizer_tgt.token_to_id("[EOS]")

In [6]:
import torch

sentence = "Translate this sentence to italian for me."

source = tokenizer_src.encode(sentence)
source = torch.cat([
    torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64), 
    torch.tensor(source.ids, dtype=torch.int64),
    torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
    torch.tensor([tokenizer_src.token_to_id('[PAD]')] * (scratch_transformer.config.seq_len - len(source.ids) - 2), dtype=torch.int64)
], dim=0)

# Mask out all of the padding tokens
source_mask = (source != tokenizer_src.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int()


In [7]:
results = scratch_transformer(input_ids=source, attention_mask=source_mask)
results

Seq2SeqLMOutput(loss=None, logits=tensor([[    2, 16616, 18779, 16119, 18675,  7345, 12435, 14926,  4444, 15145,
         19756, 20220,  2067,  2067, 20220,  4444,  9227, 21822, 18675, 21822,
          9227,  2067,  2067,  2067, 20220,  2067,  2067, 20220,  2067, 21112,
          8727, 20220,  8727, 16616, 19562,  1171,  2067, 21112, 21112, 21112,
         21112, 21112, 21112, 21112, 21112, 20220, 12908, 19158, 21112, 20220,
          5186, 20237, 20220, 18779, 12435, 21112,  2338,  9227,  2067, 12435,
         18779,  2338, 20220, 11664,  2338, 20220, 20220, 16119, 16119,  2338,
          9227,  9227,  2067, 13641,  2338,  9227,  9227, 17586,  2338,  1171,
          7831,  9227,  1171,  2338,  9227,  9227,  9227,  2067, 21112,  2338,
          9227, 20237, 19756,  9227,  4726,  1095,  2338,  9227,  2067,  4029,
         20220,  9227, 20220,  2338,  2338, 10991, 19562,  2338,  9227, 20220,
         16616, 18779,  4029, 19562,  5186,  9227, 19562, 21112,   577, 18779,
         15145,   

In [8]:
tokenizer_tgt.decode(results.logits[0].tolist())

'Spirito finanziario Bisognerà fabbriferrai accordato scherzava permisero focolare rammentarsi meraviglierei pegno governo governo pegno focolare Michajla staffa fabbriferrai staffa Michajla governo governo governo pegno governo governo pegno governo rivoluzionaria occorsero pegno occorsero Spirito lascierò sedia governo rivoluzionaria rivoluzionaria rivoluzionaria rivoluzionaria rivoluzionaria rivoluzionaria rivoluzionaria rivoluzionaria pegno Eccellenza imperator rivoluzionaria pegno tuttora percepiva pegno finanziario scherzava rivoluzionaria trovar Michajla governo scherzava finanziario trovar pegno golfo trovar pegno pegno Bisognerà Bisognerà trovar Michajla Michajla governo celebrare trovar Michajla Michajla calare trovar sedia predicare Michajla sedia trovar Michajla Michajla Michajla governo rivoluzionaria trovar Michajla percepiva meraviglierei Michajla italiana appunto trovar Michajla governo nascondeva pegno Michajla pegno trovar trovar apprendere lascierò trovar Michajla pe

Now try preloading the model from a weights file:

In [9]:
model_filename = get_weights_file_path(cfg, cfg['preload'])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state = torch.load(model_filename, map_location=torch.device(device))

scratch_transformer.model.load_state_dict(state['model_state_dict'])

<All keys matched successfully>

In [10]:
pretrained_logits = scratch_transformer(input_ids=source, attention_mask=source_mask)
tokenizer_tgt.decode(pretrained_logits.logits[0].tolist())

'La terminò questa frase le vostre di me .'

Now try with generating loss:

In [13]:
tgt_sentence = "Traducimi questa frase in italiano."

tgt_tokens = tokenizer_tgt.encode(tgt_sentence)
dec_num_padding_tokens = scratch_transformer.config.seq_len - len(tgt_tokens) - 1

decoder_input = torch.cat(
            [
                torch.tensor(tgt_tokens.ids, dtype=torch.int64),
                torch.tensor([tokenizer_tgt.token_to_id('[EOS]')], dtype=torch.int64),
                torch.tensor([tokenizer_tgt.token_to_id('[PAD]')] * dec_num_padding_tokens, dtype=torch.int64)
            ],
            dim=0,
        )

pretrained_results = scratch_transformer(input_ids=source, attention_mask=source_mask, decoder_input_ids=decoder_input)

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


In [15]:
print(f"TARGET: {tgt_sentence}")
print(f"PREDICTED: {tokenizer_tgt.decode(pretrained_logits.logits[0].tolist())}")
print(f"Loss: {pretrained_results.loss}")

TARGET: Traducimi questa frase in italiano.
PREDICTED: La terminò questa frase le vostre di me .
Loss: 11.37087631225586
