# Transformer Model Training Notebook

This notebook demonstrates how to train a transformer model. We will go through the steps of setting up the environment, loading data, preprocessing, training the model, and evaluating its performance.

However, Architecture is available in the github repo: **eryash15/transformer_pytorch**

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

from torchmetrics.text import BLEUScore
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

2025-04-18 12:07:12.886152: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import nbimporter
from model import build_transformer, count_parameters # type: ignore
from config import get_config, get_weights_file_path, latest_weights_file_path

In [3]:
# Define the Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using Device:", device)

Using Device: cpu


# Tokenizer

In [4]:
def get_all_sentences(ds, lang):
    """Generates the sentence"""
    for item in ds:
        yield item['translation'][lang]

In [5]:
# Example
def test_get_all_sentences():
    input_dataset = [
        {'id': 1, 'translation': {'en': 'Hello', 'fr': 'Bonjour'}},
        {'id': 2, 'translation': {'en': 'Goodbye', 'fr': 'Au revoir'}}
    ]    
    expected_output = ['Bonjour', 'Au revoir'] 

    result_output = []

    lang = 'fr'
    for sentence in get_all_sentences(input_dataset, lang):
        result_output.append(sentence)
    
    assert expected_output == result_output 
    print("test_get_all_sentences: test success")
    
test_get_all_sentences()

test_get_all_sentences: test success


In [6]:
def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        print(f"{tokenizer_path} does not exists.")
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"])
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        print(f"{tokenizer_path} exists.")
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer


In [7]:
# Example
def test_get_or_build_tokenizer():

    config = {
        'tokenizer_file': 'tokenizer_{0}.json'
    }

    dataset = [
        {'id': 1, 'translation': {'en': 'Hello world', 'fr': 'Bonjour tout le monde'}},
        {'id': 2, 'translation': {'en': 'How are you?', 'fr': 'Comment ça va ?'}}
    ]

    lang = 'fr'
    tokenizer_not_exist = get_or_build_tokenizer(config, dataset, lang)
    tokenizer = get_or_build_tokenizer(config, dataset, lang)
    
    assert tokenizer.encode("Bonjour tout le monde yash").tokens == ['Bonjour', 'tout', 'le', 'monde', '[UNK]']
    assert tokenizer.encode("Bonjour tout le monde yash").ids == [5, 9, 7, 8, 0]
    
    print("test_get_or_build_tokenizer: test success")
    
test_get_or_build_tokenizer()

tokenizer_fr.json exists.
tokenizer_fr.json exists.
test_get_or_build_tokenizer: test success


## Dataset

In [10]:
class BilingualDataset(Dataset):
    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # since all these token for both lang remain same we can use tokenizer_tgt or tokenizer_src
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    @staticmethod
    def causal_mask(size):
        mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int) # create right upper diagonal matrix
        return mask == 0

    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, index):
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]
        
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2 # SOS and EOS
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # no EOS or SOS

        if enc_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')
        
        # add SOS and EOS to the source text
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype = torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype = torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype = torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        return {
            "encoder_input": encoder_input, #seq_len
            "decoder_input": decoder_input, #seq_len
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & BilingualDataset.causal_mask(decoder_input.size(0)), # (1,seq_len) & (1, seq_len, seq_len)
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text
        }

In [11]:
# Example 
def test_BilingualDataset():
    dataset = [
        {'id': 1, 'translation': {'en_test': 'Hello world', 'fr_test': 'Bonjour tout le monde'}},
        {'id': 2, 'translation': {'en_test': 'How are you?', 'fr_test': 'Comment ça va ?'}}
    ]

    # Example config
    config = {
        'tokenizer_file': 'tokenizer_{0}.json'
    }

    tokenizer_src = get_or_build_tokenizer(config, dataset, "en_test")
    tokenizer_tgt = get_or_build_tokenizer(config, dataset, "fr_test")

    # Example usage
    lang = 'fr_test'
    bilingual_dataset = BilingualDataset(dataset, tokenizer_src, tokenizer_tgt, 'en_test', lang, 10)

    # Create a DataLoader for batching the data
    dataloader = DataLoader(bilingual_dataset, batch_size=1, shuffle=True)
    
    batch_num = 0
    # Iterate over the DataLoader to print batches
    for batch in dataloader:
        print("Encoder Input:", batch["encoder_input"].shape)
        print("Decoder Input:", batch["decoder_input"].shape)
        print("Encoder Mask:", batch["encoder_mask"].shape)
        print("Decoder Mask:", batch["decoder_mask"].shape)
        print("Label:", batch["label"].shape)
        print("Source Text:", batch["src_text"])
        print("Target Text:", batch["tgt_text"])
        print("---")
        batch_num += 1
    
    assert batch_num == len(dataset)//dataloader.batch_size
    
test_BilingualDataset()

tokenizer_en_test.json exists.
tokenizer_fr_test.json exists.
Encoder Input: torch.Size([1, 10])
Decoder Input: torch.Size([1, 10])
Encoder Mask: torch.Size([1, 1, 1, 10])
Decoder Mask: torch.Size([1, 1, 10, 10])
Label: torch.Size([1, 10])
Source Text: ['How are you?']
Target Text: ['Comment ça va ?']
---
Encoder Input: torch.Size([1, 10])
Decoder Input: torch.Size([1, 10])
Encoder Mask: torch.Size([1, 1, 1, 10])
Decoder Mask: torch.Size([1, 1, 10, 10])
Label: torch.Size([1, 10])
Source Text: ['Hello world']
Target Text: ['Bonjour tout le monde']
---


In [22]:
def get_ds(config):
    ds_raw = load_dataset('opus_books', f'{config["lang_src"]}-{config["lang_tgt"]}', split='train')
    # build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])
    # keep 90% for training and 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(
        train_ds_raw, 
        tokenizer_src, 
        tokenizer_tgt, 
        config['lang_src'], 
        config['lang_tgt'], 
        config['seq_len']
    )
    val_ds = BilingualDataset(
        val_ds_raw, 
        tokenizer_src, 
        tokenizer_tgt, 
        config['lang_src'], 
        config['lang_tgt'], 
        config['seq_len']
    )

    max_len_src = 0
    max_len_tgt = 0
    
    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_src']]).ids
        
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f"Max length for source senstences: {max_len_src}")
    print(f"Max length for target senstences: {max_len_tgt}")

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
    

In [23]:
def test_get_ds():
    config = {
        'lang_src': 'en',
        'lang_tgt': 'it',
        'seq_len': 500,
        'batch_size': 32,
        'tokenizer_file': 'tokenizer_{0}.json'
    }

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    # Print some information about the datasets and tokenizers
    print(f"Train Dataset Size: {len(train_dataloader.dataset)}")
    print(f"Validation Dataset Size: {len(val_dataloader.dataset)}")
    print(f"Source Language Tokenizer Vocabulary Size: {len(tokenizer_src.get_vocab())}")
    print(f"Target Language Tokenizer Vocabulary Size: {len(tokenizer_tgt.get_vocab())}")

    # Iterate over a few batches from the dataloaders to check the data
    for batch in train_dataloader:
        print("Encoder Input Shape:", batch["encoder_input"].shape)
        print("Decoder Input Shape:", batch["decoder_input"].shape)
        print("Encoder Mask Shape:", batch["encoder_mask"].shape)
        print("Decoder Mask Shape:", batch["decoder_mask"].shape)
        print("Label Shape:", batch["label"].shape)
        print("---")
        break  # Stop after printing the first batch
        
    assert len(tokenizer_src.get_vocab()) > 1000 and len(tokenizer_tgt.get_vocab()) > 1000
        
test_get_ds()   

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

tokenizer_en.json does not exists.
tokenizer_it.json does not exists.
Max length for source senstences: 309
Max length for target senstences: 309
Train Dataset Size: 29098
Validation Dataset Size: 3234
Source Language Tokenizer Vocabulary Size: 25138
Target Language Tokenizer Vocabulary Size: 30000
Encoder Input Shape: torch.Size([32, 500])
Decoder Input Shape: torch.Size([32, 500])
Encoder Mask Shape: torch.Size([32, 1, 1, 500])
Decoder Mask Shape: torch.Size([32, 1, 500, 500])
Label Shape: torch.Size([32, 500])
---


In [24]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
    return model

In [25]:
def test_get_model():
    config  = {
        'seq_len': 500,
        'd_model': 512,
    }
    vocab_src_len = 512
    vocab_tgt_len = 512

    model = get_model(config, vocab_src_len, vocab_tgt_len)
    model
    
test_get_model()

  ]


In [26]:
def causal_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len):
    """
    Performs greedy decoding for sequence generation.

    Args:
        model: The sequence-to-sequence model to use for decoding.
        source: The input sequence to encode.
        source_mask: The mask for the source sequence.
        tokenizer_src: The tokenizer for the source language.
        tokenizer_tgt: The tokenizer for the target language.
        max_len: The maximum length for the generated sequence.
        device: The device to perform the computations on.

    Returns:
        torch.Tensor: The generated sequence.
    """
    # Get the special tokens' IDs
    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    eos_idx = tokenizer_tgt.token_to_id("[EOS]")

    # Encode the source sequence
    encoder_output = model.encode(source, source_mask)

    # Initialize the decoder input with the start-of-sequence token
    decoder_input = torch.tensor([[sos_idx]], dtype=source_mask.dtype, device=device)

    while True:
        # Check if the maximum length is reached
        if decoder_input.size(1) >= max_len:
            break

        # Create a causal mask for the decoder input
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # Decode the current sequence
        decoder_output = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Project the decoder output to the vocabulary size and get the next token
        logits = model.project(decoder_output[:, -1])
        next_word = torch.argmax(logits, dim=1).item()

        # Append the next token to the decoder input
        next_word_tensor = torch.tensor([[next_word]], dtype=source.dtype, device=device)
        decoder_input = torch.cat([decoder_input, next_word_tensor], dim=1)

        # Stop if the end-of-sequence token is generated
        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)


In [27]:
# Example to test greedy_decode
class DummySeq2SeqModel:
    def encode(self, source, source_mask):
        # Fake encoding, in practice use your actual model
        return torch.randn(1, source.size(1), 768)
    
    def decode(self, encoder_output, source_mask, decoder_input, decoder_mask):
        # Fake decoding, in practice use your actual model
        return torch.randn(1, decoder_input.size(1), 768)
    
    def project(self, decoder_output):
        # Fake projection, in practice use your actual model
        vocab_size = 30522  # Assume BERT tokenizer vocab size
        return torch.randn(decoder_output.size(0), vocab_size)

config = get_config()

# Load pre-trained tokenizers
tokenizer_src = get_or_build_tokenizer(config, None, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, None, config['lang_tgt'])
    
# Example input sequence
source_text = "Hello, how are you?"
source = torch.tensor(tokenizer_src.encode(source_text).ids).unsqueeze(0)

source_mask = torch.ones_like(source).to(device)

# Instantiate the dummy model
model = DummySeq2SeqModel()

# Define other parameters
max_len = 20

# Perform greedy decoding
output_sequence = greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len)

# Convert the output sequence to text
output_text = tokenizer_tgt.decode(output_sequence.tolist())
# tokenizer_tgt.
# Print the result
print("Input Text:", source_text)
print("Output Text:", output_text)

tokenizer_en.json exists.
tokenizer_it.json exists.
Input Text: Hello, how are you?
Output Text: diportai ravvisassi coprissero cagin disseminate Passammo signora situati contrariamente bacchetta involto tintinnio mormorando chiudevano affermando portico generato Potemmo abbastanza


In [28]:
def bleu_loss(model, validation_ds, tokenizer_src, tokenizer_tgt, num_examples=2, max_len=20):
    """
    Computes validation loss using a given model and dataset, and calculates BLEU score.

    Args:
        model (torch.nn.Module): The trained model to evaluate.
        validation_ds (torch.utils.data.Dataset): Dataset for validation.
        tokenizer_src (Tokenizer): Tokenizer for source language.
        tokenizer_tgt (Tokenizer): Tokenizer for target language.
        num_examples (int, optional): Number of examples to evaluate. Defaults to 2.
        max_len (int, optional): Maximum length for decoding. Defaults to 50.
        device (str, optional): Device for tensor computations ('cuda' or 'cpu'). Defaults to 'cuda'.

    Returns:
        float: BLEU score for the predicted outputs compared to expected outputs.
    """
    try:
        model.eval()  # Set model to evaluation mode
    except:
        pass
        
    source_texts = []
    expected = []
    predicted = []

    with torch.no_grad():
        for count, batch in enumerate(validation_ds):
            if count == num_examples:
                break

            encoder_input = batch["encoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)

            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            # Generate predictions using greedy decoding
            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.tolist())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

    # Compute the BLEU metric
    metric = BLEUScore()
    bleu = metric(predicted, expected)

#     print(source_texts, expected, predicted)
    
    return bleu


In [29]:
# Example to test greedy_decode
class DummySeq2SeqModel:
    def encode(self, source, source_mask):
        # Fake encoding, in practice use your actual model
        return torch.randn(1, source.size(1), 768)
    
    def decode(self, encoder_output, source_mask, decoder_input, decoder_mask):
        # Fake decoding, in practice use your actual model
        return torch.randn(1, decoder_input.size(1), 768)
    
    def project(self, decoder_output):
        # Fake projection, in practice use your actual model
        vocab_size = 30522  # Assume BERT tokenizer vocab size
        return torch.randn(decoder_output.size(0), vocab_size)

config = get_config()

train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

# Instantiate the dummy model
model = DummySeq2SeqModel()

# Define other parameters
max_len = 20

# Perform greedy decoding
loss = bleu_loss(model, val_dataloader, tokenizer_src, tokenizer_tgt, num_examples=2, max_len=5)

print("BLEUScore:", float(loss))

tokenizer_en.json exists.
tokenizer_it.json exists.
Max length for source senstences: 309
Max length for target senstences: 309
BLEUScore: 0.0


In [31]:
def custom_trainer(config):

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    model = get_model(
        config, 
        tokenizer_src.get_vocab_size(), 
        tokenizer_tgt.get_vocab_size()
    ).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None

    if model_filename:
        print(f"Preloading model: {model_filename}")
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)    
        
    # Example tensors
    # logits = torch.tensor([[2.0, 0.5, -0.5], [0.1, 1.0, -1.0], [0.2, 0.3, 0.8]], dtype=torch.float32)
    # targets = torch.tensor([0, 1, 2], dtype=torch.int64)
    # loss = loss_fn(logits, targets)
    
    for epoch in range(initial_epoch, config['num_epochs']):

        torch.cuda.empty_cache()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

        for i, batch in tqdm(enumerate(batch_iterator)):
                
            model.train()
            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            print('h')
#             train_bleu_loss = bleu_loss(model, train_dataloader, tokenizer_src, tokenizer_tgt, num_examples=100, max_len=config['seq_len'])
            val_bleu_loss = bleu_loss(model, val_dataloader, tokenizer_src, tokenizer_tgt, num_examples=100, max_len=config['seq_len'])
            print('2')
            # Log the loss
            print(f'epoch_{epoch}__global_step_{global_step}__batch_no_{i}__train Cross Entropy loss_{loss.item()}__train BLEUloss_{float(val_bleu_loss)}__val BLEUloss_{float(val_bleu_loss)}')

            global_step += 1

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)


In [32]:
config = get_config()
custom_trainer(config)

tokenizer_en.json exists.
tokenizer_it.json exists.
Max length for source senstences: 309
Max length for target senstences: 309
No model to preload, starting from scratch


Processing Epoch 00:   0%|          | 0/4850 [00:00<?, ?it/s]

0it [00:00, ?it/s]

IndexError: index out of range in self

In [None]:
epoch = 0
