### Load necessary packages & define device

In [None]:
# !pip install -r requirements.txt

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm


# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Split

import random
import pandas as pd

In [2]:
# Pick the compute backend used for training: CUDA GPU > Apple MPS > CPU.
# `device` ends up as a torch.device, exactly as later cells expect.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # is_available() (not merely is_built()) guarantees the backend can actually be used.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Using device:", device)
if device.type == "cuda":
    # Query the properties of the GPU that an index-less "cuda" device resolves to.
    cuda_index = torch.cuda.current_device()
    print(f"Device name: {torch.cuda.get_device_name(cuda_index)}")
    print(f"Device memory: {torch.cuda.get_device_properties(cuda_index).total_memory / 1024 ** 3} GB")
elif device.type == "mps":
    print(f"Device name: <mps>")
else:
    print("NOTE: If you have a GPU, consider using it for training.")
    print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
    print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

Using device: cuda
Device name: NVIDIA RTX A5000
Device memory: 23.547119140625 GB


### Configuration

### Raw Data Loading, Tokenizer Construction, and Dataset Pipeline

In [None]:
# Load the peptide corpus from Hugging Face (dzjxzyd/UniRef50_len_0_50);
# replace the name with the path of your own custom dataset if needed.
# When no custom split was defined at upload time, all rows live in the 'train' split.
# Only the 'Reference sequence' column (the raw peptide strings) is kept.
ds_raw = load_dataset('dzjxzyd/UniRef50_len_0_50', split='train')['Reference sequence']

# Hold out 0.5 % of the sequences for validation.
VAL_FRACTION = 0.005
# Fixed seed so the train/validation partition (and the tokenizer trained on the
# training part) is identical on every Restart & Run All.
SPLIT_SEED = 42

val_ds_size = int(VAL_FRACTION * len(ds_raw))
train_ds_size = len(ds_raw) - val_ds_size
train_ds_raw, val_ds_raw = random_split(
    ds_raw,
    [train_ds_size, val_ds_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
def build_tokenizer(ds):
    """Train a character-level WordLevel tokenizer and save it to 'tokenizer.json'.

    Args:
        ds: Iterable of raw sequences (strings) used to collect the vocabulary.

    Returns:
        The trained `Tokenizer`.
    """
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]", "[MASK]"]

    # Word-level model; anything outside the vocabulary maps to [UNK].
    char_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

    # An empty split pattern with 'isolated' behavior is used so that each
    # character of a sequence is handled as its own token.
    char_tokenizer.pre_tokenizer = Split(pattern='', behavior='isolated')

    # Special tokens are always part of the vocabulary; other tokens need to
    # occur at least twice to be kept.
    char_tokenizer.train_from_iterator(
        ds,
        trainer=WordLevelTrainer(special_tokens=special_tokens, min_frequency=2),
    )

    # Persist the vocabulary so it can be reloaded without retraining.
    char_tokenizer.save('tokenizer.json')
    return char_tokenizer


# Train the tokenizer on the training split only.
tokenizer = build_tokenizer(train_ds_raw)

In [5]:
# Custom dataset class for peptide sequences
class PeptideDataset(Dataset):
    """Map-style dataset producing fixed-length masked-language-model examples.

    Each item is built on the fly: the raw sequence is randomly masked by
    `random_mask`, wrapped in [SOS]/[EOS], and right-padded with [PAD] up to
    `seq_len`. Because masking happens in `__getitem__`, the same index yields
    a different mask pattern every time it is read.
    """

    def __init__(self, ds, tokenizer, seq_len):
        """
        Args:
            ds: Indexable collection of raw peptide sequences (strings).
            tokenizer: Tokenizer exposing `token_to_id` (and whatever `random_mask` needs).
            seq_len (int): Fixed length of every returned tensor, including [SOS]/[EOS].

        Raises:
            ValueError: If one of the required special tokens is not in the vocabulary.
        """
        super().__init__()
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer = tokenizer

        # Resolve the special-token IDs once and fail early with a clear message.
        special_ids = {}
        for name in ("[SOS]", "[EOS]", "[PAD]", "[MASK]"):
            token_id = tokenizer.token_to_id(name)
            if token_id is None:
                raise ValueError(f"Tokenizer vocabulary is missing the special token {name}")
            special_ids[name] = token_id

        # One-element tensors, ready to be concatenated around a sequence.
        self.sos_token = torch.tensor([special_ids["[SOS]"]], dtype=torch.int64)    # Start of sequence
        self.eos_token = torch.tensor([special_ids["[EOS]"]], dtype=torch.int64)    # End of sequence
        self.pad_token = torch.tensor([special_ids["[PAD]"]], dtype=torch.int64)    # Padding
        self.mask_token = torch.tensor([special_ids["[MASK]"]], dtype=torch.int64)  # Mask (for MLM)

        # Plain int copy of the PAD id, used to build padding and the padding mask.
        self._pad_id = special_ids["[PAD]"]

    def __len__(self):
        """Number of raw sequences in the dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Build one training example.

        Returns:
            dict with
              - "encoder_input": int64 tensor (seq_len,): [SOS] + masked ids + [EOS] + [PAD]...
              - "encoder_mask":  bool tensor (seq_len,): True at [PAD] positions, False elsewhere
                                 (the key-padding-mask convention).
              - "label":         int64 tensor (seq_len,): [SOS] + original ids + [EOS] + [PAD]...

        Raises:
            ValueError: If the sequence plus [SOS]/[EOS] does not fit into `seq_len`.
        """
        seq = self.ds[idx]

        # Masked ids (model input) and the untouched ids (targets), same length.
        masked_seq_ids, origi_seq_ids = random_mask(seq, self.tokenizer)

        # Room left after the sequence and the two boundary tokens.
        num_padding_tokens = self.seq_len - len(masked_seq_ids) - 2
        if num_padding_tokens < 0:
            raise ValueError("Sequence is too long for the specified seq_len")

        # Built directly as a tensor; torch.cat copies it, so sharing it below is safe.
        padding = torch.full((num_padding_tokens,), self._pad_id, dtype=torch.int64)

        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(masked_seq_ids, dtype=torch.int64),
                self.eos_token,
                padding,
            ],
            dim=0,
        )
        # The label carries the original token at every position; the caller
        # decides which positions contribute to the loss.
        label = torch.cat(
            [
                self.sos_token,
                torch.tensor(origi_seq_ids, dtype=torch.int64),
                self.eos_token,
                padding,
            ],
            dim=0,
        )

        # Sanity check: both tensors must have the fixed length seq_len.
        assert encoder_input.size(0) == self.seq_len, "encoder_input size mismatch"
        assert label.size(0) == self.seq_len, "label size mismatch"

        return {
            "encoder_input": encoder_input,
            "encoder_mask": encoder_input == self._pad_id,
            "label": label,
        }
# Helper function to apply random token masking (like BERT MLM)
def random_mask(sentence, tokenizer, mask_prob=0.15):
    """Randomly replace tokens of a sequence with [MASK].

    The sequence is encoded once; every position is then independently
    replaced by the [MASK] id with probability `mask_prob`. Exactly one
    `random.random()` draw is consumed per position, in order.

    Args:
        sentence (str): Raw sequence (string of characters).
        tokenizer: Tokenizer providing `encode(...).ids` and `token_to_id`.
            It is expected to produce one id per character of `sentence`.
        mask_prob (float): Per-position masking probability in [0, 1].

    Returns:
        masked_seq_ids (list of int): Ids with some positions replaced by [MASK].
        origi_seq_ids (list of int): Original, unmasked ids.

    Raises:
        ValueError: If `mask_prob` is outside [0, 1].
        AssertionError: If the tokenizer does not yield one id per character.
    """
    if not 0.0 <= mask_prob <= 1.0:
        raise ValueError(f"mask_prob must be within [0, 1], got {mask_prob}")

    # Encode the whole sequence in one call instead of once per character.
    origi_seq_ids = tokenizer.encode(sentence).ids

    # Sanity check: one id per input character, so positions line up.
    assert len(sentence) == len(origi_seq_ids), \
        "Masked sequence length does not match original sequence length"

    # Look the [MASK] id up once rather than for every masked position.
    mask_id = tokenizer.token_to_id("[MASK]")

    # random.random() lies in [0, 1), so `<` gives exactly mask_prob
    # (0.0 never masks, 1.0 always masks).
    masked_seq_ids = [
        mask_id if random.random() < mask_prob else token_id
        for token_id in origi_seq_ids
    ]

    return masked_seq_ids, origi_seq_ids

In [None]:
# ----- Fixed model input length -----
# The maximum peptide length is taken to be 50 residues; [SOS] and [EOS] add
# two more tokens, giving 52. Shorter sequences are padded with [PAD] up to
# this length, and PeptideDataset raises a ValueError for anything longer.
seq_len = 52

# ----- Dataset objects -----
# One PeptideDataset per split. Each adds the special tokens, pads to seq_len
# and applies random masking when an item is read.
train_ds, val_ds = (
    PeptideDataset(raw_split, tokenizer, seq_len)
    for raw_split in (train_ds_raw, val_ds_raw)
)

# ----- DataLoaders -----
# Both loaders use batches of 2000 sequences. Only the training loader
# shuffles, so the validation batches keep the same order every epoch.
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=2000)
val_dataloader = DataLoader(val_ds, shuffle=False, batch_size=2000)

### Model Definition (PepBERT)

In [None]:
import torch
import torch.nn as nn
import math

class SinusoidalPositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    NOTE: a class with this same name is defined again in a later cell of this
    notebook; when the cells run in order, the later definition replaces this one.

    Args:
        d_model: Embedding dimension (even or odd).
        max_len: Number of positions in the precomputed table.
        dropout: Dropout probability applied after adding the positions.
    """

    def __init__(self, d_model: int, max_len: int, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Table of shape (max_len, d_model):
        #   pe[pos, 2i]   = sin(pos / 10000^(2i/d_model))
        #   pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model))
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # even indices
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd indices

        # Buffer (not a parameter): follows .to(device) and is stored in the state dict.
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch, seq_len, d_model)

        Returns:
            Tensor of the same shape: x plus positions, followed by dropout.

        Raises:
            ValueError: If seq_len exceeds the size of the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )
        # The table is a buffer, so no gradient flows into it.
        return self.dropout(x + self.pe[:, :seq_len, :])

class PepBERT(nn.Module):
    """Transformer encoder with a per-position vocabulary projection.

    Pipeline: token embedding -> sinusoidal positions -> encoder stack -> linear head.
    """

    def __init__(self, vocab_size, seq_len, pad_id, d_model=160, n_heads=8, n_layers=6, d_ff=640, dropout=0.1):
        super().__init__()

        # Token lookup table; the row at pad_id is the padding embedding.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model, padding_idx=pad_id)

        # Non-learnable sin/cos position table covering seq_len positions.
        self.pos_encoding = SinusoidalPositionalEncoding(d_model, seq_len, dropout)

        # n_layers copies of a batch-first encoder layer: inputs are (batch, seq, d_model).
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_ff,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=n_layers,
        )

        # Maps each hidden state to logits over the vocabulary.
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, src, src_key_padding_mask=None):
        """
        Args:
            src: token ids, (batch, seq_len)
            src_key_padding_mask: (batch, seq_len), True=PAD, False=real token

        Returns:
            Logits of shape (batch, seq_len, vocab_size).
        """
        scale = math.sqrt(self.embedding.embedding_dim)
        token_emb = self.embedding(src) * scale
        hidden = self.encoder(
            self.pos_encoding(token_emb),
            src_key_padding_mask=src_key_padding_mask,
        )
        return self.proj(hidden)

In [8]:
import torch
import torch.nn as nn
import math


class SinusoidalPositionalEncoding(nn.Module):
    """
    Fixed (non-learnable) sinusoidal positional encoding, as in
    'Attention Is All You Need'. Precomputes a (max_len, d_model)
    table and adds it to token embeddings at call time.

    Args:
        d_model: Embedding (model) dimension; both even and odd sizes are supported.
        max_len: Maximum supported sequence length for the precomputed table.
        dropout: Dropout applied after adding positions.
    """
    def __init__(self, d_model: int, max_len: int, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute sinusoidal table: shape (max_len, d_model)
        # pe[pos, 2i]   = sin(pos / (10000^(2i/d_model)))
        # pe[pos, 2i+1] = cos(pos / (10000^(2i/d_model)))
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # even dimensions
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd dimensions

        # Add batch dim for broadcasting at runtime: (1, max_len, d_model).
        # Registered as a buffer so it moves with .to(device) but is not a parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Add positions to token embeddings.

        Args:
            x: Token embeddings, shape (B, S, D) = (batch, seq_len, d_model)

        Returns:
            Tensor of shape (B, S, D) with positions added, then dropout applied.

        Raises:
            ValueError: If S is larger than the precomputed table (max_len).
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )

        # Slice the first S positions and add to x. The table is a buffer,
        # so no gradient flows into it.
        x = x + self.pe[:, :seq_len, :]

        # NOTE (optional): if you do NOT want to add positions to PAD rows,
        # you can pass a (B, S) bool mask into forward and zero out pe on PAD:
        # pe = self.pe[:, : x.size(1), :]
        # pe = pe.masked_fill(key_padding_mask.unsqueeze(-1), 0.0)  # True=PAD
        # x = x + pe

        return self.dropout(x)


class PepBERT(nn.Module):
    """
    BERT-style Transformer encoder for sequences (e.g., peptides).

    Components:
        - Token embedding with a defined padding_idx (PAD rows are zero and not updated).
        - Fixed sinusoidal positional encoding (added to token embeddings).
        - Stack of nn.TransformerEncoderLayer blocks (self-attention + FFN).
        - Linear projection to vocabulary size (for MLM-style training).

    Args:
        vocab_size: Size of the tokenizer vocabulary.
        seq_len:    Max sequence length (drives positional table size).
        pad_id:     Vocabulary ID used for PAD tokens.
        d_model:    Embedding dimension.
        n_heads:    Number of attention heads.
        n_layers:   Number of encoder layers (depth).
        d_ff:       Hidden size of the position-wise feed-forward (usually 4*d_model).
        dropout:    Dropout probability used across the module.
    """
    def __init__(
        self,
        vocab_size: int,
        seq_len: int,
        pad_id: int,
        d_model: int = 160,
        n_heads: int = 8,
        n_layers: int = 6,
        d_ff: int = 640,
        dropout: float = 0.1,
    ):
        super().__init__()

        # Token embedding: PAD rows are always zero and are excluded from updates
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)

        # Fixed positional encoding (sin/cos), shape added as (B, S, D)
        self.pos_encoding = SinusoidalPositionalEncoding(d_model, seq_len, dropout)

        # Transformer encoder stack (each layer: MHA + FFN + residual + LayerNorm)
        # batch_first=True => inputs are (B, S, D)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Final projection to logits over vocabulary (for masked language modeling)
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(
        self,
        src: torch.Tensor,
        src_key_padding_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Forward pass.

        Args:
            src: LongTensor token IDs, shape (B, S).
            src_key_padding_mask: BoolTensor, shape (B, S),
                                    True at PAD positions (to be ignored by attention).

        Returns:
            Logits over vocabulary: shape (B, S, vocab_size).
        """
        # Embed tokens and scale by sqrt(d_model) (stabilizes training as in the paper)
        x = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)  # (B, S, D)

        # Add fixed sinusoidal positional encoding
        x = self.pos_encoding(x)  # (B, S, D)

        # IMPORTANT:
        # - nn.TransformerEncoder expects src_key_padding_mask with shape (B, S), True=PAD.
        # - On Apple Silicon (MPS), passing this mask may trigger a nested-tensor path
        #   not fully implemented. If you ever hit NotImplementedError on MPS, you can:
        #   (A) Drop the mask on MPS:
        #       if x.device.type == "mps": src_key_padding_mask = None
        #   (B) Or pass a dummy attn_mask to disable that fast-path while keeping padding mask:
        #       S = x.size(1)
        #       dummy = torch.zeros((S, S), dtype=torch.bool, device=x.device)
        #       x = self.encoder(x, mask=dummy, src_key_padding_mask=src_key_padding_mask)
        #       return self.proj(x)

        x = self.encoder(x, src_key_padding_mask=src_key_padding_mask)  # (B, S, D)

        # Project hidden states to vocabulary logits (use CrossEntropyLoss with ignore_index=pad_id)
        return self.proj(x)  # (B, S, vocab_size)

In [None]:
# ----- Special token IDs -----
pad_id = tokenizer.token_to_id("[PAD]")    # padding token
mask_id = tokenizer.token_to_id("[MASK]")  # mask token used by the MLM objective

# ----- Model -----
# Architecture hyper-parameters are spelled out here; dropout is disabled (0.0).
model = PepBERT(
    vocab_size=tokenizer.get_vocab_size(),
    seq_len=seq_len,
    pad_id=pad_id,
    d_model=160,
    n_heads=8,
    n_layers=6,
    d_ff=640,
    dropout=0.0,
)
model = model.to(device)

# ----- Optimizer -----
# AdamW over all model parameters:
#   lr           - learning rate
#   betas        - decay rates of the running averages of the gradient and its square
#   eps          - small constant added to the denominator for numerical stability
#   weight_decay - decoupled weight-decay coefficient
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=4e-4,
    betas=(0.9, 0.98),
    eps=1e-8,
    weight_decay=0.01,
)

# ----- Loss -----
# Cross-entropy over the vocabulary.
#   ignore_index=pad_id - targets equal to [PAD] do not contribute
#   reduction="sum"     - returns the summed loss; the per-token average is obtained
#                         later as (sum over masked tokens) / (number of masked tokens)
loss_fn = nn.CrossEntropyLoss(reduction="sum", ignore_index=pad_id).to(device)

In [None]:
def run_validation(model, val_dataloader, tokenizer, device, loss_fn, mask_id):
    """Compute the average loss per masked token over a validation loader.

    The model is moved to `device` and switched to eval mode (it is left in
    eval mode on return). Only positions whose input id equals `mask_id`
    contribute to the loss.

    Args:
        model: Callable as `model(encoder_input, src_key_padding_mask=...)`,
            returning logits of shape (B, S, V).
        val_dataloader: Iterable of batches with keys "encoder_input",
            "encoder_mask" and "label".
        tokenizer: Used only to look up "[MASK]" when `mask_id` is None.
        device: Device the batches and the model are moved to.
        loss_fn: Loss applied to (N, V) logits and (N,) targets. It is expected
            to return a sum over its inputs (e.g. reduction="sum"), since the
            result is divided by the number of masked tokens.
        mask_id: ID of the [MASK] token; pass None to read it from `tokenizer`.

    Returns:
        float: total loss / number of masked tokens (0.0 if nothing was masked).
    """
    model.eval().to(device)

    # Honour the caller-supplied id; fall back to the tokenizer only if absent.
    if mask_id is None:
        mask_id = tokenizer.token_to_id("[MASK]")

    total_loss, total_masked = 0.0, 0
    with torch.no_grad():
        for batch in val_dataloader:
            encoder_input = batch["encoder_input"].to(device)   # (B, S)
            encoder_mask  = batch["encoder_mask"].to(device)    # (B, S), True=PAD
            labels        = batch["label"].to(device)           # (B, S)

            logits = model(encoder_input, src_key_padding_mask=encoder_mask)  # (B, S, V)

            masked_pos = (encoder_input == mask_id)             # (B, S)
            num_masked = int(masked_pos.sum().item())
            if num_masked == 0:
                continue  # nothing to average for this batch

            # Boolean indexing keeps the last dim, so no separately obtained
            # vocabulary size is needed: (N, V) logits vs. (N,) targets.
            loss = loss_fn(logits[masked_pos], labels[masked_pos])

            total_loss   += loss.item()
            total_masked += num_masked

    return total_loss / max(total_masked, 1)

In [None]:
num_epochs = 100
loss_train_col, loss_val_col = [], []

# Loop-invariant: the vocabulary size does not change during training.
V = tokenizer.get_vocab_size()

for epoch in range(num_epochs):
    torch.cuda.empty_cache()
    model.train().to(device)

    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    total_loss, total_masked = 0.0, 0

    for batch in batch_iterator:
        # -------- 1) Load batch --------
        encoder_input = batch["encoder_input"].to(device)   # (B, S)
        encoder_mask  = batch["encoder_mask"].to(device)    # (B, S), True=PAD
        labels        = batch["label"].to(device)           # (B, S)

        # -------- 2) Forward --------
        logits = model(encoder_input, src_key_padding_mask=encoder_mask)  # (B, S, V)

        # -------- 3) Select only masked positions --------
        masked_pos = (encoder_input == mask_id)             # (B, S) bool
        num_masked = int(masked_pos.sum().item())
        if num_masked == 0:
            # No masked tokens in this batch; skip update to avoid NaNs
            batch_iterator.set_postfix({"loss": "skip (no [MASK])"})
            continue

        # Flatten and index only masked positions
        logits_flat = logits.view(-1, V)                    # (B*S, V)
        labels_flat = labels.view(-1)                       # (B*S,)
        masked_idx  = masked_pos.view(-1)                   # (B*S,)

        # loss_fn uses reduction="sum": this is the summed loss over masked tokens,
        # and it is this sum (not the per-token mean) that is backpropagated.
        loss = loss_fn(logits_flat[masked_idx], labels_flat[masked_idx])

        # -------- 4) Backprop --------
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # -------- 5) Logging (average per masked token for the batch) --------
        # Read the scalar once; each .item() forces a device synchronisation.
        loss_value = loss.item()
        total_loss   += loss_value
        total_masked += num_masked
        batch_iterator.set_postfix({"loss": f"{(loss_value/num_masked):.4f}"})

    # -------- 6) Epoch averages --------
    ave_train_loss = total_loss / max(total_masked, 1)
    ave_val_loss   = run_validation(model, val_dataloader, tokenizer, device, loss_fn, mask_id)

    print(f"Epoch {epoch:02d}: train loss = {ave_train_loss:.4f}, val loss = {ave_val_loss:.4f}")
    loss_train_col.append(ave_train_loss)
    loss_val_col.append(ave_val_loss)

    # -------- 7) Checkpoint --------
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }, f"tmodel_{epoch:02d}.pt")

    # -------- 8) Save losses so far --------
    # Rewritten every epoch so the loss history survives an interrupted run.
    df = pd.DataFrame({"loss_train": loss_train_col, "loss_val": loss_val_col})
    df.to_csv("_loss.csv", index=False)

# -------- 9) Final loss table --------
# Also covers num_epochs == 0, where the loop body never runs.
df = pd.DataFrame({"loss_train": loss_train_col, "loss_val": loss_val_col})
df.to_csv("_loss.csv", index=False)

Processing Epoch 00: 100%|██████████| 981/981 [09:45<00:00,  1.68it/s, loss=2.8095]
  output = torch._nested_tensor_from_mask(


Epoch 00: train loss = 2.8267, val loss = 2.8010


Processing Epoch 01: 100%|██████████| 981/981 [09:40<00:00,  1.69it/s, loss=2.8027]


Epoch 01: train loss = 2.7963, val loss = 2.7962


Processing Epoch 02: 100%|██████████| 981/981 [09:35<00:00,  1.71it/s, loss=2.7888]


Epoch 02: train loss = 2.7940, val loss = 2.7946


Processing Epoch 03: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7700]


Epoch 03: train loss = 2.7909, val loss = 2.7859


Processing Epoch 04: 100%|██████████| 981/981 [09:32<00:00,  1.71it/s, loss=2.7668]


Epoch 04: train loss = 2.7828, val loss = 2.7861


Processing Epoch 05: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7852]


Epoch 05: train loss = 2.7792, val loss = 2.7811


Processing Epoch 06: 100%|██████████| 981/981 [09:32<00:00,  1.71it/s, loss=2.7950]


Epoch 06: train loss = 2.7774, val loss = 2.7843


Processing Epoch 07: 100%|██████████| 981/981 [09:30<00:00,  1.72it/s, loss=2.7685]


Epoch 07: train loss = 2.7755, val loss = 2.7748


Processing Epoch 08: 100%|██████████| 981/981 [09:34<00:00,  1.71it/s, loss=2.7691]


Epoch 08: train loss = 2.7747, val loss = 2.7744


Processing Epoch 09: 100%|██████████| 981/981 [09:34<00:00,  1.71it/s, loss=2.7708]


Epoch 09: train loss = 2.7729, val loss = 2.7758


Processing Epoch 10: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7496]


Epoch 10: train loss = 2.7718, val loss = 2.7771


Processing Epoch 11: 100%|██████████| 981/981 [09:34<00:00,  1.71it/s, loss=2.7795]


Epoch 11: train loss = 2.7707, val loss = 2.7718


Processing Epoch 12: 100%|██████████| 981/981 [09:35<00:00,  1.70it/s, loss=2.7753]


Epoch 12: train loss = 2.7695, val loss = 2.7731


Processing Epoch 13: 100%|██████████| 981/981 [10:15<00:00,  1.59it/s, loss=2.7672]


Epoch 13: train loss = 2.7680, val loss = 2.7665


Processing Epoch 14: 100%|██████████| 981/981 [10:12<00:00,  1.60it/s, loss=2.7611]


Epoch 14: train loss = 2.7673, val loss = 2.7740


Processing Epoch 15: 100%|██████████| 981/981 [10:17<00:00,  1.59it/s, loss=2.7552]


Epoch 15: train loss = 2.7663, val loss = 2.7702


Processing Epoch 16: 100%|██████████| 981/981 [10:18<00:00,  1.59it/s, loss=2.7575]


Epoch 16: train loss = 2.7652, val loss = 2.7664


Processing Epoch 17: 100%|██████████| 981/981 [10:17<00:00,  1.59it/s, loss=2.7597]


Epoch 17: train loss = 2.7643, val loss = 2.7681


Processing Epoch 18: 100%|██████████| 981/981 [10:16<00:00,  1.59it/s, loss=2.7686]


Epoch 18: train loss = 2.7634, val loss = 2.7661


Processing Epoch 19: 100%|██████████| 981/981 [10:17<00:00,  1.59it/s, loss=2.7653]


Epoch 19: train loss = 2.7622, val loss = 2.7660


Processing Epoch 20: 100%|██████████| 981/981 [10:16<00:00,  1.59it/s, loss=2.7569]


Epoch 20: train loss = 2.7611, val loss = 2.7651


Processing Epoch 21: 100%|██████████| 981/981 [10:12<00:00,  1.60it/s, loss=2.7768]


Epoch 21: train loss = 2.7602, val loss = 2.7649


Processing Epoch 22: 100%|██████████| 981/981 [10:14<00:00,  1.60it/s, loss=2.7603]


Epoch 22: train loss = 2.7597, val loss = 2.7656


Processing Epoch 23: 100%|██████████| 981/981 [10:13<00:00,  1.60it/s, loss=2.7678]


Epoch 23: train loss = 2.7584, val loss = 2.7639


Processing Epoch 24: 100%|██████████| 981/981 [10:14<00:00,  1.60it/s, loss=2.7476]


Epoch 24: train loss = 2.7577, val loss = 2.7583


Processing Epoch 25: 100%|██████████| 981/981 [10:15<00:00,  1.59it/s, loss=2.7653]


Epoch 25: train loss = 2.7572, val loss = 2.7601


Processing Epoch 26: 100%|██████████| 981/981 [10:17<00:00,  1.59it/s, loss=2.7453]


Epoch 26: train loss = 2.7562, val loss = 2.7594


Processing Epoch 27: 100%|██████████| 981/981 [10:14<00:00,  1.60it/s, loss=2.7687]


Epoch 27: train loss = 2.7555, val loss = 2.7575


Processing Epoch 28: 100%|██████████| 981/981 [10:14<00:00,  1.60it/s, loss=2.7453]


Epoch 28: train loss = 2.7546, val loss = 2.7578


Processing Epoch 29: 100%|██████████| 981/981 [10:16<00:00,  1.59it/s, loss=2.7314]


Epoch 29: train loss = 2.7533, val loss = 2.7550


Processing Epoch 30: 100%|██████████| 981/981 [10:16<00:00,  1.59it/s, loss=2.7537]


Epoch 30: train loss = 2.7527, val loss = 2.7578


Processing Epoch 31: 100%|██████████| 981/981 [10:15<00:00,  1.59it/s, loss=2.7741]


Epoch 31: train loss = 2.7515, val loss = 2.7521


Processing Epoch 32: 100%|██████████| 981/981 [10:15<00:00,  1.59it/s, loss=2.7667]


Epoch 32: train loss = 2.7508, val loss = 2.7577


Processing Epoch 33: 100%|██████████| 981/981 [09:56<00:00,  1.65it/s, loss=2.7250]


Epoch 33: train loss = 2.7500, val loss = 2.7535


Processing Epoch 34: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7619]


Epoch 34: train loss = 2.7491, val loss = 2.7496


Processing Epoch 35: 100%|██████████| 981/981 [09:37<00:00,  1.70it/s, loss=2.7623]


Epoch 35: train loss = 2.7482, val loss = 2.7507


Processing Epoch 36: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7467]


Epoch 36: train loss = 2.7477, val loss = 2.7541


Processing Epoch 37: 100%|██████████| 981/981 [09:32<00:00,  1.71it/s, loss=2.7359]


Epoch 37: train loss = 2.7472, val loss = 2.7504


Processing Epoch 38: 100%|██████████| 981/981 [09:28<00:00,  1.72it/s, loss=2.7565]


Epoch 38: train loss = 2.7464, val loss = 2.7510


Processing Epoch 39: 100%|██████████| 981/981 [09:31<00:00,  1.72it/s, loss=2.7701]


Epoch 39: train loss = 2.7464, val loss = 2.7497


Processing Epoch 40: 100%|██████████| 981/981 [09:31<00:00,  1.72it/s, loss=2.7367]


Epoch 40: train loss = 2.7457, val loss = 2.7465


Processing Epoch 41: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7368]


Epoch 41: train loss = 2.7453, val loss = 2.7464


Processing Epoch 42: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7366]


Epoch 42: train loss = 2.7445, val loss = 2.7487


Processing Epoch 43: 100%|██████████| 981/981 [09:32<00:00,  1.71it/s, loss=2.7343]


Epoch 43: train loss = 2.7442, val loss = 2.7452


Processing Epoch 44: 100%|██████████| 981/981 [09:36<00:00,  1.70it/s, loss=2.7575]


Epoch 44: train loss = 2.7438, val loss = 2.7415


Processing Epoch 45: 100%|██████████| 981/981 [09:35<00:00,  1.70it/s, loss=2.7550]


Epoch 45: train loss = 2.7434, val loss = 2.7452


Processing Epoch 46: 100%|██████████| 981/981 [09:34<00:00,  1.71it/s, loss=2.7379]


Epoch 46: train loss = 2.7428, val loss = 2.7478


Processing Epoch 47: 100%|██████████| 981/981 [09:33<00:00,  1.71it/s, loss=2.7412]


Epoch 47: train loss = 2.7426, val loss = 2.7431


Processing Epoch 48: 100%|██████████| 981/981 [09:32<00:00,  1.71it/s, loss=2.7640]


Epoch 48: train loss = 2.7421, val loss = 2.7450


Processing Epoch 49: 100%|██████████| 981/981 [09:34<00:00,  1.71it/s, loss=2.7505]


Epoch 49: train loss = 2.7415, val loss = 2.7388
