Note:

Forget “perplexity”.
For domain SLMs, you want to track:

Metric	Description
Exact match rate	% of tokens matching valid domain grammar
Parse accuracy	% of generated outputs that compile/execute
Semantic validity	% of outputs that make logical sense
Entropy per token	How confident vs overfitted model is

In [28]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("YvvonM/mental_health_data")
train_dataset = dataset['train'].train_test_split(test_size=0.1)['train']
eval_dataset = dataset['train'].train_test_split(test_size=0.1)['test']

In [None]:
train_batch_size = 8  # training batch size
eval_batch_size = 4  # evaluation batch size
context_length = block_size = 126  # number of tokens processed in a single batch
train_split = 0.9  # percentage of data to use from total data for training
number_of_heads = 8
number_of_layers = 6

In [31]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast(
    tokenizer_file="../slm/tokenizer.json",
    truncation=True,
    unk_token="<UNK>",
    pad_token="<PAD>",
    bos_token="<BOS>",
    eos_token="<EOS>",
    max_length=context_length
)

In [32]:

def tokenize_and_shift_labels(example):
    text = f"<BOS> <USER> {example['Input']} <ASSISTANT> {example['Response']} <EOS>"
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=context_length,
        padding='max_length'
    )
    input_ids = encoding["input_ids"]

    inputs = input_ids[:-1]
    labels = input_ids[1:]
    labels = [l if l != tokenizer.pad_token_id else -100 for l in labels]

    return {"input_ids": inputs, "labels": labels}

In [33]:
from random import shuffle
train_dataset = train_dataset.map(
    tokenize_and_shift_labels,
    remove_columns=train_dataset.column_names,
)

eval_dataset = eval_dataset.map(
    tokenize_and_shift_labels,
    remove_columns=eval_dataset.column_names,
)

Map:   1%|          | 12078/1586505 [00:12<26:38, 984.86 examples/s] 


KeyboardInterrupt: 

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cpu"

if torch.cuda.is_available():
  device = "cuda"

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM
)

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)


In [34]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, context_length, d_model):
        super().__init__()
        self.register_buffer("pe", self._build_pe(context_length, d_model))
        self.dropout = nn.Dropout(dropout=0.1)



    def _build_pe(self, length, d_model):
        position = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(length, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)  # shape: [1, length, d_model]

    def forward(self, x):
        seq_len = x.size(1)
        return self.dropout(x + self.pe[:, :seq_len, :])


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        B, T, C = x.shape
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)

        # Reshape for heads
        q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_output = attn_weights @ v

        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        return self.fc_out(self.dropout(attn_output))

In [None]:
class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask):
        x = x + self.attn(self.ln1(x), mask)
        x = x + self.mlp(self.ln2(x))
        return x


In [None]:
class GPT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, context_length, dropout=0.1):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = PositionalEncoding(context_length, d_model)

        self.blocks = nn.ModuleList([
            GPTBlock(d_model, n_heads, dropout) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        self.lm_head.weight = self.wte.weight  # Weight tying

        nn.init.normal_(self.wte.weight, mean=0.0, std=0.02)


    def forward(self, x, targets=None):
        B, T = x.size()
        assert T <= self.context_length, f"Input length {T} exceeds context length {self.context_length}"

        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)
        x = self.wte(x)
        x = self.wpe(x)

        for block in self.blocks:
            x = block(x, mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)  # [B, T, vocab_size]

        loss = None
        if targets is not None:
            # Trim targets/logits to same length
            min_len = min(logits.size(1), targets.size(1))
            logits = logits[:, :min_len, :]
            targets = targets[:, :min_len]

            # Flatten for cross-entropy
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
                ignore_index=-100,
                label_smoothing=0.1
            )

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, top_k=50, top_p=0.9, temperature=1.0, repetition_penalty=1.1):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            # Repetition penalty
            for i in range(idx.size(0)):
                for token in idx[i].tolist():
                    logits[i, token] /= repetition_penalty

            # Top-k
            if top_k is not None:
                values, _ = torch.topk(logits, top_k)
                min_values = values[:, -1].unsqueeze(1)
                logits = torch.where(logits < min_values, float("-inf"), logits)

            # Top-p
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                probs = F.softmax(sorted_logits, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                sorted_mask = cumulative_probs > top_p
                sorted_mask[:, 1:] = sorted_mask[:, :-1].clone()
                sorted_mask[:, 0] = 0
                logits.scatter_(1, sorted_indices, logits.masked_fill(sorted_mask, float("-inf")))

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return idx


In [None]:
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
train_batch_size = 16
eval_batch_size = 8
eval_steps = 2000
log_interval = 10  # log every n steps
save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)


In [None]:
basic_model = GPT(vocab_size=8000, d_model=256, n_heads=number_of_heads, n_layers=number_of_layers, context_length=context_length).to(device)
# basic_model = GPT(vocab_size=8000, d_model=256).to(device)




basic_model.to(device)
optimizer = torch.optim.AdamW(basic_model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs*len(train_loader))


In [None]:
print(basic_model)
print(f"Total Parameters: {round(sum(p.numel() for p in basic_model.parameters() if p.requires_grad) / 1_000_000)}M")

GPT(
  (wte): Embedding(8000, 256)
  (wpe): PositionalEncoding()
  (blocks): ModuleList(
    (0-5): 6 x GPTBlock(
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv_proj): Linear(in_features=256, out_features=768, bias=True)
        (fc_out): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (mlp): Sequential(
        (0): Linear(in_features=256, out_features=1024, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=1024, out_features=256, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (lm_head): Linear(in_features=256, out_features=8000, bias=False)
)
Total Parameters: 7M


In [None]:
xb = next(iter(train_loader))
xb_ids = xb['input_ids'].to(device)
x_embed = basic_model.wte(xb_ids)
print("Embedding stats:", x_embed.min().item(), x_embed.max().item(), x_embed.mean().item())

Embedding stats: -0.08833513408899307 0.08442691713571548 0.00016734316886868328


In [None]:
x_pos = basic_model.wpe(x_embed)
print("After positional encoding:", x_pos.min().item(), x_pos.max().item(), x_pos.mean().item())


After positional encoding: -1.058415174484253 1.0844268798828125 0.39544805884361267


In [None]:
mask = torch.tril(torch.ones(xb_ids.size(1), xb_ids.size(1), device=device)).unsqueeze(0).unsqueeze(0)
x = x_embed + x_pos

for i, block in enumerate(basic_model.blocks):
    x = block(x, mask)
    print(f"After block {i}: min={x.min().item()}, max={x.max().item()}, mean={x.mean().item()}")


After block 0: min=-2.10876202583313, max=2.4862170219421387, mean=0.41785746812820435
After block 1: min=-2.5692265033721924, max=3.157790422439575, mean=0.3904191255569458
After block 2: min=-2.949037551879883, max=3.3138914108276367, mean=0.4024883806705475
After block 3: min=-3.5521295070648193, max=3.7736759185791016, mean=0.3811834752559662
After block 4: min=-3.6067259311676025, max=3.7547340393066406, mean=0.38399630784988403
After block 5: min=-3.926696300506592, max=3.9288580417633057, mean=0.3636806607246399


In [None]:
x_ln = basic_model.ln_f(x)
logits = basic_model.lm_head(x_ln)
print("Final logits stats:", logits.min().item(), logits.max().item(), logits.mean().item())


Final logits stats: -1.5039607286453247 1.6910687685012817 0.003899493021890521


In [None]:

logits, loss = basic_model(xb['input_ids'].to(device), xb['labels'].to(device))
print(logits.shape, loss.item())

torch.Size([16, 49, 8000]) 8.90952205657959


In [None]:
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm

scaler = GradScaler()
accum_steps = 4
eval_subset = 100
log_interval = 10000

train_losses, eval_losses = [], []

for epoch in range(epochs):
    basic_model.train()
    running_loss = 0.0
    step_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

    for step, batch in enumerate(step_bar):
        xb = batch['input_ids'].to(device, non_blocking=True)
        yb = batch['labels'].to(device, non_blocking=True)

        with autocast():
            logits, loss = basic_model(xb, yb)
            loss = loss / accum_steps

        scaler.scale(loss).backward()
        running_loss += loss.item()

        if (step + 1) % accum_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(basic_model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        # Show loss in progress bar occasionally
        if (step + 1) % log_interval == 0:
            avg_loss = running_loss / log_interval
            step_bar.set_postfix(train_loss=f"{avg_loss:.4f}")
            train_losses.append(avg_loss)
            running_loss = 0.0

        # Evaluate periodically
        if (step + 1) % eval_steps == 0:
            basic_model.eval()
            eval_loss = 0.0
            with torch.no_grad():
                for eval_step, eval_batch in enumerate(eval_loader):
                    if eval_step >= eval_subset:
                        break
                    xvb = eval_batch['input_ids'].to(device, non_blocking=True)
                    yvb = eval_batch['labels'].to(device, non_blocking=True)
                    with autocast():
                        _, e_loss = basic_model(xvb, yvb)
                    eval_loss += e_loss.item()
            eval_loss /= eval_subset
            tqdm.write(f"[Eval] Epoch {epoch+1}, Step {step+1}: Loss = {eval_loss:.4f}")
            eval_losses.append(eval_loss)
            basic_model.train()

    # Save checkpoint at the end of epoch
    checkpoint_path = os.path.join(save_dir, f"slm_epoch_{epoch+1}.pt")
    torch.save({
        'epoch': epoch+1,
        'model_state_dict': basic_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'eval_losses': eval_losses
    }, checkpoint_path)
    tqdm.write(f"Checkpoint saved at {checkpoint_path}")


  scaler = GradScaler()
  with autocast():
  with autocast():
Epoch 1/10:   2%|▏         | 2003/99157 [01:09<2:57:00,  9.15it/s, train_loss=0.0026]

[Eval] Epoch 1, Step 2000: Loss = 0.0050


Epoch 1/10:   4%|▍         | 4003/99157 [02:19<2:53:33,  9.14it/s, train_loss=0.0013]

[Eval] Epoch 1, Step 4000: Loss = 0.0023


Epoch 1/10:   6%|▌         | 6002/99157 [03:30<3:51:30,  6.71it/s, train_loss=0.0007]

[Eval] Epoch 1, Step 6000: Loss = 0.0014


Epoch 1/10:   8%|▊         | 8001/99157 [04:41<3:32:24,  7.15it/s, train_loss=0.0005]

[Eval] Epoch 1, Step 8000: Loss = 0.0011


Epoch 1/10:  10%|█         | 10003/99157 [05:53<2:52:43,  8.60it/s, train_loss=0.0004]

[Eval] Epoch 1, Step 10000: Loss = 0.0008


Epoch 1/10:  12%|█▏        | 12006/99157 [07:06<2:33:53,  9.44it/s, train_loss=0.0002]

[Eval] Epoch 1, Step 12000: Loss = 0.0006


Epoch 1/10:  14%|█▍        | 14004/99157 [08:18<2:31:46,  9.35it/s, train_loss=0.0002]

[Eval] Epoch 1, Step 14000: Loss = 0.0005


Epoch 1/10:  16%|█▌        | 16003/99157 [09:31<2:40:23,  8.64it/s, train_loss=0.0002]

[Eval] Epoch 1, Step 16000: Loss = 0.0004


Epoch 1/10:  18%|█▊        | 18005/99157 [10:44<2:39:46,  8.46it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 18000: Loss = 0.0005


Epoch 1/10:  20%|██        | 20005/99157 [11:57<2:50:39,  7.73it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 20000: Loss = 0.0002


Epoch 1/10:  22%|██▏       | 22004/99157 [13:10<2:25:45,  8.82it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 22000: Loss = 0.0002


Epoch 1/10:  24%|██▍       | 24003/99157 [14:25<2:27:18,  8.50it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 24000: Loss = 0.0002


Epoch 1/10:  26%|██▌       | 26005/99157 [15:38<2:20:46,  8.66it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 26000: Loss = 0.0004


Epoch 1/10:  28%|██▊       | 28004/99157 [16:51<2:20:14,  8.46it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 28000: Loss = 0.0002


Epoch 1/10:  30%|███       | 30002/99157 [18:03<2:44:52,  6.99it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 30000: Loss = 0.0002


Epoch 1/10:  32%|███▏      | 32003/99157 [19:17<2:11:04,  8.54it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 32000: Loss = 0.0002


Epoch 1/10:  34%|███▍      | 34003/99157 [20:30<2:12:13,  8.21it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 34000: Loss = 0.0002


Epoch 1/10:  36%|███▋      | 36003/99157 [21:43<1:58:51,  8.86it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 36000: Loss = 0.0002


Epoch 1/10:  38%|███▊      | 38005/99157 [22:56<1:42:43,  9.92it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 38000: Loss = 0.0001


Epoch 1/10:  40%|████      | 40004/99157 [24:09<1:58:33,  8.32it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 40000: Loss = 0.0001


Epoch 1/10:  42%|████▏     | 42003/99157 [25:23<1:53:28,  8.39it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 42000: Loss = 0.0000


Epoch 1/10:  44%|████▍     | 44003/99157 [26:35<1:43:06,  8.92it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 44000: Loss = 0.0000


Epoch 1/10:  46%|████▋     | 46004/99157 [27:48<1:42:16,  8.66it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 46000: Loss = 0.0000


Epoch 1/10:  48%|████▊     | 48004/99157 [29:01<1:37:36,  8.73it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 48000: Loss = 0.0000


Epoch 1/10:  50%|█████     | 50004/99157 [30:13<1:26:49,  9.43it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 50000: Loss = 0.0000


Epoch 1/10:  52%|█████▏    | 52004/99157 [31:26<1:23:04,  9.46it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 52000: Loss = 0.0000


Epoch 1/10:  54%|█████▍    | 54004/99157 [32:40<1:26:12,  8.73it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 54000: Loss = 0.0000


Epoch 1/10:  56%|█████▋    | 56003/99157 [33:53<1:24:36,  8.50it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 56000: Loss = 0.0001


Epoch 1/10:  58%|█████▊    | 58004/99157 [35:07<1:15:23,  9.10it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 58000: Loss = 0.0001


Epoch 1/10:  61%|██████    | 60004/99157 [36:20<1:09:42,  9.36it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 60000: Loss = 0.0001


Epoch 1/10:  63%|██████▎   | 62005/99157 [37:33<1:17:11,  8.02it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 62000: Loss = 0.0000


Epoch 1/10:  65%|██████▍   | 64005/99157 [38:45<1:09:20,  8.45it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 64000: Loss = 0.0000


Epoch 1/10:  67%|██████▋   | 66003/99157 [40:04<1:00:15,  9.17it/s, train_loss=0.0001]

[Eval] Epoch 1, Step 66000: Loss = 0.0000


Epoch 1/10:  69%|██████▊   | 68004/99157 [41:15<50:31, 10.28it/s, train_loss=0.0000]  

[Eval] Epoch 1, Step 68000: Loss = 0.0000


Epoch 1/10:  71%|███████   | 70005/99157 [42:25<47:29, 10.23it/s, train_loss=0.0000]  

[Eval] Epoch 1, Step 70000: Loss = 0.0000


Epoch 1/10:  73%|███████▎  | 72003/99157 [43:36<48:53,  9.26it/s, train_loss=0.0000]  

[Eval] Epoch 1, Step 72000: Loss = 0.0000


Epoch 1/10:  75%|███████▍  | 74004/99157 [44:48<41:01, 10.22it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 74000: Loss = 0.0000


Epoch 1/10:  77%|███████▋  | 76003/99157 [46:01<48:28,  7.96it/s, train_loss=0.0000]  

[Eval] Epoch 1, Step 76000: Loss = 0.0000


Epoch 1/10:  79%|███████▊  | 78004/99157 [47:13<38:54,  9.06it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 78000: Loss = 0.0000


Epoch 1/10:  81%|████████  | 80003/99157 [48:26<36:35,  8.72it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 80000: Loss = 0.0000


Epoch 1/10:  83%|████████▎ | 82004/99157 [49:39<33:23,  8.56it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 82000: Loss = 0.0000


Epoch 1/10:  85%|████████▍ | 84005/99157 [50:52<29:51,  8.46it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 84000: Loss = 0.0000


Epoch 1/10:  87%|████████▋ | 86002/99157 [52:04<32:05,  6.83it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 86000: Loss = 0.0000


Epoch 1/10:  89%|████████▉ | 88002/99157 [53:16<25:18,  7.35it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 88000: Loss = 0.0000


Epoch 1/10:  91%|█████████ | 90005/99157 [54:29<17:57,  8.50it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 90000: Loss = 0.0000


Epoch 1/10:  93%|█████████▎| 92005/99157 [55:41<12:30,  9.53it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 92000: Loss = 0.0000


Epoch 1/10:  95%|█████████▍| 94005/99157 [56:54<09:04,  9.46it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 94000: Loss = 0.0000


Epoch 1/10:  97%|█████████▋| 96005/99157 [58:07<06:13,  8.45it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 96000: Loss = 0.0000


Epoch 1/10:  99%|█████████▉| 98003/99157 [59:21<02:19,  8.25it/s, train_loss=0.0000]

[Eval] Epoch 1, Step 98000: Loss = 0.0001


Epoch 1/10: 100%|██████████| 99157/99157 [1:00:02<00:00, 27.52it/s, train_loss=0.0028]


Checkpoint saved at ./checkpoints\slm_epoch_1.pt


Epoch 2/10:   2%|▏         | 2004/99157 [01:13<3:13:30,  8.37it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 2000: Loss = 0.0000


Epoch 2/10:   4%|▍         | 4004/99157 [02:26<2:56:18,  9.00it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 4000: Loss = 0.0000


Epoch 2/10:   6%|▌         | 6004/99157 [03:40<3:14:20,  7.99it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 6000: Loss = 0.0000


Epoch 2/10:   8%|▊         | 8006/99157 [04:54<2:39:50,  9.50it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 8000: Loss = 0.0000


Epoch 2/10:  10%|█         | 10004/99157 [06:07<2:46:54,  8.90it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 10000: Loss = 0.0000


Epoch 2/10:  12%|█▏        | 12003/99157 [07:20<2:41:52,  8.97it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 12000: Loss = 0.0000


Epoch 2/10:  14%|█▍        | 14004/99157 [08:32<2:34:54,  9.16it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 14000: Loss = 0.0000


Epoch 2/10:  16%|█▌        | 16005/99157 [09:45<2:38:35,  8.74it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 16000: Loss = 0.0000


Epoch 2/10:  18%|█▊        | 18004/99157 [10:56<2:19:19,  9.71it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 18000: Loss = 0.0000


Epoch 2/10:  20%|██        | 20005/99157 [12:08<2:37:10,  8.39it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 20000: Loss = 0.0000


Epoch 2/10:  22%|██▏       | 22005/99157 [13:20<2:19:06,  9.24it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 22000: Loss = 0.0000


Epoch 2/10:  24%|██▍       | 24004/99157 [14:32<2:30:18,  8.33it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 24000: Loss = 0.0000


Epoch 2/10:  26%|██▌       | 26004/99157 [15:44<2:26:43,  8.31it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 26000: Loss = 0.0000


Epoch 2/10:  28%|██▊       | 28005/99157 [16:57<2:08:03,  9.26it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 28000: Loss = 0.0000


Epoch 2/10:  30%|███       | 30004/99157 [18:09<1:54:53, 10.03it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 30000: Loss = 0.0000


Epoch 2/10:  32%|███▏      | 32003/99157 [19:39<2:51:34,  6.52it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 32000: Loss = 0.0000


Epoch 2/10:  34%|███▍      | 34006/99157 [20:51<1:39:50, 10.88it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 34000: Loss = 0.0000


Epoch 2/10:  36%|███▋      | 36004/99157 [22:01<1:46:23,  9.89it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 36000: Loss = 0.0000


Epoch 2/10:  38%|███▊      | 38003/99157 [23:17<2:06:11,  8.08it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 38000: Loss = 0.0000


Epoch 2/10:  40%|████      | 40004/99157 [24:25<1:33:21, 10.56it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 40000: Loss = 0.0000


Epoch 2/10:  42%|████▏     | 42004/99157 [25:34<1:54:30,  8.32it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 42000: Loss = 0.0000


Epoch 2/10:  44%|████▍     | 44004/99157 [26:46<1:26:51, 10.58it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 44000: Loss = 0.0000


Epoch 2/10:  46%|████▋     | 46003/99157 [27:55<1:45:29,  8.40it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 46000: Loss = 0.0000


Epoch 2/10:  48%|████▊     | 48004/99157 [28:59<1:16:50, 11.10it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 48000: Loss = 0.0000


Epoch 2/10:  50%|█████     | 50004/99157 [30:01<1:19:05, 10.36it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 50000: Loss = 0.0000


Epoch 2/10:  52%|█████▏    | 52002/99157 [31:09<1:34:14,  8.34it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 52000: Loss = 0.0000


Epoch 2/10:  54%|█████▍    | 54003/99157 [32:22<1:32:16,  8.16it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 54000: Loss = 0.0000


Epoch 2/10:  56%|█████▋    | 56004/99157 [33:36<1:43:45,  6.93it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 56000: Loss = 0.0000


Epoch 2/10:  58%|█████▊    | 58004/99157 [34:52<1:25:16,  8.04it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 58000: Loss = 0.0000


Epoch 2/10:  61%|██████    | 60005/99157 [36:03<1:24:41,  7.71it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 60000: Loss = 0.0000


Epoch 2/10:  63%|██████▎   | 62004/99157 [37:11<53:31, 11.57it/s, train_loss=0.0000]  

[Eval] Epoch 2, Step 62000: Loss = 0.0000


Epoch 2/10:  65%|██████▍   | 64004/99157 [38:16<1:14:24,  7.87it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 64000: Loss = 0.0000


Epoch 2/10:  67%|██████▋   | 66004/99157 [39:26<57:31,  9.60it/s, train_loss=0.0000]  

[Eval] Epoch 2, Step 66000: Loss = 0.0000


Epoch 2/10:  69%|██████▊   | 68003/99157 [40:35<1:22:58,  6.26it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 68000: Loss = 0.0000


Epoch 2/10:  71%|███████   | 70004/99157 [41:45<1:14:26,  6.53it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 70000: Loss = 0.0000


Epoch 2/10:  73%|███████▎  | 72003/99157 [42:56<1:32:35,  4.89it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 72000: Loss = 0.0000


Epoch 2/10:  75%|███████▍  | 74003/99157 [44:11<41:58,  9.99it/s, train_loss=0.0000]  

[Eval] Epoch 2, Step 74000: Loss = 0.0000


Epoch 2/10:  77%|███████▋  | 76006/99157 [45:22<34:55, 11.05it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 76000: Loss = 0.0000


Epoch 2/10:  79%|███████▊  | 78005/99157 [46:30<37:23,  9.43it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 78000: Loss = 0.0005


Epoch 2/10:  81%|████████  | 80004/99157 [47:38<35:53,  8.89it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 80000: Loss = 0.0000


Epoch 2/10:  83%|████████▎ | 82004/99157 [48:47<27:12, 10.51it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 82000: Loss = 0.0000


Epoch 2/10:  85%|████████▍ | 84003/99157 [49:57<29:07,  8.67it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 84000: Loss = 0.0000


Epoch 2/10:  87%|████████▋ | 86003/99157 [51:05<27:44,  7.90it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 86000: Loss = 0.0000


Epoch 2/10:  89%|████████▉ | 88004/99157 [52:15<16:54, 11.00it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 88000: Loss = 0.0000


Epoch 2/10:  91%|█████████ | 90004/99157 [53:24<17:11,  8.87it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 90000: Loss = 0.0000


Epoch 2/10:  93%|█████████▎| 92004/99157 [54:33<13:13,  9.02it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 92000: Loss = 0.0000


Epoch 2/10:  95%|█████████▍| 94004/99157 [55:42<08:44,  9.82it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 94000: Loss = 0.0000


Epoch 2/10:  97%|█████████▋| 96005/99157 [56:50<05:55,  8.86it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 96000: Loss = 0.0000


Epoch 2/10:  99%|█████████▉| 98005/99157 [57:58<01:43, 11.15it/s, train_loss=0.0000]

[Eval] Epoch 2, Step 98000: Loss = 0.0000


Epoch 2/10: 100%|██████████| 99157/99157 [58:37<00:00, 28.19it/s, train_loss=0.0000]


Checkpoint saved at ./checkpoints\slm_epoch_2.pt


Epoch 3/10:   2%|▏         | 2006/99157 [01:08<2:23:41, 11.27it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 2000: Loss = 0.0000


Epoch 3/10:   4%|▍         | 4005/99157 [02:16<2:39:43,  9.93it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 4000: Loss = 0.0000


Epoch 3/10:   6%|▌         | 6004/99157 [03:24<2:50:03,  9.13it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 6000: Loss = 0.0000


Epoch 3/10:   8%|▊         | 8003/99157 [04:32<2:33:32,  9.90it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 8000: Loss = 0.0000


Epoch 3/10:  10%|█         | 10004/99157 [05:40<2:27:23, 10.08it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 10000: Loss = 0.0000


Epoch 3/10:  12%|█▏        | 12004/99157 [06:48<2:11:14, 11.07it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 12000: Loss = 0.0000


Epoch 3/10:  14%|█▍        | 14006/99157 [07:55<2:09:42, 10.94it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 14000: Loss = 0.0000


Epoch 3/10:  16%|█▌        | 16004/99157 [09:03<2:09:09, 10.73it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 16000: Loss = 0.0000


Epoch 3/10:  18%|█▊        | 18004/99157 [10:11<2:09:34, 10.44it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 18000: Loss = 0.0000


Epoch 3/10:  20%|██        | 20005/99157 [11:16<2:00:16, 10.97it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 20000: Loss = 0.0000


Epoch 3/10:  22%|██▏       | 22003/99157 [12:22<2:07:52, 10.06it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 22000: Loss = 0.0000


Epoch 3/10:  24%|██▍       | 24004/99157 [13:27<1:47:49, 11.62it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 24000: Loss = 0.0000


Epoch 3/10:  26%|██▌       | 26004/99157 [14:32<1:49:52, 11.10it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 26000: Loss = 0.0000


Epoch 3/10:  28%|██▊       | 28005/99157 [15:37<1:57:35, 10.08it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 28000: Loss = 0.0000


Epoch 3/10:  30%|███       | 30003/99157 [16:42<1:49:24, 10.54it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 30000: Loss = 0.0000


Epoch 3/10:  32%|███▏      | 32006/99157 [17:48<1:43:05, 10.86it/s, train_loss=0.0003]

[Eval] Epoch 3, Step 32000: Loss = 0.0000


Epoch 3/10:  34%|███▍      | 34006/99157 [18:53<1:46:45, 10.17it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 34000: Loss = 0.0000


Epoch 3/10:  36%|███▋      | 36005/99157 [19:58<1:41:33, 10.36it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 36000: Loss = 0.0000


Epoch 3/10:  38%|███▊      | 38004/99157 [21:03<1:37:06, 10.49it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 38000: Loss = 0.0000


Epoch 3/10:  40%|████      | 40004/99157 [22:08<1:34:37, 10.42it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 40000: Loss = 0.0000


Epoch 3/10:  42%|████▏     | 42005/99157 [23:13<1:36:42,  9.85it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 42000: Loss = 0.0000


Epoch 3/10:  44%|████▍     | 44004/99157 [24:18<1:28:31, 10.38it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 44000: Loss = 0.0000


Epoch 3/10:  46%|████▋     | 46003/99157 [25:23<1:26:06, 10.29it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 46000: Loss = 0.0000


Epoch 3/10:  48%|████▊     | 48005/99157 [26:29<1:32:13,  9.24it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 48000: Loss = 0.0000


Epoch 3/10:  50%|█████     | 50003/99157 [27:33<1:18:20, 10.46it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 50000: Loss = 0.0000


Epoch 3/10:  52%|█████▏    | 52004/99157 [28:38<1:09:26, 11.32it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 52000: Loss = 0.0000


Epoch 3/10:  54%|█████▍    | 54004/99157 [29:44<1:11:05, 10.59it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 54000: Loss = 0.0000


Epoch 3/10:  56%|█████▋    | 56003/99157 [30:48<1:10:11, 10.25it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 56000: Loss = 0.0000


Epoch 3/10:  58%|█████▊    | 58005/99157 [31:54<1:04:55, 10.56it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 58000: Loss = 0.0000


Epoch 3/10:  61%|██████    | 60005/99157 [32:59<1:00:35, 10.77it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 60000: Loss = 0.0000


Epoch 3/10:  63%|██████▎   | 62004/99157 [34:04<56:41, 10.92it/s, train_loss=0.0000]  

[Eval] Epoch 3, Step 62000: Loss = 0.0000


Epoch 3/10:  65%|██████▍   | 64006/99157 [35:09<52:52, 11.08it/s, train_loss=0.0000]  

[Eval] Epoch 3, Step 64000: Loss = 0.0000


Epoch 3/10:  67%|██████▋   | 66006/99157 [36:14<49:32, 11.15it/s, train_loss=0.0000]  

[Eval] Epoch 3, Step 66000: Loss = 0.0000


Epoch 3/10:  69%|██████▊   | 68003/99157 [37:19<1:00:49,  8.54it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 68000: Loss = 0.0000


Epoch 3/10:  71%|███████   | 70005/99157 [38:24<43:21, 11.21it/s, train_loss=0.0000]  

[Eval] Epoch 3, Step 70000: Loss = 0.0000


Epoch 3/10:  73%|███████▎  | 72004/99157 [39:29<43:03, 10.51it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 72000: Loss = 0.0000


Epoch 3/10:  75%|███████▍  | 74006/99157 [40:34<36:52, 11.37it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 74000: Loss = 0.0000


Epoch 3/10:  77%|███████▋  | 76003/99157 [41:39<36:19, 10.62it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 76000: Loss = 0.0000


Epoch 3/10:  79%|███████▊  | 78006/99157 [42:45<31:02, 11.36it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 78000: Loss = 0.0000


Epoch 3/10:  81%|████████  | 80005/99157 [43:49<28:10, 11.33it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 80000: Loss = 0.0000


Epoch 3/10:  83%|████████▎ | 82004/99157 [44:55<26:36, 10.74it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 82000: Loss = 0.0000


Epoch 3/10:  85%|████████▍ | 84006/99157 [46:01<22:46, 11.09it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 84000: Loss = 0.0000


Epoch 3/10:  87%|████████▋ | 86003/99157 [47:06<22:48,  9.61it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 86000: Loss = 0.0000


Epoch 3/10:  89%|████████▉ | 88005/99157 [48:12<18:06, 10.26it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 88000: Loss = 0.0000


Epoch 3/10:  91%|█████████ | 90004/99157 [49:17<13:41, 11.14it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 90000: Loss = 0.0000


Epoch 3/10:  93%|█████████▎| 92004/99157 [50:22<11:24, 10.44it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 92000: Loss = 0.0000


Epoch 3/10:  95%|█████████▍| 94004/99157 [51:28<07:59, 10.75it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 94000: Loss = 0.0000


Epoch 3/10:  97%|█████████▋| 96005/99157 [52:33<04:37, 11.34it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 96000: Loss = 0.0000


Epoch 3/10:  99%|█████████▉| 98006/99157 [53:38<01:43, 11.10it/s, train_loss=0.0000]

[Eval] Epoch 3, Step 98000: Loss = 0.0000


Epoch 3/10: 100%|██████████| 99157/99157 [54:15<00:00, 30.46it/s, train_loss=0.0000]


Checkpoint saved at ./checkpoints\slm_epoch_3.pt


Epoch 4/10:   2%|▏         | 2005/99157 [01:06<2:49:52,  9.53it/s, train_loss=0.0016]

[Eval] Epoch 4, Step 2000: Loss = 0.0000


Epoch 4/10:   4%|▍         | 4004/99157 [02:11<2:33:42, 10.32it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 4000: Loss = 0.0000


Epoch 4/10:   6%|▌         | 6006/99157 [03:16<2:35:46,  9.97it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 6000: Loss = 0.0000


Epoch 4/10:   8%|▊         | 8003/99157 [04:21<2:55:13,  8.67it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 8000: Loss = 0.0000


Epoch 4/10:  10%|█         | 10003/99157 [05:27<2:32:30,  9.74it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 10000: Loss = 0.0000


Epoch 4/10:  12%|█▏        | 12004/99157 [06:33<2:12:43, 10.94it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 12000: Loss = 0.0000


Epoch 4/10:  14%|█▍        | 14006/99157 [07:38<2:05:17, 11.33it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 14000: Loss = 0.0000


Epoch 4/10:  16%|█▌        | 16004/99157 [08:43<2:09:45, 10.68it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 16000: Loss = 0.0000


Epoch 4/10:  18%|█▊        | 18004/99157 [09:48<2:16:47,  9.89it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 18000: Loss = 0.0000


Epoch 4/10:  20%|██        | 20006/99157 [10:53<1:58:46, 11.11it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 20000: Loss = 0.0000


Epoch 4/10:  22%|██▏       | 22005/99157 [11:59<1:58:17, 10.87it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 22000: Loss = 0.0000


Epoch 4/10:  24%|██▍       | 24004/99157 [13:04<1:54:37, 10.93it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 24000: Loss = 0.0000


Epoch 4/10:  26%|██▌       | 26007/99157 [14:10<1:47:07, 11.38it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 26000: Loss = 0.0000


Epoch 4/10:  28%|██▊       | 28004/99157 [15:15<1:47:20, 11.05it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 28000: Loss = 0.0000


Epoch 4/10:  30%|███       | 30003/99157 [16:20<2:15:52,  8.48it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 30000: Loss = 0.0000


Epoch 4/10:  32%|███▏      | 32005/99157 [17:26<1:59:33,  9.36it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 32000: Loss = 0.0000


Epoch 4/10:  34%|███▍      | 34004/99157 [18:31<1:40:59, 10.75it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 34000: Loss = 0.0000


Epoch 4/10:  36%|███▋      | 36005/99157 [19:36<1:35:12, 11.06it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 36000: Loss = 0.0000


Epoch 4/10:  38%|███▊      | 38007/99157 [20:43<1:30:49, 11.22it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 38000: Loss = 0.0000


Epoch 4/10:  40%|████      | 40004/99157 [21:48<1:25:44, 11.50it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 40000: Loss = 0.0000


Epoch 4/10:  42%|████▏     | 42006/99157 [22:53<1:33:10, 10.22it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 42000: Loss = 0.0000


Epoch 4/10:  44%|████▍     | 44005/99157 [23:58<1:25:59, 10.69it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 44000: Loss = 0.0000


Epoch 4/10:  46%|████▋     | 46003/99157 [25:04<1:43:27,  8.56it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 46000: Loss = 0.0000


Epoch 4/10:  48%|████▊     | 48006/99157 [26:10<1:15:25, 11.30it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 48000: Loss = 0.0000


Epoch 4/10:  50%|█████     | 50004/99157 [27:15<1:12:24, 11.31it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 50000: Loss = 0.0000


Epoch 4/10:  52%|█████▏    | 52004/99157 [28:21<1:07:58, 11.56it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 52000: Loss = 0.0000


Epoch 4/10:  54%|█████▍    | 54004/99157 [29:26<1:12:56, 10.32it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 54000: Loss = 0.0000


Epoch 4/10:  56%|█████▋    | 56004/99157 [30:31<1:09:58, 10.28it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 56000: Loss = 0.0000


Epoch 4/10:  58%|█████▊    | 58006/99157 [31:37<1:03:13, 10.85it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 58000: Loss = 0.0000


Epoch 4/10:  61%|██████    | 60003/99157 [32:45<1:12:52,  8.95it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 60000: Loss = 0.0000


Epoch 4/10:  63%|██████▎   | 62005/99157 [33:49<53:41, 11.53it/s, train_loss=0.0000]  

[Eval] Epoch 4, Step 62000: Loss = 0.0000


Epoch 4/10:  65%|██████▍   | 64005/99157 [34:53<52:51, 11.08it/s, train_loss=0.0000]  

[Eval] Epoch 4, Step 64000: Loss = 0.0000


Epoch 4/10:  67%|██████▋   | 66004/99157 [35:58<47:36, 11.61it/s, train_loss=0.0000]  

[Eval] Epoch 4, Step 66000: Loss = 0.0000


Epoch 4/10:  69%|██████▊   | 68004/99157 [37:02<52:20,  9.92it/s, train_loss=0.0000]  

[Eval] Epoch 4, Step 68000: Loss = 0.0000


Epoch 4/10:  71%|███████   | 70003/99157 [38:06<44:54, 10.82it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 70000: Loss = 0.0000


Epoch 4/10:  73%|███████▎  | 72004/99157 [39:10<40:59, 11.04it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 72000: Loss = 0.0000


Epoch 4/10:  75%|███████▍  | 74006/99157 [40:14<39:42, 10.56it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 74000: Loss = 0.0000


Epoch 4/10:  77%|███████▋  | 76003/99157 [41:17<42:42,  9.03it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 76000: Loss = 0.0000


Epoch 4/10:  79%|███████▊  | 78004/99157 [42:21<30:20, 11.62it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 78000: Loss = 0.0000


Epoch 4/10:  81%|████████  | 80006/99157 [43:24<27:30, 11.61it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 80000: Loss = 0.0000


Epoch 4/10:  83%|████████▎ | 82004/99157 [44:27<25:38, 11.15it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 82000: Loss = 0.0000


Epoch 4/10:  85%|████████▍ | 84004/99157 [45:31<23:29, 10.75it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 84000: Loss = 0.0000


Epoch 4/10:  87%|████████▋ | 86004/99157 [46:35<21:04, 10.40it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 86000: Loss = 0.0000


Epoch 4/10:  89%|████████▉ | 88003/99157 [47:38<20:25,  9.10it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 88000: Loss = 0.0000


Epoch 4/10:  91%|█████████ | 90004/99157 [48:42<14:48, 10.31it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 90000: Loss = 0.0000


Epoch 4/10:  93%|█████████▎| 92004/99157 [49:46<10:24, 11.46it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 92000: Loss = 0.0000


Epoch 4/10:  95%|█████████▍| 94007/99157 [50:49<07:35, 11.30it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 94000: Loss = 0.0000


Epoch 4/10:  97%|█████████▋| 96006/99157 [51:53<04:52, 10.79it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 96000: Loss = 0.0000


Epoch 4/10:  99%|█████████▉| 98005/99157 [52:58<01:55,  9.98it/s, train_loss=0.0000]

[Eval] Epoch 4, Step 98000: Loss = 0.0000


Epoch 4/10: 100%|██████████| 99157/99157 [53:34<00:00, 30.85it/s, train_loss=0.0000]


Checkpoint saved at ./checkpoints\slm_epoch_4.pt


Epoch 5/10:   2%|▏         | 2004/99157 [01:03<2:26:42, 11.04it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 2000: Loss = 0.0000


Epoch 5/10:   4%|▍         | 4004/99157 [02:06<2:15:59, 11.66it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 4000: Loss = 0.0000


Epoch 5/10:   6%|▌         | 6004/99157 [03:09<2:27:58, 10.49it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 6000: Loss = 0.0000


Epoch 5/10:   8%|▊         | 8005/99157 [04:14<2:31:53, 10.00it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 8000: Loss = 0.0000


Epoch 5/10:  10%|█         | 10004/99157 [05:18<2:52:48,  8.60it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 10000: Loss = 0.0000


Epoch 5/10:  12%|█▏        | 12004/99157 [06:21<2:08:14, 11.33it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 12000: Loss = 0.0000


Epoch 5/10:  14%|█▍        | 14003/99157 [07:25<2:25:03,  9.78it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 14000: Loss = 0.0000


Epoch 5/10:  16%|█▌        | 16003/99157 [08:28<2:31:22,  9.16it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 16000: Loss = 0.0000


Epoch 5/10:  18%|█▊        | 18004/99157 [09:32<2:02:20, 11.05it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 18000: Loss = 0.0000


Epoch 5/10:  20%|██        | 20004/99157 [10:36<2:06:15, 10.45it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 20000: Loss = 0.0000


Epoch 5/10:  22%|██▏       | 22004/99157 [11:40<1:58:09, 10.88it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 22000: Loss = 0.0000


Epoch 5/10:  24%|██▍       | 24005/99157 [12:43<2:06:13,  9.92it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 24000: Loss = 0.0000


Epoch 5/10:  26%|██▌       | 26004/99157 [13:46<1:53:06, 10.78it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 26000: Loss = 0.0000


Epoch 5/10:  28%|██▊       | 28003/99157 [14:49<1:59:08,  9.95it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 28000: Loss = 0.0000


Epoch 5/10:  30%|███       | 30004/99157 [15:53<1:43:14, 11.16it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 30000: Loss = 0.0000


Epoch 5/10:  32%|███▏      | 32006/99157 [16:56<1:36:30, 11.60it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 32000: Loss = 0.0000


Epoch 5/10:  34%|███▍      | 34005/99157 [18:00<1:44:00, 10.44it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 34000: Loss = 0.0000


Epoch 5/10:  36%|███▋      | 36004/99157 [19:03<1:31:46, 11.47it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 36000: Loss = 0.0000


Epoch 5/10:  38%|███▊      | 38003/99157 [20:07<1:52:35,  9.05it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 38000: Loss = 0.0000


Epoch 5/10:  40%|████      | 40005/99157 [21:10<1:30:03, 10.95it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 40000: Loss = 0.0000


Epoch 5/10:  42%|████▏     | 42005/99157 [22:14<1:25:14, 11.18it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 42000: Loss = 0.0000


Epoch 5/10:  44%|████▍     | 44004/99157 [23:17<1:25:49, 10.71it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 44000: Loss = 0.0000


Epoch 5/10:  46%|████▋     | 46006/99157 [24:20<1:16:40, 11.55it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 46000: Loss = 0.0000


Epoch 5/10:  48%|████▊     | 48003/99157 [25:24<1:20:06, 10.64it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 48000: Loss = 0.0000


Epoch 5/10:  50%|█████     | 50002/99157 [26:27<1:29:33,  9.15it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 50000: Loss = 0.0000


Epoch 5/10:  52%|█████▏    | 52006/99157 [27:30<1:14:25, 10.56it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 52000: Loss = 0.0000


Epoch 5/10:  54%|█████▍    | 54005/99157 [28:33<1:10:37, 10.65it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 54000: Loss = 0.0000


Epoch 5/10:  56%|█████▋    | 56004/99157 [29:36<1:03:07, 11.39it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 56000: Loss = 0.0000


Epoch 5/10:  58%|█████▊    | 58004/99157 [30:39<59:30, 11.53it/s, train_loss=0.0000]  

[Eval] Epoch 5, Step 58000: Loss = 0.0000


Epoch 5/10:  61%|██████    | 60004/99157 [31:43<1:05:06, 10.02it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 60000: Loss = 0.0000


Epoch 5/10:  63%|██████▎   | 62006/99157 [32:47<1:00:31, 10.23it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 62000: Loss = 0.0000


Epoch 5/10:  65%|██████▍   | 64006/99157 [33:51<50:50, 11.52it/s, train_loss=0.0000]  

[Eval] Epoch 5, Step 64000: Loss = 0.0000


Epoch 5/10:  67%|██████▋   | 66005/99157 [34:54<47:35, 11.61it/s, train_loss=0.0000]  

[Eval] Epoch 5, Step 66000: Loss = 0.0000


Epoch 5/10:  69%|██████▊   | 68005/99157 [35:58<53:03,  9.78it/s, train_loss=0.0000]  

[Eval] Epoch 5, Step 68000: Loss = 0.0000


Epoch 5/10:  71%|███████   | 70004/99157 [37:01<45:37, 10.65it/s, train_loss=0.0000]  

[Eval] Epoch 5, Step 70000: Loss = 0.0000


Epoch 5/10:  73%|███████▎  | 72003/99157 [38:05<51:15,  8.83it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 72000: Loss = 0.0000


Epoch 5/10:  75%|███████▍  | 74005/99157 [39:09<36:27, 11.50it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 74000: Loss = 0.0000


Epoch 5/10:  77%|███████▋  | 76004/99157 [40:12<32:10, 11.99it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 76000: Loss = 0.0000


Epoch 5/10:  79%|███████▊  | 78005/99157 [41:15<36:50,  9.57it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 78000: Loss = 0.0000


Epoch 5/10:  81%|████████  | 80003/99157 [42:19<29:11, 10.94it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 80000: Loss = 0.0000


Epoch 5/10:  83%|████████▎ | 82006/99157 [43:22<25:06, 11.38it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 82000: Loss = 0.0000


Epoch 5/10:  85%|████████▍ | 84003/99157 [44:26<24:26, 10.33it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 84000: Loss = 0.0000


Epoch 5/10:  87%|████████▋ | 86006/99157 [45:29<19:39, 11.15it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 86000: Loss = 0.0000


Epoch 5/10:  89%|████████▉ | 88006/99157 [46:32<16:52, 11.02it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 88000: Loss = 0.0000


Epoch 5/10:  91%|█████████ | 90006/99157 [47:36<13:17, 11.48it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 90000: Loss = 0.0000


Epoch 5/10:  93%|█████████▎| 92005/99157 [48:40<10:08, 11.74it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 92000: Loss = 0.0000


Epoch 5/10:  95%|█████████▍| 94004/99157 [49:43<07:50, 10.95it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 94000: Loss = 0.0000


Epoch 5/10:  97%|█████████▋| 96006/99157 [50:46<04:35, 11.43it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 96000: Loss = 0.0000


Epoch 5/10:  99%|█████████▉| 98007/99157 [51:50<01:38, 11.63it/s, train_loss=0.0000]

[Eval] Epoch 5, Step 98000: Loss = 0.0000


Epoch 5/10: 100%|██████████| 99157/99157 [52:25<00:00, 31.52it/s, train_loss=0.0000]


Checkpoint saved at ./checkpoints\slm_epoch_5.pt


Epoch 6/10:   2%|▏         | 2006/99157 [01:02<2:32:58, 10.58it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 2000: Loss = 0.0000


Epoch 6/10:   4%|▍         | 4005/99157 [02:05<2:17:24, 11.54it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 4000: Loss = 0.0000


Epoch 6/10:   6%|▌         | 6004/99157 [03:08<2:20:36, 11.04it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 6000: Loss = 0.0000


Epoch 6/10:   8%|▊         | 8004/99157 [04:10<2:09:03, 11.77it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 8000: Loss = 0.0000


Epoch 6/10:  10%|█         | 10005/99157 [05:13<2:13:21, 11.14it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 10000: Loss = 0.0000


Epoch 6/10:  12%|█▏        | 12005/99157 [06:16<2:02:56, 11.82it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 12000: Loss = 0.0000


Epoch 6/10:  14%|█▍        | 14004/99157 [07:18<2:02:01, 11.63it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 14000: Loss = 0.0000


Epoch 6/10:  16%|█▌        | 16006/99157 [08:20<1:55:44, 11.97it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 16000: Loss = 0.0000


Epoch 6/10:  18%|█▊        | 18004/99157 [09:22<1:55:26, 11.72it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 18000: Loss = 0.0000


Epoch 6/10:  20%|██        | 20004/99157 [10:25<2:03:57, 10.64it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 20000: Loss = 0.0000


Epoch 6/10:  22%|██▏       | 22004/99157 [11:27<1:44:49, 12.27it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 22000: Loss = 0.0000


Epoch 6/10:  24%|██▍       | 24007/99157 [12:30<1:51:36, 11.22it/s, train_loss=0.0000]

[Eval] Epoch 6, Step 24000: Loss = 0.0000


Epoch 6/10:  26%|██▌       | 26003/99157 [6:12:27<2:02:59,  9.91it/s, train_loss=0.0000]      

[Eval] Epoch 6, Step 26000: Loss = 0.0000


Epoch 6/10:  28%|██▊       | 27595/99157 [6:13:22<16:08:17,  1.23it/s, train_loss=0.0000]


KeyboardInterrupt: 

In [None]:
total_tokens = 0
for batch in train_loader:
    total_tokens += batch['input_ids'].numel()
    print(f"Processed {total_tokens:,} tokens", end='\r')
print(f"Total tokens in dataset: {total_tokens:,}")


Total tokens in dataset: 77,738,745
