In [1]:
"""
Implements a simple n-gram language model in PyTorch.
Acts as the correctness reference for all the other versions.
"""
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from common import RNG

## Hyperparameter 

In [3]:
context_length = 3 # if 3 tokens predict the 4th, this is a 4-gram model
embedding_size = 64
hidden_size = 512

learning_rate = 1e-3
batch_size = 64
num_steps = 50000

## MLP

In [4]:
class MLP(nn.Module):
    def __init__(self, vocab_size, context_length, embedding_size, hidden_size):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, embedding_size)
        self.mlp = nn.Sequential(
            nn.Linear(context_length * embedding_size, hidden_size),
            nn.Tanh(),
            nn.GELU(),
            nn.Linear(hidden_size, vocab_size)
        )

    def forward(self, idx, targets=None):
        B, T = idx.size()
        emb = self.wte(idx)
        emb = emb.view(B, -1)
        logits = self.mlp(emb)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits, targets)
        return logits, loss

In [5]:
def dataloader(tokens, context_length, batch_size, device):
    n = len(tokens)
    inputs, targets = [], []
    pos = 0
    while True:
        window = tokens[pos:pos + context_length + 1]
        inputs.append(window[:-1])
        targets.append(window[-1])
        if len(inputs) == batch_size:
            yield (torch.tensor(inputs, device=device), torch.tensor(targets, device=device))
            inputs, targets = [], []
        pos += 1
        if pos + context_length >= n:
            pos = 0

In [6]:
def eval_split(model, tokens, context_length, batch_size, device, max_batches=None):
    model.eval()
    total_loss = 0
    num_batches = len(tokens) // batch_size
    if max_batches is not None:
        num_batches = min(num_batches, max_batches)
    data_iter = dataloader(tokens, context_length, batch_size, device)
    for _ in range(num_batches):
        inputs, targets = next(data_iter)
        with torch.no_grad():
            logits, loss = model(inputs, targets)
        total_loss += loss.item()
    mean_loss = total_loss / num_batches
    return mean_loss

## Training

In [7]:
def train(model, train_tokens, val_tokens, context_length, batch_size, num_steps, learning_rate, device):
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    train_data_iter = dataloader(train_tokens, context_length, batch_size, device)
    for step in range(num_steps):
        lr = learning_rate * 0.5 * (1 + math.cos(math.pi * step / num_steps))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        if step % 200 == 0 or step == num_steps - 1:
            train_loss = eval_split(model, train_tokens, context_length, batch_size, device, max_batches=20)
            val_loss = eval_split(model, val_tokens, context_length, batch_size, device)
            print(f'step {step} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f} | lr {lr:e}')
        model.train()
        inputs, targets = next(train_data_iter)
        logits, loss = model(inputs, targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return val_loss

In [8]:
random = RNG(1337)
# TODO: actually use this rng for the model initialization


## (optional) Optimize Hyperparameter

In [9]:
def objective(trial):
    # Define the hyperparameter search space
    embedding_size = trial.suggest_int('embedding_size', 16, 64)
    hidden_size = trial.suggest_int('hidden_size', 256, 1024)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    context_length = trial.suggest_int('context_length', 2, 5)

    # Load data and preprocess
    train_text = open('data/train.txt', 'r').read()
    uchars = sorted(list(set(train_text)))
    vocab_size = len(uchars)
    char_to_token = {c: i for i, c in enumerate(uchars)}
    train_tokens = [char_to_token[c] for c in train_text]
    val_tokens = [char_to_token[c] for c in open('data/val.txt', 'r').read()]

    # Create the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MLP(vocab_size, context_length, embedding_size, hidden_size).to(device)

    # Train the model
    num_steps = 50000
    val_loss = train(model, train_tokens, val_tokens, context_length, batch_size, num_steps, learning_rate, device)
    
    return val_loss

In [10]:
if __name__ == "__main__":
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)
    
    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

[I 2024-07-19 22:00:11,261] A new study created in memory with name: no-name-f27561d9-03d1-4777-a7c7-83960f9f0297
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)


step 0 | train_loss 3.3023 | val_loss 3.3001 | lr 3.140955e-04
step 200 | train_loss 2.3936 | val_loss 2.4220 | lr 3.140831e-04
step 400 | train_loss 2.3212 | val_loss 2.3475 | lr 3.140459e-04
step 600 | train_loss 2.2930 | val_loss 2.3173 | lr 3.139839e-04
step 800 | train_loss 2.2743 | val_loss 2.2917 | lr 3.138971e-04
step 1000 | train_loss 2.2544 | val_loss 2.2694 | lr 3.137856e-04
step 1200 | train_loss 2.2414 | val_loss 2.2576 | lr 3.136493e-04
step 1400 | train_loss 2.2335 | val_loss 2.2465 | lr 3.134883e-04
step 1600 | train_loss 2.2218 | val_loss 2.2352 | lr 3.133026e-04
step 1800 | train_loss 2.1903 | val_loss 2.2264 | lr 3.130922e-04
step 2000 | train_loss 2.1905 | val_loss 2.2186 | lr 3.128571e-04
step 2200 | train_loss 2.1830 | val_loss 2.2111 | lr 3.125975e-04
step 2400 | train_loss 2.1769 | val_loss 2.2046 | lr 3.123133e-04
step 2600 | train_loss 2.1821 | val_loss 2.1989 | lr 3.120046e-04
step 2800 | train_loss 2.1704 | val_loss 2.1931 | lr 3.116714e-04
step 3000 | train

[I 2024-07-19 22:01:11,896] Trial 0 finished with value: 2.056985463414873 and parameters: {'embedding_size': 47, 'hidden_size': 738, 'learning_rate': 0.0003140954960818081, 'batch_size': 128, 'context_length': 3}. Best is trial 0 with value: 2.056985463414873.


step 49999 | train_loss 1.9644 | val_loss 2.0570 | lr 3.099998e-13
step 0 | train_loss 3.3026 | val_loss 3.3027 | lr 1.288762e-04


  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)


step 200 | train_loss 2.6088 | val_loss 2.6082 | lr 1.288711e-04
step 400 | train_loss 2.5009 | val_loss 2.5000 | lr 1.288559e-04
step 600 | train_loss 2.4483 | val_loss 2.4552 | lr 1.288304e-04
step 800 | train_loss 2.4207 | val_loss 2.4291 | lr 1.287948e-04
step 1000 | train_loss 2.4082 | val_loss 2.4127 | lr 1.287490e-04
step 1200 | train_loss 2.3910 | val_loss 2.4007 | lr 1.286931e-04
step 1400 | train_loss 2.3781 | val_loss 2.3891 | lr 1.286271e-04
step 1600 | train_loss 2.3751 | val_loss 2.3806 | lr 1.285509e-04
step 1800 | train_loss 2.3689 | val_loss 2.3719 | lr 1.284645e-04
step 2000 | train_loss 2.3611 | val_loss 2.3642 | lr 1.283681e-04
step 2200 | train_loss 2.3590 | val_loss 2.3601 | lr 1.282616e-04
step 2400 | train_loss 2.3554 | val_loss 2.3543 | lr 1.281449e-04
step 2600 | train_loss 2.3486 | val_loss 2.3522 | lr 1.280183e-04
step 2800 | train_loss 2.3492 | val_loss 2.3452 | lr 1.278816e-04
step 3000 | train_loss 2.3451 | val_loss 2.3402 | lr 1.277348e-04
step 3200 | tr

[I 2024-07-19 22:02:10,859] Trial 1 finished with value: 2.247021347284317 and parameters: {'embedding_size': 37, 'hidden_size': 913, 'learning_rate': 0.00012887620370903477, 'batch_size': 64, 'context_length': 2}. Best is trial 0 with value: 2.056985463414873.


step 49999 | train_loss 2.2518 | val_loss 2.2470 | lr 1.271957e-13
step 0 | train_loss 3.3209 | val_loss 3.3157 | lr 5.838780e-03
step 200 | train_loss 2.3884 | val_loss 2.3656 | lr 5.838549e-03
step 400 | train_loss 2.2926 | val_loss 2.2995 | lr 5.837858e-03
step 600 | train_loss 2.2929 | val_loss 2.2892 | lr 5.836705e-03
step 800 | train_loss 2.2926 | val_loss 2.2532 | lr 5.835092e-03
step 1000 | train_loss 2.2866 | val_loss 2.2650 | lr 5.833019e-03
step 1200 | train_loss 2.2516 | val_loss 2.2607 | lr 5.830485e-03
step 1400 | train_loss 2.2764 | val_loss 2.2481 | lr 5.827492e-03
step 1600 | train_loss 2.2445 | val_loss 2.2244 | lr 5.824040e-03
step 1800 | train_loss 2.2439 | val_loss 2.2298 | lr 5.820129e-03
step 2000 | train_loss 2.2449 | val_loss 2.2285 | lr 5.815759e-03
step 2200 | train_loss 2.2232 | val_loss 2.2228 | lr 5.810933e-03
step 2400 | train_loss 2.2354 | val_loss 2.2142 | lr 5.805650e-03
step 2600 | train_loss 2.2188 | val_loss 2.2134 | lr 5.799911e-03
step 2800 | trai

[I 2024-07-19 22:03:15,179] Trial 2 finished with value: 2.0138411607061113 and parameters: {'embedding_size': 38, 'hidden_size': 349, 'learning_rate': 0.005838779662173129, 'batch_size': 64, 'context_length': 4}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 1.8828 | val_loss 2.0138 | lr 5.762645e-12
step 0 | train_loss 3.2858 | val_loss 3.2868 | lr 1.265469e-04
step 200 | train_loss 2.6541 | val_loss 2.6628 | lr 1.265419e-04
step 400 | train_loss 2.5204 | val_loss 2.5331 | lr 1.265269e-04
step 600 | train_loss 2.4597 | val_loss 2.4757 | lr 1.265020e-04
step 800 | train_loss 2.4237 | val_loss 2.4398 | lr 1.264670e-04
step 1000 | train_loss 2.3986 | val_loss 2.4140 | lr 1.264221e-04
step 1200 | train_loss 2.3758 | val_loss 2.3943 | lr 1.263672e-04
step 1400 | train_loss 2.3606 | val_loss 2.3793 | lr 1.263023e-04
step 1600 | train_loss 2.3481 | val_loss 2.3653 | lr 1.262275e-04
step 1800 | train_loss 2.3279 | val_loss 2.3541 | lr 1.261427e-04
step 2000 | train_loss 2.3188 | val_loss 2.3445 | lr 1.260480e-04
step 2200 | train_loss 2.3109 | val_loss 2.3346 | lr 1.259434e-04
step 2400 | train_loss 2.3017 | val_loss 2.3266 | lr 1.258289e-04
step 2600 | train_loss 2.2976 | val_loss 2.3183 | lr 1.257045e-04
step 2800 | trai

[I 2024-07-19 22:04:20,117] Trial 3 finished with value: 2.1294838977711543 and parameters: {'embedding_size': 26, 'hidden_size': 494, 'learning_rate': 0.0001265469285635071, 'batch_size': 128, 'context_length': 3}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.0851 | val_loss 2.1295 | lr 1.248968e-13
step 0 | train_loss 3.3011 | val_loss 3.3012 | lr 5.006369e-04
step 200 | train_loss 2.3733 | val_loss 2.4022 | lr 5.006171e-04
step 400 | train_loss 2.3125 | val_loss 2.3398 | lr 5.005578e-04
step 600 | train_loss 2.2856 | val_loss 2.3128 | lr 5.004590e-04
step 800 | train_loss 2.2710 | val_loss 2.2871 | lr 5.003207e-04
step 1000 | train_loss 2.2531 | val_loss 2.2647 | lr 5.001429e-04
step 1200 | train_loss 2.2356 | val_loss 2.2525 | lr 4.999257e-04
step 1400 | train_loss 2.2293 | val_loss 2.2457 | lr 4.996690e-04
step 1600 | train_loss 2.2189 | val_loss 2.2294 | lr 4.993730e-04
step 1800 | train_loss 2.1820 | val_loss 2.2232 | lr 4.990377e-04
step 2000 | train_loss 2.1840 | val_loss 2.2159 | lr 4.986630e-04
step 2200 | train_loss 2.1746 | val_loss 2.2069 | lr 4.982492e-04
step 2400 | train_loss 2.1729 | val_loss 2.2024 | lr 4.977962e-04
step 2600 | train_loss 2.1781 | val_loss 2.1968 | lr 4.973041e-04
step 2800 | trai

[I 2024-07-19 22:05:26,384] Trial 4 finished with value: 2.0618603144373213 and parameters: {'embedding_size': 27, 'hidden_size': 824, 'learning_rate': 0.0005006368655812558, 'batch_size': 128, 'context_length': 3}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 1.9511 | val_loss 2.0619 | lr 4.941088e-13
step 0 | train_loss 3.3278 | val_loss 3.3295 | lr 2.115794e-04
step 200 | train_loss 2.5743 | val_loss 2.5863 | lr 2.115710e-04
step 400 | train_loss 2.4985 | val_loss 2.4893 | lr 2.115460e-04
step 600 | train_loss 2.4705 | val_loss 2.4537 | lr 2.115042e-04
step 800 | train_loss 2.4455 | val_loss 2.4276 | lr 2.114458e-04
step 1000 | train_loss 2.4263 | val_loss 2.4080 | lr 2.113706e-04
step 1200 | train_loss 2.3916 | val_loss 2.3959 | lr 2.112788e-04
step 1400 | train_loss 2.3933 | val_loss 2.3887 | lr 2.111703e-04
step 1600 | train_loss 2.3917 | val_loss 2.3744 | lr 2.110452e-04
step 1800 | train_loss 2.3851 | val_loss 2.3737 | lr 2.109035e-04
step 2000 | train_loss 2.3820 | val_loss 2.3665 | lr 2.107452e-04
step 2200 | train_loss 2.3794 | val_loss 2.3628 | lr 2.105703e-04
step 2400 | train_loss 2.3584 | val_loss 2.3553 | lr 2.103788e-04
step 2600 | train_loss 2.3580 | val_loss 2.3482 | lr 2.101709e-04
step 2800 | trai

[I 2024-07-19 22:06:29,167] Trial 5 finished with value: 2.243515248809542 and parameters: {'embedding_size': 51, 'hidden_size': 597, 'learning_rate': 0.00021157937340316298, 'batch_size': 32, 'context_length': 2}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.2418 | val_loss 2.2435 | lr 2.088205e-13
step 0 | train_loss 3.3253 | val_loss 3.3269 | lr 4.124402e-03
step 200 | train_loss 2.3704 | val_loss 2.3981 | lr 4.124240e-03
step 400 | train_loss 2.3647 | val_loss 2.3683 | lr 4.123751e-03
step 600 | train_loss 2.3279 | val_loss 2.3587 | lr 4.122937e-03
step 800 | train_loss 2.3119 | val_loss 2.3292 | lr 4.121798e-03
step 1000 | train_loss 2.3239 | val_loss 2.3418 | lr 4.120333e-03
step 1200 | train_loss 2.3158 | val_loss 2.3335 | lr 4.118543e-03
step 1400 | train_loss 2.3012 | val_loss 2.3168 | lr 4.116429e-03
step 1600 | train_loss 2.3360 | val_loss 2.3280 | lr 4.113990e-03
step 1800 | train_loss 2.2562 | val_loss 2.3102 | lr 4.111228e-03
step 2000 | train_loss 2.2574 | val_loss 2.2928 | lr 4.108141e-03
step 2200 | train_loss 2.2803 | val_loss 2.3061 | lr 4.104732e-03
step 2400 | train_loss 2.2669 | val_loss 2.2976 | lr 4.101000e-03
step 2600 | train_loss 2.2904 | val_loss 2.3113 | lr 4.096946e-03
step 2800 | trai

[I 2024-07-19 22:07:27,227] Trial 6 finished with value: 2.2341749710696086 and parameters: {'embedding_size': 40, 'hidden_size': 854, 'learning_rate': 0.004124402324038016, 'batch_size': 128, 'context_length': 2}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.1911 | val_loss 2.2342 | lr 4.070622e-12
step 0 | train_loss 3.3170 | val_loss 3.3171 | lr 1.176443e-04
step 200 | train_loss 2.7022 | val_loss 2.7229 | lr 1.176397e-04
step 400 | train_loss 2.5648 | val_loss 2.5713 | lr 1.176257e-04
step 600 | train_loss 2.5080 | val_loss 2.5052 | lr 1.176025e-04
step 800 | train_loss 2.4773 | val_loss 2.4657 | lr 1.175700e-04
step 1000 | train_loss 2.4536 | val_loss 2.4372 | lr 1.175282e-04
step 1200 | train_loss 2.4219 | val_loss 2.4186 | lr 1.174772e-04
step 1400 | train_loss 2.4109 | val_loss 2.4057 | lr 1.174169e-04
step 1600 | train_loss 2.4049 | val_loss 2.3897 | lr 1.173473e-04
step 1800 | train_loss 2.3945 | val_loss 2.3820 | lr 1.172685e-04
step 2000 | train_loss 2.3872 | val_loss 2.3712 | lr 1.171805e-04
step 2200 | train_loss 2.3807 | val_loss 2.3638 | lr 1.170832e-04
step 2400 | train_loss 2.3692 | val_loss 2.3568 | lr 1.169768e-04
step 2600 | train_loss 2.3595 | val_loss 2.3488 | lr 1.168611e-04
step 2800 | trai

[I 2024-07-19 22:08:32,775] Trial 7 finished with value: 2.162983306284462 and parameters: {'embedding_size': 33, 'hidden_size': 710, 'learning_rate': 0.00011764429463011084, 'batch_size': 32, 'context_length': 3}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.1579 | val_loss 2.1630 | lr 1.161103e-13
step 0 | train_loss 3.3282 | val_loss 3.3295 | lr 1.571625e-03
step 200 | train_loss 2.4244 | val_loss 2.4242 | lr 1.571563e-03
step 400 | train_loss 2.3718 | val_loss 2.3758 | lr 1.571377e-03
step 600 | train_loss 2.3495 | val_loss 2.3547 | lr 1.571067e-03
step 800 | train_loss 2.3617 | val_loss 2.3392 | lr 1.570632e-03
step 1000 | train_loss 2.3677 | val_loss 2.3372 | lr 1.570074e-03
step 1200 | train_loss 2.3256 | val_loss 2.3312 | lr 1.569392e-03
step 1400 | train_loss 2.3363 | val_loss 2.3182 | lr 1.568587e-03
step 1600 | train_loss 2.3336 | val_loss 2.3231 | lr 1.567657e-03
step 1800 | train_loss 2.3378 | val_loss 2.3144 | lr 1.566605e-03
step 2000 | train_loss 2.3103 | val_loss 2.3053 | lr 1.565429e-03
step 2200 | train_loss 2.3191 | val_loss 2.3070 | lr 1.564129e-03
step 2400 | train_loss 2.3198 | val_loss 2.3011 | lr 1.562707e-03
step 2600 | train_loss 2.3100 | val_loss 2.2994 | lr 1.561163e-03
step 2800 | trai

[I 2024-07-19 22:09:35,397] Trial 8 finished with value: 2.2319393126027927 and parameters: {'embedding_size': 40, 'hidden_size': 463, 'learning_rate': 0.0015716249077989233, 'batch_size': 64, 'context_length': 2}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.2189 | val_loss 2.2319 | lr 1.551132e-12
step 0 | train_loss 3.3138 | val_loss 3.3137 | lr 1.871399e-04
step 200 | train_loss 2.5141 | val_loss 2.5152 | lr 1.871325e-04
step 400 | train_loss 2.4370 | val_loss 2.4364 | lr 1.871104e-04
step 600 | train_loss 2.3973 | val_loss 2.4008 | lr 1.870734e-04
step 800 | train_loss 2.3860 | val_loss 2.3807 | lr 1.870217e-04
step 1000 | train_loss 2.3796 | val_loss 2.3687 | lr 1.869553e-04
step 1200 | train_loss 2.3596 | val_loss 2.3612 | lr 1.868741e-04
step 1400 | train_loss 2.3467 | val_loss 2.3486 | lr 1.867781e-04
step 1600 | train_loss 2.3470 | val_loss 2.3440 | lr 1.866675e-04
step 1800 | train_loss 2.3406 | val_loss 2.3358 | lr 1.865421e-04
step 2000 | train_loss 2.3367 | val_loss 2.3279 | lr 1.864021e-04
step 2200 | train_loss 2.3327 | val_loss 2.3249 | lr 1.862474e-04
step 2400 | train_loss 2.3335 | val_loss 2.3212 | lr 1.860781e-04
step 2600 | train_loss 2.3230 | val_loss 2.3170 | lr 1.858941e-04
step 2800 | trai

[I 2024-07-19 22:10:36,750] Trial 9 finished with value: 2.236103798661913 and parameters: {'embedding_size': 57, 'hidden_size': 945, 'learning_rate': 0.00018713991693097823, 'batch_size': 64, 'context_length': 2}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.2366 | val_loss 2.2361 | lr 1.846997e-13
step 0 | train_loss 3.3121 | val_loss 3.3107 | lr 8.316679e-03
step 200 | train_loss 2.3933 | val_loss 2.3870 | lr 8.316350e-03
step 400 | train_loss 2.3001 | val_loss 2.3121 | lr 8.315365e-03
step 600 | train_loss 2.2760 | val_loss 2.2836 | lr 8.313724e-03
step 800 | train_loss 2.2774 | val_loss 2.2533 | lr 8.311426e-03
step 1000 | train_loss 2.2648 | val_loss 2.2505 | lr 8.308473e-03
step 1200 | train_loss 2.2331 | val_loss 2.2454 | lr 8.304864e-03
step 1400 | train_loss 2.2577 | val_loss 2.2431 | lr 8.300601e-03
step 1600 | train_loss 2.2451 | val_loss 2.2164 | lr 8.295683e-03
step 1800 | train_loss 2.2519 | val_loss 2.2297 | lr 8.290112e-03
step 2000 | train_loss 2.2413 | val_loss 2.2155 | lr 8.283889e-03
step 2200 | train_loss 2.2249 | val_loss 2.1991 | lr 8.277014e-03
step 2400 | train_loss 2.2315 | val_loss 2.2036 | lr 8.269489e-03
step 2600 | train_loss 2.2329 | val_loss 2.1998 | lr 8.261314e-03
step 2800 | trai

[I 2024-07-19 22:12:31,579] Trial 10 finished with value: 2.0393022000789642 and parameters: {'embedding_size': 16, 'hidden_size': 269, 'learning_rate': 0.0083166786208697, 'batch_size': 64, 'context_length': 5}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 1.9037 | val_loss 2.0393 | lr 8.208233e-12
step 0 | train_loss 3.3157 | val_loss 3.3121 | lr 7.811120e-03
step 200 | train_loss 2.3547 | val_loss 2.3664 | lr 7.810812e-03
step 400 | train_loss 2.3193 | val_loss 2.3167 | lr 7.809887e-03
step 600 | train_loss 2.2667 | val_loss 2.2803 | lr 7.808345e-03
step 800 | train_loss 2.2691 | val_loss 2.2448 | lr 7.806187e-03
step 1000 | train_loss 2.2782 | val_loss 2.2517 | lr 7.803414e-03
step 1200 | train_loss 2.2324 | val_loss 2.2499 | lr 7.800024e-03
step 1400 | train_loss 2.2479 | val_loss 2.2351 | lr 7.796020e-03
step 1600 | train_loss 2.2165 | val_loss 2.2181 | lr 7.791401e-03
step 1800 | train_loss 2.2420 | val_loss 2.2155 | lr 7.786169e-03
step 2000 | train_loss 2.2334 | val_loss 2.2186 | lr 7.780324e-03
step 2200 | train_loss 2.2207 | val_loss 2.2126 | lr 7.773867e-03
step 2400 | train_loss 2.2147 | val_loss 2.1984 | lr 7.766799e-03
step 2600 | train_loss 2.2268 | val_loss 2.1981 | lr 7.759121e-03
step 2800 | trai

[I 2024-07-19 22:13:51,205] Trial 11 finished with value: 2.0257206293089047 and parameters: {'embedding_size': 16, 'hidden_size': 263, 'learning_rate': 0.007811120232798292, 'batch_size': 64, 'context_length': 5}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 1.8794 | val_loss 2.0257 | lr 7.709267e-12
step 0 | train_loss 3.3017 | val_loss 3.3048 | lr 2.966448e-03
step 200 | train_loss 2.3543 | val_loss 2.3886 | lr 2.966331e-03
step 400 | train_loss 2.2859 | val_loss 2.3050 | lr 2.965979e-03
step 600 | train_loss 2.2594 | val_loss 2.2654 | lr 2.965394e-03
step 800 | train_loss 2.2540 | val_loss 2.2435 | lr 2.964574e-03
step 1000 | train_loss 2.2489 | val_loss 2.2297 | lr 2.963521e-03
step 1200 | train_loss 2.2134 | val_loss 2.2170 | lr 2.962234e-03
step 1400 | train_loss 2.2176 | val_loss 2.2052 | lr 2.960713e-03
step 1600 | train_loss 2.1991 | val_loss 2.1893 | lr 2.958959e-03
step 1800 | train_loss 2.2084 | val_loss 2.1808 | lr 2.956972e-03
step 2000 | train_loss 2.1943 | val_loss 2.1746 | lr 2.954752e-03
step 2200 | train_loss 2.1863 | val_loss 2.1703 | lr 2.952300e-03
step 2400 | train_loss 2.1838 | val_loss 2.1550 | lr 2.949616e-03
step 2600 | train_loss 2.1642 | val_loss 2.1530 | lr 2.946700e-03
step 2800 | trai

[I 2024-07-19 22:14:55,992] Trial 12 finished with value: 2.0331808466996466 and parameters: {'embedding_size': 16, 'hidden_size': 289, 'learning_rate': 0.0029664477823724117, 'batch_size': 64, 'context_length': 5}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 1.8553 | val_loss 2.0332 | lr 2.927767e-12
step 0 | train_loss 3.3180 | val_loss 3.3136 | lr 9.882995e-03
step 200 | train_loss 2.4844 | val_loss 2.4737 | lr 9.882605e-03
step 400 | train_loss 2.3693 | val_loss 2.3741 | lr 9.881435e-03
step 600 | train_loss 2.3862 | val_loss 2.3998 | lr 9.879484e-03
step 800 | train_loss 2.4421 | val_loss 2.3848 | lr 9.876754e-03
step 1000 | train_loss 2.4268 | val_loss 2.3941 | lr 9.873244e-03
step 1200 | train_loss 2.3323 | val_loss 2.3713 | lr 9.868956e-03
step 1400 | train_loss 2.3789 | val_loss 2.3669 | lr 9.863889e-03
step 1600 | train_loss 2.3712 | val_loss 2.3607 | lr 9.858046e-03
step 1800 | train_loss 2.3815 | val_loss 2.3440 | lr 9.851425e-03
step 2000 | train_loss 2.3872 | val_loss 2.3859 | lr 9.844030e-03
step 2200 | train_loss 2.3955 | val_loss 2.3998 | lr 9.835860e-03
step 2400 | train_loss 2.3636 | val_loss 2.3340 | lr 9.826918e-03
step 2600 | train_loss 2.3837 | val_loss 2.3684 | lr 9.817204e-03
step 2800 | trai

[I 2024-07-19 22:16:00,435] Trial 13 finished with value: 2.073575678680624 and parameters: {'embedding_size': 60, 'hidden_size': 403, 'learning_rate': 0.009882995088996524, 'batch_size': 64, 'context_length': 4}. Best is trial 2 with value: 2.0138411607061113.


step 49999 | train_loss 2.0300 | val_loss 2.0736 | lr 9.754125e-12
step 0 | train_loss 3.2850 | val_loss 3.2966 | lr 4.788129e-03
step 200 | train_loss 2.3539 | val_loss 2.3602 | lr 4.787940e-03
step 400 | train_loss 2.2785 | val_loss 2.2931 | lr 4.787373e-03
step 600 | train_loss 2.2577 | val_loss 2.2703 | lr 4.786428e-03
step 800 | train_loss 2.2625 | val_loss 2.2318 | lr 4.785105e-03
step 1000 | train_loss 2.2676 | val_loss 2.2426 | lr 4.783405e-03
step 1200 | train_loss 2.2081 | val_loss 2.2350 | lr 4.781327e-03
step 1400 | train_loss 2.2294 | val_loss 2.2177 | lr 4.778873e-03
step 1600 | train_loss 2.2053 | val_loss 2.1976 | lr 4.776042e-03
step 1800 | train_loss 2.2210 | val_loss 2.1945 | lr 4.772834e-03
step 2000 | train_loss 2.2087 | val_loss 2.1897 | lr 4.769251e-03
step 2200 | train_loss 2.1826 | val_loss 2.1849 | lr 4.765293e-03
step 2400 | train_loss 2.1922 | val_loss 2.1717 | lr 4.760961e-03
step 2600 | train_loss 2.1933 | val_loss 2.1731 | lr 4.756254e-03
step 2800 | trai

[I 2024-07-19 22:17:05,754] Trial 14 finished with value: 2.0076147115656306 and parameters: {'embedding_size': 25, 'hidden_size': 382, 'learning_rate': 0.00478812911907272, 'batch_size': 64, 'context_length': 4}. Best is trial 14 with value: 2.0076147115656306.


step 49999 | train_loss 1.8243 | val_loss 2.0076 | lr 4.725694e-12
step 0 | train_loss 3.3177 | val_loss 3.3187 | lr 1.165369e-03
step 200 | train_loss 2.3780 | val_loss 2.4129 | lr 1.165323e-03
step 400 | train_loss 2.3014 | val_loss 2.3313 | lr 1.165185e-03
step 600 | train_loss 2.2724 | val_loss 2.2910 | lr 1.164955e-03
step 800 | train_loss 2.2503 | val_loss 2.2607 | lr 1.164633e-03
step 1000 | train_loss 2.2482 | val_loss 2.2466 | lr 1.164219e-03
step 1200 | train_loss 2.2179 | val_loss 2.2328 | lr 1.163713e-03
step 1400 | train_loss 2.2083 | val_loss 2.2108 | lr 1.163116e-03
step 1600 | train_loss 2.2064 | val_loss 2.2075 | lr 1.162427e-03
step 1800 | train_loss 2.1930 | val_loss 2.1971 | lr 1.161646e-03
step 2000 | train_loss 2.1866 | val_loss 2.1785 | lr 1.160774e-03
step 2200 | train_loss 2.1790 | val_loss 2.1820 | lr 1.159811e-03
step 2400 | train_loss 2.1789 | val_loss 2.1722 | lr 1.158756e-03
step 2600 | train_loss 2.1648 | val_loss 2.1667 | lr 1.157611e-03
step 2800 | trai

[I 2024-07-19 22:18:09,515] Trial 15 finished with value: 2.0101490222982 and parameters: {'embedding_size': 25, 'hidden_size': 394, 'learning_rate': 0.0011653685943772332, 'batch_size': 64, 'context_length': 4}. Best is trial 14 with value: 2.0076147115656306.


step 49999 | train_loss 1.8610 | val_loss 2.0101 | lr 1.150173e-12
step 0 | train_loss 3.3083 | val_loss 3.3139 | lr 1.311942e-03
step 200 | train_loss 2.3511 | val_loss 2.4263 | lr 1.311890e-03
step 400 | train_loss 2.3095 | val_loss 2.3589 | lr 1.311735e-03
step 600 | train_loss 2.3005 | val_loss 2.3356 | lr 1.311476e-03
step 800 | train_loss 2.2704 | val_loss 2.3042 | lr 1.311114e-03
step 1000 | train_loss 2.2630 | val_loss 2.2761 | lr 1.310648e-03
step 1200 | train_loss 2.2461 | val_loss 2.2759 | lr 1.310078e-03
step 1400 | train_loss 2.2656 | val_loss 2.2679 | lr 1.309406e-03
step 1600 | train_loss 2.2526 | val_loss 2.2428 | lr 1.308630e-03
step 1800 | train_loss 2.2504 | val_loss 2.2438 | lr 1.307751e-03
step 2000 | train_loss 2.2443 | val_loss 2.2361 | lr 1.306770e-03
step 2200 | train_loss 2.2479 | val_loss 2.2295 | lr 1.305685e-03
step 2400 | train_loss 2.1858 | val_loss 2.2214 | lr 1.304498e-03
step 2600 | train_loss 2.2339 | val_loss 2.2074 | lr 1.303208e-03
step 2800 | trai

[I 2024-07-19 22:19:13,058] Trial 16 finished with value: 1.9904905743896961 and parameters: {'embedding_size': 25, 'hidden_size': 534, 'learning_rate': 0.0013119420989330382, 'batch_size': 32, 'context_length': 4}. Best is trial 16 with value: 1.9904905743896961.


step 49999 | train_loss 1.8708 | val_loss 1.9905 | lr 1.294835e-12
step 0 | train_loss 3.2958 | val_loss 3.2995 | lr 2.201489e-03
step 200 | train_loss 2.3566 | val_loss 2.4277 | lr 2.201402e-03
step 400 | train_loss 2.3331 | val_loss 2.3725 | lr 2.201141e-03
step 600 | train_loss 2.3119 | val_loss 2.3287 | lr 2.200707e-03
step 800 | train_loss 2.2604 | val_loss 2.2894 | lr 2.200098e-03
step 1000 | train_loss 2.2604 | val_loss 2.2645 | lr 2.199317e-03
step 1200 | train_loss 2.2550 | val_loss 2.2760 | lr 2.198361e-03
step 1400 | train_loss 2.3023 | val_loss 2.2722 | lr 2.197233e-03
step 1600 | train_loss 2.2674 | val_loss 2.2370 | lr 2.195931e-03
step 1800 | train_loss 2.2702 | val_loss 2.2390 | lr 2.194456e-03
step 2000 | train_loss 2.2599 | val_loss 2.2330 | lr 2.192809e-03
step 2200 | train_loss 2.2354 | val_loss 2.2202 | lr 2.190989e-03
step 2400 | train_loss 2.1933 | val_loss 2.2208 | lr 2.188997e-03
step 2600 | train_loss 2.2247 | val_loss 2.1989 | lr 2.186833e-03
step 2800 | trai

[I 2024-07-19 22:20:17,489] Trial 17 finished with value: 1.9984359991337572 and parameters: {'embedding_size': 30, 'hidden_size': 557, 'learning_rate': 0.0022014887199505615, 'batch_size': 32, 'context_length': 4}. Best is trial 16 with value: 1.9904905743896961.


step 49999 | train_loss 1.8504 | val_loss 1.9984 | lr 2.172782e-12
step 0 | train_loss 3.2977 | val_loss 3.2999 | lr 2.073719e-03
step 200 | train_loss 2.3489 | val_loss 2.4343 | lr 2.073637e-03
step 400 | train_loss 2.3527 | val_loss 2.3716 | lr 2.073392e-03
step 600 | train_loss 2.3371 | val_loss 2.3365 | lr 2.072983e-03
step 800 | train_loss 2.2761 | val_loss 2.2845 | lr 2.072410e-03
step 1000 | train_loss 2.2823 | val_loss 2.2704 | lr 2.071673e-03
step 1200 | train_loss 2.2636 | val_loss 2.2744 | lr 2.070774e-03
step 1400 | train_loss 2.2958 | val_loss 2.2651 | lr 2.069710e-03
step 1600 | train_loss 2.2756 | val_loss 2.2370 | lr 2.068484e-03
step 1800 | train_loss 2.2896 | val_loss 2.2393 | lr 2.067095e-03
step 2000 | train_loss 2.2630 | val_loss 2.2288 | lr 2.065543e-03
step 2200 | train_loss 2.2557 | val_loss 2.2242 | lr 2.063829e-03
step 2400 | train_loss 2.1993 | val_loss 2.2229 | lr 2.061953e-03
step 2600 | train_loss 2.2053 | val_loss 2.2006 | lr 2.059915e-03
step 2800 | trai

[I 2024-07-19 22:21:31,038] Trial 18 finished with value: 2.000408693615879 and parameters: {'embedding_size': 32, 'hidden_size': 576, 'learning_rate': 0.002073719357181968, 'batch_size': 32, 'context_length': 4}. Best is trial 16 with value: 1.9904905743896961.


step 49999 | train_loss 1.8385 | val_loss 2.0004 | lr 2.046679e-12
step 0 | train_loss 3.3071 | val_loss 3.3067 | lr 6.711250e-04
step 200 | train_loss 2.4084 | val_loss 2.4944 | lr 6.710985e-04
step 400 | train_loss 2.3621 | val_loss 2.4161 | lr 6.710190e-04
step 600 | train_loss 2.3345 | val_loss 2.3776 | lr 6.708865e-04
step 800 | train_loss 2.3151 | val_loss 2.3421 | lr 6.707011e-04
step 1000 | train_loss 2.2843 | val_loss 2.3113 | lr 6.704628e-04
step 1200 | train_loss 2.2717 | val_loss 2.2939 | lr 6.701716e-04
step 1400 | train_loss 2.2743 | val_loss 2.2878 | lr 6.698276e-04
step 1600 | train_loss 2.2749 | val_loss 2.2663 | lr 6.694307e-04
step 1800 | train_loss 2.2656 | val_loss 2.2571 | lr 6.689812e-04
step 2000 | train_loss 2.2589 | val_loss 2.2465 | lr 6.684790e-04
step 2200 | train_loss 2.2493 | val_loss 2.2365 | lr 6.679242e-04
step 2400 | train_loss 2.2237 | val_loss 2.2353 | lr 6.673169e-04
step 2600 | train_loss 2.2210 | val_loss 2.2204 | lr 6.666573e-04
step 2800 | trai

[I 2024-07-19 22:22:46,558] Trial 19 finished with value: 2.0004928798547814 and parameters: {'embedding_size': 21, 'hidden_size': 521, 'learning_rate': 0.0006711249701238267, 'batch_size': 32, 'context_length': 5}. Best is trial 16 with value: 1.9904905743896961.


step 49999 | train_loss 1.8998 | val_loss 2.0005 | lr 6.623738e-13
step 0 | train_loss 3.3142 | val_loss 3.3156 | lr 8.683602e-04
step 200 | train_loss 2.3656 | val_loss 2.4419 | lr 8.683259e-04
step 400 | train_loss 2.3226 | val_loss 2.3692 | lr 8.682231e-04
step 600 | train_loss 2.3220 | val_loss 2.3399 | lr 8.680517e-04
step 800 | train_loss 2.2796 | val_loss 2.2960 | lr 8.678118e-04
step 1000 | train_loss 2.2567 | val_loss 2.2754 | lr 8.675035e-04
step 1200 | train_loss 2.2439 | val_loss 2.2696 | lr 8.671267e-04
step 1400 | train_loss 2.2744 | val_loss 2.2659 | lr 8.666815e-04
step 1600 | train_loss 2.2593 | val_loss 2.2411 | lr 8.661680e-04
step 1800 | train_loss 2.2540 | val_loss 2.2411 | lr 8.655864e-04
step 2000 | train_loss 2.2448 | val_loss 2.2326 | lr 8.649366e-04
step 2200 | train_loss 2.2236 | val_loss 2.2207 | lr 8.642187e-04
step 2400 | train_loss 2.1911 | val_loss 2.2185 | lr 8.634330e-04
step 2600 | train_loss 2.2097 | val_loss 2.2007 | lr 8.625795e-04
step 2800 | trai

[W 2024-07-19 22:23:21,219] Trial 20 failed with parameters: {'embedding_size': 32, 'hidden_size': 654, 'learning_rate': 0.0008683602070624887, 'batch_size': 32, 'context_length': 4} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\50196\.conda\envs\deeplearning\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\50196\AppData\Local\Temp\ipykernel_46372\779706708.py", line 23, in objective
    val_loss = train(model, train_tokens, val_tokens, context_length, batch_size, num_steps, learning_rate, device)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\50196\AppData\Local\Temp\ipykernel_46372\2298479787.py", line 15, in train
    loss.backward()
  File "C:\Users\50196\.conda\envs\deeplearning\Lib\site-packages\torch\_tensor.py", line 525, in backwa

step 25200 | train_loss 1.9767 | val_loss 2.0272 | lr 4.287242e-04


KeyboardInterrupt: 