# Inital Supervised Learning performed on the model

### Imports

In [1]:
import os
import numpy as np # type: ignore
import time
import torch
import torch.nn as nn # type: ignore
import torch.optim as optim # type: ignore
import helper_functions as helper
from torch.utils.data import DataLoader, random_split # type: ignore
from chess import pgn # type: ignore
from tqdm import tqdm # type: ignore
from dataset import ChessPGNDataset
from model import ChessNet, ResBlock

#### Device Agnostic Code

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [3]:
print(torch.__version__)

2.10.0+cu126


# Processing the PGNs

### Creating Move Map

In [4]:
MOVE_MAP = helper.create_move_map()

In [5]:
MOVE_MAP_LENGTH = len(MOVE_MAP)
print(MOVE_MAP_LENGTH)

8192


### Loading the data

In [6]:
pgn_path = "../lichess_db_standard_rated_2026-01.pgn/lichess_db_standard_rated_2026-01.pgn"

In [7]:
pgn_dataset_v1 = ChessPGNDataset(pgn_file_path=pgn_path, move_map=MOVE_MAP)

In [8]:
print(len(pgn_dataset_v1))

7925


In [9]:
total_size = len(pgn_dataset_v1)
train_size = int(0.8 * total_size) 
val_size = total_size - train_size

In [10]:
train_dataset, val_dataset = random_split(pgn_dataset_v1, [train_size, val_size])

In [11]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
print(len(next(iter(train_loader)))) 

3


#### Stuff to do before training begins

In [None]:
train_data_iter = iter(train_loader)
images, labels, games = next(train_data_iter)

print(f"Batch shape (Images): {images.shape}") 
print(f"Batch shape (Labels): {labels.shape}") 
print(f"Batch shape (Labels): {games.shape}")
print(f"First label: {labels[0].item()}")      

Batch shape (Images): torch.Size([64, 12, 8, 8])
Batch shape (Labels): torch.Size([64])
Batch shape (Labels): torch.Size([64, 1])
First label: 1318


In [None]:
val_data_iter = iter(val_loader)
images, labels, games = next(val_data_iter)

print(f"Batch shape (Images): {images.shape}") 
print(f"Batch shape (Labels): {labels.shape}") 
print(f"Batch shape (Labels): {games.shape}")
print(f"First label: {labels[0].item()}")      

Batch shape (Images): torch.Size([64, 12, 8, 8])
Batch shape (Labels): torch.Size([64])
Batch shape (Labels): torch.Size([64, 1])
First label: 4298


# Making the model and training loop

#### Making the model

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [None]:
model = ChessNet(MOVE_MAP_LENGTH)
model.to(device)
policy_criterion = nn.CrossEntropyLoss()
value_criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

#### Making the training loop

In [None]:
test_batch = next(iter(train_loader))
test_images, test_labels, test_values = [x.to(device) for x in test_batch]
print(f"Test images: min={test_images.min():.4f}, max={test_images.max():.4f}")
print(f"Test values: min={test_values.min():.4f}, max={test_values.max():.4f}, mean={test_values.mean():.4f}")

with torch.no_grad():
    pred_pol, pred_val = model(test_images)
    print(f"Pred values: min={pred_val.min():.4f}, max={pred_val.max():.4f}")
    test_loss = torch.nn.functional.mse_loss(pred_val, test_values.float())
    print(f"Test MSE loss: {test_loss.item():.6f}")


Test images: min=0.0000, max=1.0000
Test values: min=-1.0000, max=1.0000, mean=0.0469
Pred values: min=-0.1894, max=0.2884
Test MSE loss: 0.931998


In [None]:
print(f"Move map size: {len(MOVE_MAP)}")
print(f"Policy head output size: {MOVE_MAP_LENGTH}")
print(f"Move indices range: 0 to {max(MOVE_MAP.values())}")

out_of_range = [idx for idx in MOVE_MAP.values() if idx >= MOVE_MAP_LENGTH]
if out_of_range:
    print(f"WARNING: {len(out_of_range)} move indices are >= {MOVE_MAP_LENGTH}!")
else:
    print(f"All move indices are within valid range [0, {MOVE_MAP_LENGTH-1}]")

test_batch = next(iter(train_loader))
test_images, test_labels, test_values = test_batch
print(f"\nBatch shapes:")
print(f"  Images: {test_images.shape}")
print(f"  Labels (move indices): {test_labels.shape}, min={test_labels.min()}, max={test_labels.max()}")
print(f"  Values: {test_values.shape}")


Move map size: 8192
Policy head output size: 8192
Move indices range: 0 to 8191
All move indices are within valid range [0, 8191]

Batch shapes:
  Images: torch.Size([64, 12, 8, 8])
  Labels (move indices): torch.Size([64]), min=521, max=8107
  Values: torch.Size([64, 1])


In [None]:
def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               optimizer: torch.optim.Optimizer,
               policy_criterion,
               value_criterion,
               device: torch.device = device):
        
    running_policy_loss = 0.0
    running_value_loss = 0.0
    model.train()
    model.to(device)

    for batch, (image, move_labels, game_values) in enumerate(data_loader):

        image, move_labels, game_values = image.to(device), move_labels.to(device), game_values.to(device).float()
        
        # Validate inputs
        if move_labels.min() < 0 or move_labels.max() >= MOVE_MAP_LENGTH:
            print(f"WARNING: Invalid move indices in batch {batch}: min={move_labels.min()}, max={move_labels.max()}")
            continue
            
        optimizer.zero_grad()

        pred_policy, pred_value = model(image)
        
        # Check for NaN/Inf
        if torch.isnan(pred_policy).any() or torch.isinf(pred_policy).any():
            print(f"WARNING: NaN/Inf in policy predictions at batch {batch}")
            continue
        if torch.isnan(pred_value).any() or torch.isinf(pred_value).any():
            print(f"WARNING: NaN/Inf in value predictions at batch {batch}")
            continue

        policy_loss = policy_criterion(pred_policy, move_labels)
        value_loss = torch.nn.functional.mse_loss(pred_value, game_values)

        total_loss = policy_loss + value_loss
        total_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_policy_loss += policy_loss.item()
        running_value_loss += value_loss.item()

    print(f"Train policy loss: {running_policy_loss/len(data_loader):.5f} | Train value loss: {running_value_loss/len(data_loader):.5f}\n")


In [20]:
def test_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               optimizer: torch.optim.Optimizer,
               policy_criterion,
               value_criterion,
               device: torch.device = device):
    running_policy_loss = 0.0
    running_value_loss = 0.0
    model.eval()
    model.to(device)

    with torch.inference_mode(): 
        for batch, (image, move_labels, game_values) in enumerate(data_loader):
            image, move_labels, game_values = image.to(device), move_labels.to(device), game_values.to(device).float()
            
            # Validate inputs
            if move_labels.min() < 0 or move_labels.max() >= MOVE_MAP_LENGTH:
                print(f"WARNING: Invalid move indices in batch {batch}: min={move_labels.min()}, max={move_labels.max()}")
                continue
            
            pred_policy, pred_value = model(image)

            policy_loss = policy_criterion(pred_policy, move_labels)
            value_loss = torch.nn.functional.mse_loss(pred_value, game_values)

            running_policy_loss += policy_loss.item()
            running_value_loss += value_loss.item()

            avg_test_policy_loss = running_policy_loss / len(data_loader)
    print(f"Test policy loss: {avg_test_policy_loss:.5f} | Test value loss: {running_value_loss/len(data_loader):.5f}\n")

    return avg_test_policy_loss


In [21]:
from pathlib import Path

best_test_loss = float('inf')

MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True, 
                 exist_ok=True 
)

MODEL_NAME = "supervised_learning_chess_model_1.pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME
epochs = 200
for epoch in tqdm(range(epochs)):
    tqdm.write(f"Epoch: {epoch}\n---------")
    train_step(model,
               train_loader,
               optimizer,
               policy_criterion,
               value_criterion,
               device
               )
    current_test_loss = test_step(model,
               val_loader,
               optimizer,
               policy_criterion,
               value_criterion,
               device
               )
    scheduler.step(current_test_loss)
    if current_test_loss < best_test_loss:
        best_test_loss = current_test_loss
        print(f"⭐ New Best Model! Loss improved to: {best_test_loss:.5f}. Saving...")
        torch.save(obj=model.state_dict(), f=MODEL_SAVE_PATH)
    else:
        print(f"No improvement in Test Loss ({current_test_loss:.5f} vs best {best_test_loss:.5f}).")

    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current Learning Rate: {current_lr}")


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch: 0
---------
Train policy loss: 6.38006 | Train value loss: 0.97327

Test policy loss: 5.83408 | Test value loss: 0.97458

⭐ New Best Model! Loss improved to: 5.83408. Saving...


  0%|          | 1/200 [00:04<13:35,  4.10s/it]

Current Learning Rate: 0.001
Epoch: 1
---------
Train policy loss: 4.75792 | Train value loss: 0.95804

Test policy loss: 5.48632 | Test value loss: 0.95798

⭐ New Best Model! Loss improved to: 5.48632. Saving...


  1%|          | 2/200 [00:07<12:58,  3.93s/it]

Current Learning Rate: 0.001
Epoch: 2
---------
Train policy loss: 3.03016 | Train value loss: 0.91542

Test policy loss: 5.29527 | Test value loss: 0.96310

⭐ New Best Model! Loss improved to: 5.29527. Saving...


  2%|▏         | 3/200 [00:11<12:49,  3.90s/it]

Current Learning Rate: 0.001
Epoch: 3
---------
Train policy loss: 1.83900 | Train value loss: 0.84504



  2%|▏         | 4/200 [00:15<12:14,  3.75s/it]

Test policy loss: 5.57321 | Test value loss: 0.94742

No improvement in Test Loss (5.57321 vs best 5.29527).
Current Learning Rate: 0.001
Epoch: 4
---------
Train policy loss: 1.28499 | Train value loss: 0.73585



  2%|▎         | 5/200 [00:18<12:01,  3.70s/it]

Test policy loss: 5.50440 | Test value loss: 0.95735

No improvement in Test Loss (5.50440 vs best 5.29527).
Current Learning Rate: 0.001
Epoch: 5
---------
Train policy loss: 0.97817 | Train value loss: 0.65254



  3%|▎         | 6/200 [00:22<12:05,  3.74s/it]

Test policy loss: 5.69611 | Test value loss: 0.97257

No improvement in Test Loss (5.69611 vs best 5.29527).
Current Learning Rate: 0.001
Epoch: 6
---------
Train policy loss: 0.80434 | Train value loss: 0.54200



  4%|▎         | 7/200 [00:26<11:58,  3.72s/it]

Test policy loss: 5.79179 | Test value loss: 0.96220

No improvement in Test Loss (5.79179 vs best 5.29527).
Current Learning Rate: 0.001
Epoch: 7
---------
Train policy loss: 0.66725 | Train value loss: 0.44545



  4%|▍         | 8/200 [00:29<11:37,  3.63s/it]

Test policy loss: 5.93196 | Test value loss: 1.04806

No improvement in Test Loss (5.93196 vs best 5.29527).
Current Learning Rate: 0.001
Epoch: 8
---------
Train policy loss: 0.59630 | Train value loss: 0.40177



  4%|▍         | 9/200 [00:33<11:18,  3.55s/it]

Test policy loss: 5.98560 | Test value loss: 1.09289

No improvement in Test Loss (5.98560 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 9
---------
Train policy loss: 0.36670 | Train value loss: 0.25026



  5%|▌         | 10/200 [00:36<11:12,  3.54s/it]

Test policy loss: 5.95530 | Test value loss: 0.98319

No improvement in Test Loss (5.95530 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 10
---------
Train policy loss: 0.30040 | Train value loss: 0.17687



  6%|▌         | 11/200 [00:40<11:05,  3.52s/it]

Test policy loss: 5.95376 | Test value loss: 1.00625

No improvement in Test Loss (5.95376 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 11
---------
Train policy loss: 0.27542 | Train value loss: 0.15071



  6%|▌         | 12/200 [00:43<11:10,  3.57s/it]

Test policy loss: 5.94561 | Test value loss: 1.02080

No improvement in Test Loss (5.94561 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 12
---------
Train policy loss: 0.26216 | Train value loss: 0.13665



  6%|▋         | 13/200 [00:47<11:10,  3.59s/it]

Test policy loss: 6.05815 | Test value loss: 1.03550

No improvement in Test Loss (6.05815 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 13
---------
Train policy loss: 0.24661 | Train value loss: 0.12352



  7%|▋         | 14/200 [00:51<11:07,  3.59s/it]

Test policy loss: 6.00688 | Test value loss: 1.03439

No improvement in Test Loss (6.00688 vs best 5.29527).
Current Learning Rate: 0.0001
Epoch: 14
---------
Train policy loss: 0.24548 | Train value loss: 0.11561



  8%|▊         | 15/200 [00:54<11:04,  3.59s/it]

Test policy loss: 5.98116 | Test value loss: 1.07019

No improvement in Test Loss (5.98116 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 15
---------
Train policy loss: 0.22065 | Train value loss: 0.10799



  8%|▊         | 16/200 [00:58<10:58,  3.58s/it]

Test policy loss: 5.99381 | Test value loss: 1.06207

No improvement in Test Loss (5.99381 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 16
---------
Train policy loss: 0.21290 | Train value loss: 0.10198



  8%|▊         | 17/200 [01:01<10:43,  3.52s/it]

Test policy loss: 5.98892 | Test value loss: 1.06217

No improvement in Test Loss (5.98892 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 17
---------
Train policy loss: 0.21705 | Train value loss: 0.09974



  9%|▉         | 18/200 [01:05<10:47,  3.56s/it]

Test policy loss: 5.98785 | Test value loss: 1.06859

No improvement in Test Loss (5.98785 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 18
---------
Train policy loss: 0.23264 | Train value loss: 0.09960



 10%|▉         | 19/200 [01:08<10:47,  3.58s/it]

Test policy loss: 6.09271 | Test value loss: 1.08288

No improvement in Test Loss (6.09271 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 19
---------
Train policy loss: 0.21027 | Train value loss: 0.09636



 10%|█         | 20/200 [01:12<10:35,  3.53s/it]

Test policy loss: 6.00163 | Test value loss: 1.06936

No improvement in Test Loss (6.00163 vs best 5.29527).
Current Learning Rate: 1e-05
Epoch: 20
---------
Train policy loss: 0.21060 | Train value loss: 0.09604



 10%|█         | 21/200 [01:15<10:35,  3.55s/it]

Test policy loss: 5.95956 | Test value loss: 1.06847

No improvement in Test Loss (5.95956 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 21
---------
Train policy loss: 0.21768 | Train value loss: 0.09545



 11%|█         | 22/200 [01:19<10:29,  3.54s/it]

Test policy loss: 5.98413 | Test value loss: 1.07358

No improvement in Test Loss (5.98413 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 22
---------
Train policy loss: 0.20726 | Train value loss: 0.09374



 12%|█▏        | 23/200 [01:23<10:31,  3.57s/it]

Test policy loss: 6.00617 | Test value loss: 1.07345

No improvement in Test Loss (6.00617 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 23
---------
Train policy loss: 0.20796 | Train value loss: 0.09414



 12%|█▏        | 24/200 [01:26<10:38,  3.63s/it]

Test policy loss: 6.01345 | Test value loss: 1.07493

No improvement in Test Loss (6.01345 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 24
---------
Train policy loss: 0.20716 | Train value loss: 0.09359



 12%|█▎        | 25/200 [01:30<10:23,  3.56s/it]

Test policy loss: 5.99337 | Test value loss: 1.07314

No improvement in Test Loss (5.99337 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 25
---------
Train policy loss: 0.20945 | Train value loss: 0.09605



 13%|█▎        | 26/200 [01:33<10:20,  3.56s/it]

Test policy loss: 6.02310 | Test value loss: 1.07765

No improvement in Test Loss (6.02310 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-06
Epoch: 26
---------
Train policy loss: 0.20542 | Train value loss: 0.09348



 14%|█▎        | 27/200 [01:37<10:07,  3.51s/it]

Test policy loss: 5.99136 | Test value loss: 1.07116

No improvement in Test Loss (5.99136 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 27
---------
Train policy loss: 0.20833 | Train value loss: 0.09506



 14%|█▍        | 28/200 [01:40<10:01,  3.50s/it]

Test policy loss: 6.02732 | Test value loss: 1.07557

No improvement in Test Loss (6.02732 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 28
---------
Train policy loss: 0.21247 | Train value loss: 0.09322



 14%|█▍        | 29/200 [01:44<10:05,  3.54s/it]

Test policy loss: 6.01168 | Test value loss: 1.07621

No improvement in Test Loss (6.01168 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 29
---------
Train policy loss: 0.20339 | Train value loss: 0.09362



 15%|█▌        | 30/200 [01:47<10:03,  3.55s/it]

Test policy loss: 5.99108 | Test value loss: 1.07729

No improvement in Test Loss (5.99108 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 30
---------
Train policy loss: 0.20412 | Train value loss: 0.09360



 16%|█▌        | 31/200 [01:51<09:55,  3.52s/it]

Test policy loss: 5.98842 | Test value loss: 1.07149

No improvement in Test Loss (5.98842 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 31
---------
Train policy loss: 0.20625 | Train value loss: 0.09375



 16%|█▌        | 32/200 [01:54<09:45,  3.48s/it]

Test policy loss: 5.99189 | Test value loss: 1.07303

No improvement in Test Loss (5.99189 vs best 5.29527).
Current Learning Rate: 1.0000000000000002e-07
Epoch: 32
---------
Train policy loss: 0.20980 | Train value loss: 0.09371



 16%|█▋        | 33/200 [01:58<09:41,  3.48s/it]

Test policy loss: 6.00058 | Test value loss: 1.06990

No improvement in Test Loss (6.00058 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 33
---------
Train policy loss: 0.21552 | Train value loss: 0.09891



 17%|█▋        | 34/200 [02:01<09:43,  3.51s/it]

Test policy loss: 6.05664 | Test value loss: 1.08621

No improvement in Test Loss (6.05664 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 34
---------
Train policy loss: 0.20964 | Train value loss: 0.09406



 18%|█▊        | 35/200 [02:05<09:34,  3.48s/it]

Test policy loss: 5.99766 | Test value loss: 1.07358

No improvement in Test Loss (5.99766 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 35
---------
Train policy loss: 0.20641 | Train value loss: 0.09339



 18%|█▊        | 36/200 [02:08<09:24,  3.44s/it]

Test policy loss: 5.98771 | Test value loss: 1.07657

No improvement in Test Loss (5.98771 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 36
---------
Train policy loss: 0.20668 | Train value loss: 0.09354



 18%|█▊        | 37/200 [02:12<09:21,  3.44s/it]

Test policy loss: 6.00468 | Test value loss: 1.07481

No improvement in Test Loss (6.00468 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 37
---------
Train policy loss: 0.20880 | Train value loss: 0.09364



 19%|█▉        | 38/200 [02:15<09:23,  3.48s/it]

Test policy loss: 5.99326 | Test value loss: 1.07416

No improvement in Test Loss (5.99326 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 38
---------
Train policy loss: 0.21046 | Train value loss: 0.09662



 20%|█▉        | 39/200 [02:19<09:26,  3.52s/it]

Test policy loss: 6.00016 | Test value loss: 1.07792

No improvement in Test Loss (6.00016 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 39
---------
Train policy loss: 0.21499 | Train value loss: 0.09675



 20%|██        | 40/200 [02:22<09:16,  3.48s/it]

Test policy loss: 6.03524 | Test value loss: 1.07941

No improvement in Test Loss (6.03524 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 40
---------
Train policy loss: 0.20568 | Train value loss: 0.09338



 20%|██        | 41/200 [02:26<09:14,  3.49s/it]

Test policy loss: 6.00087 | Test value loss: 1.07380

No improvement in Test Loss (6.00087 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 41
---------
Train policy loss: 0.21767 | Train value loss: 0.09459



 21%|██        | 42/200 [02:29<09:09,  3.48s/it]

Test policy loss: 6.01306 | Test value loss: 1.07624

No improvement in Test Loss (6.01306 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 42
---------
Train policy loss: 0.20988 | Train value loss: 0.09797



 22%|██▏       | 43/200 [02:33<09:07,  3.49s/it]

Test policy loss: 6.02696 | Test value loss: 1.07781

No improvement in Test Loss (6.02696 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 43
---------
Train policy loss: 0.21220 | Train value loss: 0.09817



 22%|██▏       | 44/200 [02:36<09:17,  3.58s/it]

Test policy loss: 6.04232 | Test value loss: 1.08172

No improvement in Test Loss (6.04232 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 44
---------
Train policy loss: 0.20461 | Train value loss: 0.09434



 22%|██▎       | 45/200 [02:40<09:18,  3.60s/it]

Test policy loss: 5.99287 | Test value loss: 1.07092

No improvement in Test Loss (5.99287 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 45
---------
Train policy loss: 0.20679 | Train value loss: 0.09361



 23%|██▎       | 46/200 [02:44<09:15,  3.60s/it]

Test policy loss: 6.01497 | Test value loss: 1.07399

No improvement in Test Loss (6.01497 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 46
---------
Train policy loss: 0.20602 | Train value loss: 0.09334



 24%|██▎       | 47/200 [02:47<09:16,  3.64s/it]

Test policy loss: 6.02141 | Test value loss: 1.07332

No improvement in Test Loss (6.02141 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 47
---------
Train policy loss: 0.20775 | Train value loss: 0.09374



 24%|██▍       | 48/200 [02:51<09:12,  3.63s/it]

Test policy loss: 5.98936 | Test value loss: 1.06858

No improvement in Test Loss (5.98936 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 48
---------
Train policy loss: 0.20528 | Train value loss: 0.09341



 24%|██▍       | 49/200 [02:54<09:03,  3.60s/it]

Test policy loss: 5.98902 | Test value loss: 1.06982

No improvement in Test Loss (5.98902 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 49
---------
Train policy loss: 0.20375 | Train value loss: 0.09611



 25%|██▌       | 50/200 [02:58<09:10,  3.67s/it]

Test policy loss: 6.02388 | Test value loss: 1.07729

No improvement in Test Loss (6.02388 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 50
---------
Train policy loss: 0.21840 | Train value loss: 0.09612



 26%|██▌       | 51/200 [03:02<09:13,  3.72s/it]

Test policy loss: 6.05709 | Test value loss: 1.08607

No improvement in Test Loss (6.05709 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 51
---------
Train policy loss: 0.20908 | Train value loss: 0.09669



 26%|██▌       | 52/200 [03:06<09:23,  3.81s/it]

Test policy loss: 6.04528 | Test value loss: 1.07955

No improvement in Test Loss (6.04528 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 52
---------
Train policy loss: 0.20833 | Train value loss: 0.09415



 26%|██▋       | 53/200 [03:10<09:22,  3.83s/it]

Test policy loss: 5.99281 | Test value loss: 1.07442

No improvement in Test Loss (5.99281 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 53
---------
Train policy loss: 0.20660 | Train value loss: 0.09326



 27%|██▋       | 54/200 [03:14<09:19,  3.84s/it]

Test policy loss: 6.01168 | Test value loss: 1.07612

No improvement in Test Loss (6.01168 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 54
---------
Train policy loss: 0.20720 | Train value loss: 0.09357



 28%|██▊       | 55/200 [03:18<09:23,  3.89s/it]

Test policy loss: 5.99617 | Test value loss: 1.07109

No improvement in Test Loss (5.99617 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 55
---------
Train policy loss: 0.20906 | Train value loss: 0.09343



 28%|██▊       | 56/200 [03:22<09:28,  3.95s/it]

Test policy loss: 5.99565 | Test value loss: 1.07242

No improvement in Test Loss (5.99565 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 56
---------
Train policy loss: 0.20724 | Train value loss: 0.09381



 28%|██▊       | 57/200 [03:26<09:21,  3.93s/it]

Test policy loss: 5.99968 | Test value loss: 1.07446

No improvement in Test Loss (5.99968 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 57
---------
Train policy loss: 0.21637 | Train value loss: 0.09492



 29%|██▉       | 58/200 [03:30<09:17,  3.93s/it]

Test policy loss: 6.00460 | Test value loss: 1.07591

No improvement in Test Loss (6.00460 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 58
---------
Train policy loss: 0.20494 | Train value loss: 0.09328



 30%|██▉       | 59/200 [03:33<09:02,  3.84s/it]

Test policy loss: 5.98791 | Test value loss: 1.07301

No improvement in Test Loss (5.98791 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 59
---------
Train policy loss: 0.21483 | Train value loss: 0.09373



 30%|███       | 60/200 [03:37<08:58,  3.84s/it]

Test policy loss: 5.97780 | Test value loss: 1.07159

No improvement in Test Loss (5.97780 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 60
---------
Train policy loss: 0.20654 | Train value loss: 0.09669



 30%|███       | 61/200 [03:41<08:52,  3.83s/it]

Test policy loss: 6.01471 | Test value loss: 1.07831

No improvement in Test Loss (6.01471 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 61
---------
Train policy loss: 0.20680 | Train value loss: 0.09308



 31%|███       | 62/200 [03:45<08:53,  3.87s/it]

Test policy loss: 6.01388 | Test value loss: 1.07529

No improvement in Test Loss (6.01388 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 62
---------
Train policy loss: 0.21774 | Train value loss: 0.09694



 32%|███▏      | 63/200 [03:49<08:49,  3.86s/it]

Test policy loss: 6.02077 | Test value loss: 1.08117

No improvement in Test Loss (6.02077 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 63
---------
Train policy loss: 0.21140 | Train value loss: 0.09449



 32%|███▏      | 64/200 [03:52<08:35,  3.79s/it]

Test policy loss: 6.01323 | Test value loss: 1.07638

No improvement in Test Loss (6.01323 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 64
---------
Train policy loss: 0.21021 | Train value loss: 0.09346



 32%|███▎      | 65/200 [03:56<08:37,  3.83s/it]

Test policy loss: 5.98362 | Test value loss: 1.07267

No improvement in Test Loss (5.98362 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 65
---------
Train policy loss: 0.21042 | Train value loss: 0.09968



 33%|███▎      | 66/200 [04:00<08:32,  3.83s/it]

Test policy loss: 6.00859 | Test value loss: 1.07397

No improvement in Test Loss (6.00859 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 66
---------
Train policy loss: 0.21043 | Train value loss: 0.09337



 34%|███▎      | 67/200 [04:04<08:25,  3.80s/it]

Test policy loss: 6.00812 | Test value loss: 1.07381

No improvement in Test Loss (6.00812 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 67
---------
Train policy loss: 0.20778 | Train value loss: 0.09352



 34%|███▍      | 68/200 [04:08<08:19,  3.79s/it]

Test policy loss: 6.00032 | Test value loss: 1.07374

No improvement in Test Loss (6.00032 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 68
---------
Train policy loss: 0.20520 | Train value loss: 0.09331



 34%|███▍      | 69/200 [04:12<08:22,  3.84s/it]

Test policy loss: 6.00026 | Test value loss: 1.06997

No improvement in Test Loss (6.00026 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 69
---------
Train policy loss: 0.20531 | Train value loss: 0.09343



 35%|███▌      | 70/200 [04:16<08:20,  3.85s/it]

Test policy loss: 5.96812 | Test value loss: 1.07255

No improvement in Test Loss (5.96812 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 70
---------
Train policy loss: 0.20905 | Train value loss: 0.09365



 36%|███▌      | 71/200 [04:20<08:20,  3.88s/it]

Test policy loss: 5.98958 | Test value loss: 1.07379

No improvement in Test Loss (5.98958 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 71
---------
Train policy loss: 0.20907 | Train value loss: 0.09477



 36%|███▌      | 72/200 [04:23<08:17,  3.89s/it]

Test policy loss: 6.00977 | Test value loss: 1.07773

No improvement in Test Loss (6.00977 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 72
---------
Train policy loss: 0.21335 | Train value loss: 0.09358



 36%|███▋      | 73/200 [04:27<08:09,  3.86s/it]

Test policy loss: 5.99821 | Test value loss: 1.07414

No improvement in Test Loss (5.99821 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 73
---------
Train policy loss: 0.21310 | Train value loss: 0.09346



 37%|███▋      | 74/200 [04:31<08:07,  3.87s/it]

Test policy loss: 5.98347 | Test value loss: 1.07352

No improvement in Test Loss (5.98347 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 74
---------
Train policy loss: 0.20410 | Train value loss: 0.09350



 38%|███▊      | 75/200 [04:35<07:53,  3.79s/it]

Test policy loss: 5.99081 | Test value loss: 1.07265

No improvement in Test Loss (5.99081 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 75
---------
Train policy loss: 0.21098 | Train value loss: 0.09494



 38%|███▊      | 76/200 [04:38<07:45,  3.75s/it]

Test policy loss: 6.06183 | Test value loss: 1.08094

No improvement in Test Loss (6.06183 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 76
---------
Train policy loss: 0.21374 | Train value loss: 0.09860



 38%|███▊      | 77/200 [04:42<07:46,  3.80s/it]

Test policy loss: 6.03511 | Test value loss: 1.08115

No improvement in Test Loss (6.03511 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 77
---------
Train policy loss: 0.20896 | Train value loss: 0.09602



 39%|███▉      | 78/200 [04:46<07:42,  3.79s/it]

Test policy loss: 6.01248 | Test value loss: 1.07191

No improvement in Test Loss (6.01248 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 78
---------
Train policy loss: 0.20384 | Train value loss: 0.09351



 40%|███▉      | 79/200 [04:50<07:43,  3.83s/it]

Test policy loss: 5.98447 | Test value loss: 1.06757

No improvement in Test Loss (5.98447 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 79
---------
Train policy loss: 0.20695 | Train value loss: 0.09367



 40%|████      | 80/200 [04:54<07:42,  3.85s/it]

Test policy loss: 5.99731 | Test value loss: 1.07448

No improvement in Test Loss (5.99731 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 80
---------
Train policy loss: 0.20792 | Train value loss: 0.09363



 40%|████      | 81/200 [04:58<07:37,  3.84s/it]

Test policy loss: 5.98981 | Test value loss: 1.07512

No improvement in Test Loss (5.98981 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 81
---------
Train policy loss: 0.20737 | Train value loss: 0.09365



 41%|████      | 82/200 [05:02<07:33,  3.85s/it]

Test policy loss: 6.01873 | Test value loss: 1.07462

No improvement in Test Loss (6.01873 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 82
---------
Train policy loss: 0.20815 | Train value loss: 0.09320



 42%|████▏     | 83/200 [05:05<07:29,  3.84s/it]

Test policy loss: 5.97917 | Test value loss: 1.07040

No improvement in Test Loss (5.97917 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 83
---------
Train policy loss: 0.20730 | Train value loss: 0.09356



 42%|████▏     | 84/200 [05:09<07:30,  3.88s/it]

Test policy loss: 5.99589 | Test value loss: 1.07090

No improvement in Test Loss (5.99589 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 84
---------
Train policy loss: 0.21268 | Train value loss: 0.09396



 42%|████▎     | 85/200 [05:13<07:26,  3.89s/it]

Test policy loss: 6.01678 | Test value loss: 1.08021

No improvement in Test Loss (6.01678 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 85
---------
Train policy loss: 0.20930 | Train value loss: 0.09340



 43%|████▎     | 86/200 [05:17<07:22,  3.88s/it]

Test policy loss: 5.98725 | Test value loss: 1.07072

No improvement in Test Loss (5.98725 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 86
---------
Train policy loss: 0.21304 | Train value loss: 0.09550



 44%|████▎     | 87/200 [05:21<07:14,  3.85s/it]

Test policy loss: 5.99793 | Test value loss: 1.07218

No improvement in Test Loss (5.99793 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 87
---------
Train policy loss: 0.21330 | Train value loss: 0.09358



 44%|████▍     | 88/200 [05:25<07:11,  3.85s/it]

Test policy loss: 6.00532 | Test value loss: 1.07138

No improvement in Test Loss (6.00532 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 88
---------
Train policy loss: 0.21041 | Train value loss: 0.09331



 44%|████▍     | 89/200 [05:29<07:08,  3.86s/it]

Test policy loss: 5.99807 | Test value loss: 1.07662

No improvement in Test Loss (5.99807 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 89
---------
Train policy loss: 0.20990 | Train value loss: 0.09400



 45%|████▌     | 90/200 [05:32<07:01,  3.83s/it]

Test policy loss: 6.00994 | Test value loss: 1.07798

No improvement in Test Loss (6.00994 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 90
---------
Train policy loss: 0.20831 | Train value loss: 0.09759



 46%|████▌     | 91/200 [05:36<06:55,  3.81s/it]

Test policy loss: 6.00791 | Test value loss: 1.07905

No improvement in Test Loss (6.00791 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 91
---------
Train policy loss: 0.21114 | Train value loss: 0.09347



 46%|████▌     | 92/200 [05:40<06:47,  3.77s/it]

Test policy loss: 6.03243 | Test value loss: 1.08105

No improvement in Test Loss (6.03243 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 92
---------
Train policy loss: 0.21431 | Train value loss: 0.09613



 46%|████▋     | 93/200 [05:43<06:40,  3.74s/it]

Test policy loss: 5.99736 | Test value loss: 1.07586

No improvement in Test Loss (5.99736 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 93
---------
Train policy loss: 0.20928 | Train value loss: 0.09356



 47%|████▋     | 94/200 [05:47<06:44,  3.81s/it]

Test policy loss: 5.98129 | Test value loss: 1.07007

No improvement in Test Loss (5.98129 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 94
---------
Train policy loss: 0.20740 | Train value loss: 0.09360



 48%|████▊     | 95/200 [05:51<06:42,  3.83s/it]

Test policy loss: 5.96780 | Test value loss: 1.06963

No improvement in Test Loss (5.96780 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 95
---------
Train policy loss: 0.20778 | Train value loss: 0.09361



 48%|████▊     | 96/200 [05:55<06:41,  3.86s/it]

Test policy loss: 5.96912 | Test value loss: 1.07283

No improvement in Test Loss (5.96912 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 96
---------
Train policy loss: 0.20487 | Train value loss: 0.09352



 48%|████▊     | 97/200 [05:59<06:35,  3.84s/it]

Test policy loss: 6.00692 | Test value loss: 1.07431

No improvement in Test Loss (6.00692 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 97
---------
Train policy loss: 0.20394 | Train value loss: 0.09373



 49%|████▉     | 98/200 [06:03<06:28,  3.81s/it]

Test policy loss: 6.00965 | Test value loss: 1.07783

No improvement in Test Loss (6.00965 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 98
---------
Train policy loss: 0.22263 | Train value loss: 0.09638



 50%|████▉     | 99/200 [06:07<06:21,  3.78s/it]

Test policy loss: 6.01116 | Test value loss: 1.07836

No improvement in Test Loss (6.01116 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 99
---------
Train policy loss: 0.20602 | Train value loss: 0.09421



 50%|█████     | 100/200 [06:10<06:21,  3.82s/it]

Test policy loss: 5.98595 | Test value loss: 1.07283

No improvement in Test Loss (5.98595 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 100
---------
Train policy loss: 0.20957 | Train value loss: 0.09373



 50%|█████     | 101/200 [06:14<06:12,  3.76s/it]

Test policy loss: 6.00183 | Test value loss: 1.07516

No improvement in Test Loss (6.00183 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 101
---------
Train policy loss: 0.20864 | Train value loss: 0.09324



 51%|█████     | 102/200 [06:18<06:10,  3.78s/it]

Test policy loss: 5.96826 | Test value loss: 1.07014

No improvement in Test Loss (5.96826 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 102
---------
Train policy loss: 0.21255 | Train value loss: 0.09894



 52%|█████▏    | 103/200 [06:22<06:12,  3.84s/it]

Test policy loss: 6.02578 | Test value loss: 1.08392

No improvement in Test Loss (6.02578 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 103
---------
Train policy loss: 0.20801 | Train value loss: 0.09354



 52%|█████▏    | 104/200 [06:26<06:17,  3.93s/it]

Test policy loss: 5.96430 | Test value loss: 1.07091

No improvement in Test Loss (5.96430 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 104
---------
Train policy loss: 0.20694 | Train value loss: 0.09333



 52%|█████▎    | 105/200 [06:30<06:19,  3.99s/it]

Test policy loss: 5.98381 | Test value loss: 1.07056

No improvement in Test Loss (5.98381 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 105
---------
Train policy loss: 0.20743 | Train value loss: 0.09463



 53%|█████▎    | 106/200 [06:34<06:11,  3.95s/it]

Test policy loss: 6.00558 | Test value loss: 1.07504

No improvement in Test Loss (6.00558 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 106
---------
Train policy loss: 0.20702 | Train value loss: 0.09349



 54%|█████▎    | 107/200 [06:38<06:06,  3.94s/it]

Test policy loss: 6.00305 | Test value loss: 1.07306

No improvement in Test Loss (6.00305 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 107
---------
Train policy loss: 0.20676 | Train value loss: 0.09344



 54%|█████▍    | 108/200 [06:42<06:01,  3.93s/it]

Test policy loss: 5.98693 | Test value loss: 1.06483

No improvement in Test Loss (5.98693 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 108
---------
Train policy loss: 0.20785 | Train value loss: 0.09347



 55%|█████▍    | 109/200 [06:46<05:52,  3.88s/it]

Test policy loss: 5.99002 | Test value loss: 1.07000

No improvement in Test Loss (5.99002 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 109
---------
Train policy loss: 0.20268 | Train value loss: 0.09336



 55%|█████▌    | 110/200 [06:49<05:47,  3.86s/it]

Test policy loss: 5.99530 | Test value loss: 1.07245

No improvement in Test Loss (5.99530 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 110
---------
Train policy loss: 0.21225 | Train value loss: 0.09321



 56%|█████▌    | 111/200 [06:53<05:43,  3.86s/it]

Test policy loss: 5.98091 | Test value loss: 1.07299

No improvement in Test Loss (5.98091 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 111
---------
Train policy loss: 0.20631 | Train value loss: 0.09347



 56%|█████▌    | 112/200 [06:57<05:39,  3.85s/it]

Test policy loss: 5.99221 | Test value loss: 1.07197

No improvement in Test Loss (5.99221 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 112
---------
Train policy loss: 0.20444 | Train value loss: 0.09354



 56%|█████▋    | 113/200 [07:01<05:30,  3.80s/it]

Test policy loss: 5.99166 | Test value loss: 1.07552

No improvement in Test Loss (5.99166 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 113
---------
Train policy loss: 0.20305 | Train value loss: 0.09317



 57%|█████▋    | 114/200 [07:05<05:30,  3.84s/it]

Test policy loss: 5.99000 | Test value loss: 1.07612

No improvement in Test Loss (5.99000 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 114
---------
Train policy loss: 0.20808 | Train value loss: 0.09366



 57%|█████▊    | 115/200 [07:09<05:29,  3.88s/it]

Test policy loss: 5.97529 | Test value loss: 1.07218

No improvement in Test Loss (5.97529 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 115
---------
Train policy loss: 0.20891 | Train value loss: 0.09374



 58%|█████▊    | 116/200 [07:13<05:25,  3.87s/it]

Test policy loss: 6.00155 | Test value loss: 1.07682

No improvement in Test Loss (6.00155 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 116
---------
Train policy loss: 0.20459 | Train value loss: 0.09592



 58%|█████▊    | 117/200 [07:16<05:17,  3.83s/it]

Test policy loss: 6.01397 | Test value loss: 1.07740

No improvement in Test Loss (6.01397 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 117
---------
Train policy loss: 0.20906 | Train value loss: 0.09356



 59%|█████▉    | 118/200 [07:20<05:12,  3.81s/it]

Test policy loss: 5.99200 | Test value loss: 1.07781

No improvement in Test Loss (5.99200 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 118
---------
Train policy loss: 0.20549 | Train value loss: 0.09354



 60%|█████▉    | 119/200 [07:24<05:14,  3.88s/it]

Test policy loss: 5.96411 | Test value loss: 1.07073

No improvement in Test Loss (5.96411 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 119
---------
Train policy loss: 0.20915 | Train value loss: 0.09293



 60%|██████    | 120/200 [07:28<05:08,  3.86s/it]

Test policy loss: 5.97956 | Test value loss: 1.07487

No improvement in Test Loss (5.97956 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 120
---------
Train policy loss: 0.20989 | Train value loss: 0.09347



 60%|██████    | 121/200 [07:32<05:03,  3.85s/it]

Test policy loss: 5.98902 | Test value loss: 1.07531

No improvement in Test Loss (5.98902 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 121
---------
Train policy loss: 0.21036 | Train value loss: 0.09597



 61%|██████    | 122/200 [07:35<04:57,  3.82s/it]

Test policy loss: 6.01123 | Test value loss: 1.07660

No improvement in Test Loss (6.01123 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 122
---------
Train policy loss: 0.20454 | Train value loss: 0.09394



 62%|██████▏   | 123/200 [07:39<04:56,  3.85s/it]

Test policy loss: 6.02879 | Test value loss: 1.07503

No improvement in Test Loss (6.02879 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 123
---------
Train policy loss: 0.20494 | Train value loss: 0.09315



 62%|██████▏   | 124/200 [07:43<04:54,  3.87s/it]

Test policy loss: 5.99944 | Test value loss: 1.07146

No improvement in Test Loss (5.99944 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 124
---------
Train policy loss: 0.20699 | Train value loss: 0.09352



 62%|██████▎   | 125/200 [07:47<04:48,  3.84s/it]

Test policy loss: 6.04109 | Test value loss: 1.07971

No improvement in Test Loss (6.04109 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 125
---------
Train policy loss: 0.20583 | Train value loss: 0.09767



 63%|██████▎   | 126/200 [07:51<04:46,  3.87s/it]

Test policy loss: 5.98775 | Test value loss: 1.07467

No improvement in Test Loss (5.98775 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 126
---------
Train policy loss: 0.20740 | Train value loss: 0.09359



 64%|██████▎   | 127/200 [07:55<04:39,  3.83s/it]

Test policy loss: 5.98995 | Test value loss: 1.07645

No improvement in Test Loss (5.98995 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 127
---------
Train policy loss: 0.20935 | Train value loss: 0.09336



 64%|██████▍   | 128/200 [07:59<04:39,  3.88s/it]

Test policy loss: 5.97527 | Test value loss: 1.07049

No improvement in Test Loss (5.97527 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 128
---------
Train policy loss: 0.20917 | Train value loss: 0.09341



 64%|██████▍   | 129/200 [08:02<04:32,  3.84s/it]

Test policy loss: 5.98459 | Test value loss: 1.07030

No improvement in Test Loss (5.98459 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 129
---------
Train policy loss: 0.20938 | Train value loss: 0.09319



 65%|██████▌   | 130/200 [08:06<04:30,  3.86s/it]

Test policy loss: 6.02707 | Test value loss: 1.07591

No improvement in Test Loss (6.02707 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 130
---------
Train policy loss: 0.20434 | Train value loss: 0.09303



 66%|██████▌   | 131/200 [08:10<04:22,  3.81s/it]

Test policy loss: 5.98660 | Test value loss: 1.06926

No improvement in Test Loss (5.98660 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 131
---------
Train policy loss: 0.20695 | Train value loss: 0.09404



 66%|██████▌   | 132/200 [08:14<04:18,  3.79s/it]

Test policy loss: 6.02980 | Test value loss: 1.08028

No improvement in Test Loss (6.02980 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 132
---------
Train policy loss: 0.20663 | Train value loss: 0.09343



 66%|██████▋   | 133/200 [08:18<04:12,  3.76s/it]

Test policy loss: 5.99461 | Test value loss: 1.07342

No improvement in Test Loss (5.99461 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 133
---------
Train policy loss: 0.21903 | Train value loss: 0.09881



 67%|██████▋   | 134/200 [08:21<04:07,  3.75s/it]

Test policy loss: 6.05350 | Test value loss: 1.08217

No improvement in Test Loss (6.05350 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 134
---------
Train policy loss: 0.20857 | Train value loss: 0.09338



 68%|██████▊   | 135/200 [08:25<04:07,  3.80s/it]

Test policy loss: 5.98605 | Test value loss: 1.07978

No improvement in Test Loss (5.98605 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 135
---------
Train policy loss: 0.20692 | Train value loss: 0.09342



 68%|██████▊   | 136/200 [08:29<04:03,  3.81s/it]

Test policy loss: 5.99174 | Test value loss: 1.07524

No improvement in Test Loss (5.99174 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 136
---------
Train policy loss: 0.20479 | Train value loss: 0.09321



 68%|██████▊   | 137/200 [08:33<03:57,  3.78s/it]

Test policy loss: 5.97507 | Test value loss: 1.07305

No improvement in Test Loss (5.97507 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 137
---------
Train policy loss: 0.21353 | Train value loss: 0.09497



 69%|██████▉   | 138/200 [08:37<03:54,  3.79s/it]

Test policy loss: 6.02336 | Test value loss: 1.07982

No improvement in Test Loss (6.02336 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 138
---------
Train policy loss: 0.21158 | Train value loss: 0.09470



 70%|██████▉   | 139/200 [08:40<03:51,  3.80s/it]

Test policy loss: 6.03466 | Test value loss: 1.08043

No improvement in Test Loss (6.03466 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 139
---------
Train policy loss: 0.21072 | Train value loss: 0.09328



 70%|███████   | 140/200 [08:44<03:47,  3.80s/it]

Test policy loss: 5.98621 | Test value loss: 1.06980

No improvement in Test Loss (5.98621 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 140
---------
Train policy loss: 0.20472 | Train value loss: 0.09351



 70%|███████   | 141/200 [08:48<03:41,  3.76s/it]

Test policy loss: 6.01860 | Test value loss: 1.07412

No improvement in Test Loss (6.01860 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 141
---------
Train policy loss: 0.20390 | Train value loss: 0.09366



 71%|███████   | 142/200 [08:52<03:40,  3.80s/it]

Test policy loss: 6.00943 | Test value loss: 1.07521

No improvement in Test Loss (6.00943 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 142
---------
Train policy loss: 0.20971 | Train value loss: 0.09541



 72%|███████▏  | 143/200 [08:56<03:44,  3.93s/it]

Test policy loss: 5.99827 | Test value loss: 1.07744

No improvement in Test Loss (5.99827 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 143
---------
Train policy loss: 0.20840 | Train value loss: 0.09675



 72%|███████▏  | 144/200 [09:00<03:41,  3.96s/it]

Test policy loss: 6.01316 | Test value loss: 1.07748

No improvement in Test Loss (6.01316 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 144
---------
Train policy loss: 0.20970 | Train value loss: 0.09525



 72%|███████▎  | 145/200 [09:04<03:36,  3.93s/it]

Test policy loss: 5.99656 | Test value loss: 1.07218

No improvement in Test Loss (5.99656 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 145
---------
Train policy loss: 0.20934 | Train value loss: 0.09384



 73%|███████▎  | 146/200 [09:08<03:32,  3.94s/it]

Test policy loss: 5.98882 | Test value loss: 1.07415

No improvement in Test Loss (5.98882 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 146
---------
Train policy loss: 0.20920 | Train value loss: 0.09362



 74%|███████▎  | 147/200 [09:12<03:28,  3.92s/it]

Test policy loss: 5.96725 | Test value loss: 1.06963

No improvement in Test Loss (5.96725 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 147
---------
Train policy loss: 0.21654 | Train value loss: 0.09316



 74%|███████▍  | 148/200 [09:16<03:23,  3.90s/it]

Test policy loss: 5.98682 | Test value loss: 1.07442

No improvement in Test Loss (5.98682 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 148
---------
Train policy loss: 0.21198 | Train value loss: 0.09506



 74%|███████▍  | 149/200 [09:19<03:17,  3.87s/it]

Test policy loss: 5.99275 | Test value loss: 1.07453

No improvement in Test Loss (5.99275 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 149
---------
Train policy loss: 0.21045 | Train value loss: 0.09400



 75%|███████▌  | 150/200 [09:23<03:13,  3.86s/it]

Test policy loss: 5.99877 | Test value loss: 1.07078

No improvement in Test Loss (5.99877 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 150
---------
Train policy loss: 0.20964 | Train value loss: 0.09331



 76%|███████▌  | 151/200 [09:27<03:12,  3.94s/it]

Test policy loss: 6.00488 | Test value loss: 1.07000

No improvement in Test Loss (6.00488 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 151
---------
Train policy loss: 0.21297 | Train value loss: 0.10269



 76%|███████▌  | 152/200 [09:31<03:08,  3.93s/it]

Test policy loss: 5.98773 | Test value loss: 1.07254

No improvement in Test Loss (5.98773 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 152
---------
Train policy loss: 0.20892 | Train value loss: 0.09379



 76%|███████▋  | 153/200 [09:35<03:02,  3.88s/it]

Test policy loss: 5.98485 | Test value loss: 1.07091

No improvement in Test Loss (5.98485 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 153
---------
Train policy loss: 0.20928 | Train value loss: 0.09316



 77%|███████▋  | 154/200 [09:39<02:58,  3.87s/it]

Test policy loss: 5.98488 | Test value loss: 1.07433

No improvement in Test Loss (5.98488 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 154
---------
Train policy loss: 0.20974 | Train value loss: 0.09314



 78%|███████▊  | 155/200 [09:43<02:52,  3.84s/it]

Test policy loss: 5.98826 | Test value loss: 1.07113

No improvement in Test Loss (5.98826 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 155
---------
Train policy loss: 0.20867 | Train value loss: 0.09864



 78%|███████▊  | 156/200 [09:46<02:48,  3.82s/it]

Test policy loss: 6.02530 | Test value loss: 1.07675

No improvement in Test Loss (6.02530 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 156
---------
Train policy loss: 0.21121 | Train value loss: 0.09311



 78%|███████▊  | 157/200 [09:50<02:43,  3.81s/it]

Test policy loss: 5.99749 | Test value loss: 1.07676

No improvement in Test Loss (5.99749 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 157
---------
Train policy loss: 0.20986 | Train value loss: 0.09523



 79%|███████▉  | 158/200 [09:54<02:40,  3.82s/it]

Test policy loss: 6.01907 | Test value loss: 1.07887

No improvement in Test Loss (6.01907 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 158
---------
Train policy loss: 0.20948 | Train value loss: 0.09400



 80%|███████▉  | 159/200 [09:58<02:36,  3.82s/it]

Test policy loss: 6.01822 | Test value loss: 1.08185

No improvement in Test Loss (6.01822 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 159
---------
Train policy loss: 0.22697 | Train value loss: 0.09532



 80%|████████  | 160/200 [10:02<02:32,  3.82s/it]

Test policy loss: 6.03717 | Test value loss: 1.07784

No improvement in Test Loss (6.03717 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 160
---------
Train policy loss: 0.20966 | Train value loss: 0.09331



 80%|████████  | 161/200 [10:05<02:28,  3.81s/it]

Test policy loss: 5.96261 | Test value loss: 1.06992

No improvement in Test Loss (5.96261 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 161
---------
Train policy loss: 0.20950 | Train value loss: 0.09329



 81%|████████  | 162/200 [10:09<02:25,  3.83s/it]

Test policy loss: 5.99443 | Test value loss: 1.07276

No improvement in Test Loss (5.99443 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 162
---------
Train policy loss: 0.21532 | Train value loss: 0.09361



 82%|████████▏ | 163/200 [10:13<02:20,  3.80s/it]

Test policy loss: 6.01378 | Test value loss: 1.07583

No improvement in Test Loss (6.01378 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 163
---------
Train policy loss: 0.21970 | Train value loss: 0.09824



 82%|████████▏ | 164/200 [10:17<02:17,  3.83s/it]

Test policy loss: 6.00923 | Test value loss: 1.07698

No improvement in Test Loss (6.00923 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 164
---------
Train policy loss: 0.20988 | Train value loss: 0.09341



 82%|████████▎ | 165/200 [10:21<02:15,  3.87s/it]

Test policy loss: 5.99371 | Test value loss: 1.07327

No improvement in Test Loss (5.99371 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 165
---------
Train policy loss: 0.21322 | Train value loss: 0.09727



 83%|████████▎ | 166/200 [10:25<02:11,  3.88s/it]

Test policy loss: 6.04672 | Test value loss: 1.08513

No improvement in Test Loss (6.04672 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 166
---------
Train policy loss: 0.20920 | Train value loss: 0.09643



 84%|████████▎ | 167/200 [10:28<02:06,  3.82s/it]

Test policy loss: 6.01119 | Test value loss: 1.07668

No improvement in Test Loss (6.01119 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 167
---------
Train policy loss: 0.20754 | Train value loss: 0.09355



 84%|████████▍ | 168/200 [10:32<02:01,  3.81s/it]

Test policy loss: 5.97556 | Test value loss: 1.06944

No improvement in Test Loss (5.97556 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 168
---------
Train policy loss: 0.20994 | Train value loss: 0.09381



 84%|████████▍ | 169/200 [10:36<01:59,  3.85s/it]

Test policy loss: 6.02621 | Test value loss: 1.07904

No improvement in Test Loss (6.02621 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 169
---------
Train policy loss: 0.21231 | Train value loss: 0.09320



 85%|████████▌ | 170/200 [10:40<01:54,  3.83s/it]

Test policy loss: 6.00585 | Test value loss: 1.07693

No improvement in Test Loss (6.00585 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 170
---------
Train policy loss: 0.21046 | Train value loss: 0.09334



 86%|████████▌ | 171/200 [10:44<01:49,  3.79s/it]

Test policy loss: 6.00123 | Test value loss: 1.07430

No improvement in Test Loss (6.00123 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 171
---------
Train policy loss: 0.21614 | Train value loss: 0.09697



 86%|████████▌ | 172/200 [10:47<01:45,  3.77s/it]

Test policy loss: 6.00484 | Test value loss: 1.07704

No improvement in Test Loss (6.00484 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 172
---------
Train policy loss: 0.20789 | Train value loss: 0.09349



 86%|████████▋ | 173/200 [10:51<01:42,  3.78s/it]

Test policy loss: 5.98699 | Test value loss: 1.07250

No improvement in Test Loss (5.98699 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 173
---------
Train policy loss: 0.21250 | Train value loss: 0.09337



 87%|████████▋ | 174/200 [10:55<01:40,  3.87s/it]

Test policy loss: 5.99585 | Test value loss: 1.07704

No improvement in Test Loss (5.99585 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 174
---------
Train policy loss: 0.21844 | Train value loss: 0.10314



 88%|████████▊ | 175/200 [10:59<01:36,  3.87s/it]

Test policy loss: 6.05882 | Test value loss: 1.08591

No improvement in Test Loss (6.05882 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 175
---------
Train policy loss: 0.20590 | Train value loss: 0.10228



 88%|████████▊ | 176/200 [11:03<01:32,  3.87s/it]

Test policy loss: 5.98446 | Test value loss: 1.07099

No improvement in Test Loss (5.98446 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 176
---------
Train policy loss: 0.21097 | Train value loss: 0.09761



 88%|████████▊ | 177/200 [11:07<01:28,  3.83s/it]

Test policy loss: 5.98290 | Test value loss: 1.07202

No improvement in Test Loss (5.98290 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 177
---------
Train policy loss: 0.20552 | Train value loss: 0.09531



 89%|████████▉ | 178/200 [11:11<01:24,  3.84s/it]

Test policy loss: 6.00436 | Test value loss: 1.07550

No improvement in Test Loss (6.00436 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 178
---------
Train policy loss: 0.20943 | Train value loss: 0.09324



 90%|████████▉ | 179/200 [11:15<01:21,  3.90s/it]

Test policy loss: 5.97354 | Test value loss: 1.07044

No improvement in Test Loss (5.97354 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 179
---------
Train policy loss: 0.20690 | Train value loss: 0.09344



 90%|█████████ | 180/200 [11:19<01:18,  3.92s/it]

Test policy loss: 5.97596 | Test value loss: 1.07403

No improvement in Test Loss (5.97596 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 180
---------
Train policy loss: 0.20576 | Train value loss: 0.09335



 90%|█████████ | 181/200 [11:23<01:15,  3.95s/it]

Test policy loss: 6.00891 | Test value loss: 1.07797

No improvement in Test Loss (6.00891 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 181
---------
Train policy loss: 0.20866 | Train value loss: 0.09340



 91%|█████████ | 182/200 [11:27<01:11,  3.95s/it]

Test policy loss: 5.97914 | Test value loss: 1.07168

No improvement in Test Loss (5.97914 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 182
---------
Train policy loss: 0.21093 | Train value loss: 0.09319



 92%|█████████▏| 183/200 [11:30<01:06,  3.89s/it]

Test policy loss: 5.99647 | Test value loss: 1.07218

No improvement in Test Loss (5.99647 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 183
---------
Train policy loss: 0.21177 | Train value loss: 0.09432



 92%|█████████▏| 184/200 [11:34<01:02,  3.91s/it]

Test policy loss: 6.02187 | Test value loss: 1.07885

No improvement in Test Loss (6.02187 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 184
---------
Train policy loss: 0.20900 | Train value loss: 0.09317



 92%|█████████▎| 185/200 [11:38<00:58,  3.91s/it]

Test policy loss: 5.98389 | Test value loss: 1.07089

No improvement in Test Loss (5.98389 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 185
---------
Train policy loss: 0.21119 | Train value loss: 0.09378



 93%|█████████▎| 186/200 [11:42<00:54,  3.87s/it]

Test policy loss: 6.00494 | Test value loss: 1.07734

No improvement in Test Loss (6.00494 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 186
---------
Train policy loss: 0.21867 | Train value loss: 0.09487



 94%|█████████▎| 187/200 [11:46<00:49,  3.82s/it]

Test policy loss: 6.00887 | Test value loss: 1.07461

No improvement in Test Loss (6.00887 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 187
---------
Train policy loss: 0.20653 | Train value loss: 0.09315



 94%|█████████▍| 188/200 [11:49<00:45,  3.82s/it]

Test policy loss: 5.99309 | Test value loss: 1.07428

No improvement in Test Loss (5.99309 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 188
---------
Train policy loss: 0.20584 | Train value loss: 0.09345



 94%|█████████▍| 189/200 [11:53<00:42,  3.83s/it]

Test policy loss: 5.95746 | Test value loss: 1.06681

No improvement in Test Loss (5.95746 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 189
---------
Train policy loss: 0.20895 | Train value loss: 0.09319



 95%|█████████▌| 190/200 [11:58<00:39,  3.95s/it]

Test policy loss: 5.99164 | Test value loss: 1.07273

No improvement in Test Loss (5.99164 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 190
---------
Train policy loss: 0.21140 | Train value loss: 0.09513



 96%|█████████▌| 191/200 [12:02<00:36,  4.02s/it]

Test policy loss: 6.00948 | Test value loss: 1.07853

No improvement in Test Loss (6.00948 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 191
---------
Train policy loss: 0.20782 | Train value loss: 0.09320



 96%|█████████▌| 192/200 [12:05<00:31,  3.91s/it]

Test policy loss: 5.97602 | Test value loss: 1.07235

No improvement in Test Loss (5.97602 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 192
---------
Train policy loss: 0.21662 | Train value loss: 0.09485



 96%|█████████▋| 193/200 [12:09<00:27,  3.88s/it]

Test policy loss: 6.04854 | Test value loss: 1.08339

No improvement in Test Loss (6.04854 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 193
---------
Train policy loss: 0.20797 | Train value loss: 0.09328



 97%|█████████▋| 194/200 [12:13<00:23,  3.86s/it]

Test policy loss: 5.97894 | Test value loss: 1.07090

No improvement in Test Loss (5.97894 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 194
---------
Train policy loss: 0.21005 | Train value loss: 0.09351



 98%|█████████▊| 195/200 [12:17<00:19,  3.87s/it]

Test policy loss: 5.97707 | Test value loss: 1.07031

No improvement in Test Loss (5.97707 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 195
---------
Train policy loss: 0.21189 | Train value loss: 0.09665



 98%|█████████▊| 196/200 [12:21<00:15,  3.84s/it]

Test policy loss: 6.01449 | Test value loss: 1.07947

No improvement in Test Loss (6.01449 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 196
---------
Train policy loss: 0.20719 | Train value loss: 0.09343



 98%|█████████▊| 197/200 [12:25<00:11,  3.84s/it]

Test policy loss: 5.97922 | Test value loss: 1.07088

No improvement in Test Loss (5.97922 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 197
---------
Train policy loss: 0.20997 | Train value loss: 0.09351



 99%|█████████▉| 198/200 [12:29<00:07,  3.89s/it]

Test policy loss: 5.97959 | Test value loss: 1.07263

No improvement in Test Loss (5.97959 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 198
---------
Train policy loss: 0.20860 | Train value loss: 0.09339



100%|█████████▉| 199/200 [12:32<00:03,  3.87s/it]

Test policy loss: 5.98638 | Test value loss: 1.07651

No improvement in Test Loss (5.98638 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08
Epoch: 199
---------
Train policy loss: 0.21549 | Train value loss: 0.09409



100%|██████████| 200/200 [12:36<00:00,  3.78s/it]

Test policy loss: 6.00758 | Test value loss: 1.07568

No improvement in Test Loss (6.00758 vs best 5.29527).
Current Learning Rate: 1.0000000000000004e-08



