# Covulutional Neural Networks Training with Hyperparameter Tuning on All Feature Dataset  

In [None]:
enable_wandb = True
use_gpu = True

In [None]:
# Install torch
!pip install torch -q

In [None]:
# Install torchinfo
!pip install torchinfo -q

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# Install dependencies
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle scipy==1.10.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import necessary libraries and packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torchinfo import summary

# Check GPU status
if torch.cuda.is_available():
 print("GPU is available!")
else:
 print("GPU is not available.")

GPU is available!


In [None]:
gpu_available = torch.cuda.is_available()
gpu_available

True

In [None]:
# Initialize NumerAPI - the official Python API client for Numerai
from numerapi import NumerAPI
napi = NumerAPI()

# List the datasets and available versions
all_datasets = napi.list_datasets()
dataset_versions = list(set(d.split('/')[0] for d in all_datasets))
print("Available versions:\n", dataset_versions)

# Set data version to one of the latest datasets
DATA_VERSION = "v4.3"

# Print all files available for download for our version
current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]
print("availbable", DATA_VERSION, "files:\n", current_version_files)

Available versions:
 ['v4.3', 'v4.1', 'v4.2', 'v4']
availbable v4.3 files:
 ['v4.3/features.json', 'v4.3/live_benchmark_models.parquet', 'v4.3/live_example_preds.csv', 'v4.3/live_example_preds.parquet', 'v4.3/live_int8.parquet', 'v4.3/meta_model.parquet', 'v4.3/train_benchmark_models.parquet', 'v4.3/train_int8.parquet', 'v4.3/validation_benchmark_models.parquet', 'v4.3/validation_example_preds.csv', 'v4.3/validation_example_preds.parquet', 'v4.3/validation_int8.parquet']


In [None]:
# Download the feature metadata file
napi.download_dataset(f"{DATA_VERSION}/features.json");

# Read the metadata and display
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

v4.3/features.json: 1.12MB [00:00, 1.28MB/s]                           

feature_stats 2376
feature_sets 17
targets 41





In [None]:
# Check the number of features for each feature set
feature_sets = feature_metadata["feature_sets"]
for feature_set in ["small", "medium", "all"]:
  print(feature_set, len(feature_sets[feature_set]))

small 42
medium 705
all 2376


In [None]:
# Define our feature set
feature_set = feature_sets["all"]

# Download the training data - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet");

# Load only the "medium" feature set to
# Use the "all" feature set to use all features
train = pd.read_parquet(
    f"{DATA_VERSION}/train_int8.parquet",
    columns=["era", "target"] + feature_set
)

train.head()

v4.3/train_int8.parquet: 2.09GB [02:32, 13.7MB/s]                            


Unnamed: 0_level_0,era,target,feature_aaronic_unexampled_arguer,feature_abactinal_inventable_luminescence,feature_abating_unadaptable_weakfish,feature_abdominal_subtriplicate_fin,feature_abducent_unbeneficed_lithophyte,feature_abducted_euphonic_pipewort,feature_ablest_mauritanian_elding,feature_abreast_viscoelastic_commander,...,feature_yelled_hysteretic_eath,feature_yokelish_metapsychological_lunt,feature_yorkist_authenticated_lotted,feature_yoruban_purplish_directoire,feature_yoruban_unapplied_tawse,feature_zincky_unseemly_butt,feature_zincoid_peccant_greywacke,feature_zoophoric_underglaze_algin,feature_zygodactyl_exponible_lathi,feature_zymotic_roundabout_figuration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,1,0.25,2,2,0,2,3,2,4,2,...,2,2,2,2,2,2,2,2,3,2
n003bee128c2fcfc,1,0.75,3,2,4,2,1,2,2,2,...,3,2,2,2,2,2,2,2,1,2
n0048ac83aff7194,1,0.25,3,2,4,2,2,2,4,4,...,1,2,2,2,2,2,2,2,2,2
n00691bec80d3e02,1,0.75,1,2,1,2,2,2,4,2,...,2,2,2,2,2,2,2,2,2,2
n00b8720a2fdc4f2,1,0.5,1,2,0,2,1,2,2,3,...,1,2,2,2,2,2,2,2,1,2


### Data Preprocessing

In [None]:
# Reset the index of our dataset
train_set = train.reset_index()

In [None]:
# Extract the target and drop unnecessary columns for the train
target = train_set['target']
train = train_set.drop(columns=['era', 'id', 'target'])

In [None]:
# Divide the dataset into train, validation, and test sets
X_train, X_remain, y_train, y_remain = train_test_split(train, target, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_remain, y_remain, test_size=2/3, random_state=42)

In [None]:
# Turn data into tensors
X_train = torch.Tensor(X_train.values)
y_train = torch.Tensor(y_train.values)
X_test = torch.Tensor(X_test.values)
y_test = torch.Tensor(y_test.values)
X_val = torch.Tensor(X_val.values)
y_val = torch.Tensor(y_val.values)

In [None]:
# Put tensors into dataloader
train_loader = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_loader, batch_size=64, shuffle=True)

validation_loader = TensorDataset(X_val, y_val)
val_dataloader = DataLoader(validation_loader, batch_size=64, shuffle=False)

test_loader = TensorDataset(X_test, y_test)
test_dataloader = DataLoader(test_loader, batch_size=64, shuffle=False)

### Design Model Architecture

In [None]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.batch_norm1 = nn.BatchNorm1d(num_features=32)
        self.maxpool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=64)
        self.adaptive_avg_pool = nn.AdaptiveAvgPool1d(output_size=1)
        self.linear1 = nn.Linear(64, 1)


    def forward(self, x):
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.batch_norm1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = torch.relu(x)
        x = self.batch_norm2(x)
        x = self.adaptive_avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.linear1(x)
        x = torch.relu(x)

        return x


# Instantiate the model
model = CNN()
if gpu_available and use_gpu:
  model = model.cuda()
# Print the model architecture
print(model)

CNN(
  (conv1): Conv1d(1, 32, kernel_size=(3,), stride=(2,), padding=(1,))
  (batch_norm1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool1): MaxPool1d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (batch_norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (adaptive_avg_pool): AdaptiveAvgPool1d(output_size=1)
  (linear1): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# Display the model summary
summary(model, (64, 1, 2376))

Layer (type:depth-idx)                   Output Shape              Param #
CNN                                      [64, 1]                   --
├─Conv1d: 1-1                            [64, 32, 1188]            128
├─BatchNorm1d: 1-2                       [64, 32, 1188]            64
├─MaxPool1d: 1-3                         [64, 32, 593]             --
├─Conv1d: 1-4                            [64, 64, 297]             6,208
├─BatchNorm1d: 1-5                       [64, 64, 297]             128
├─AdaptiveAvgPool1d: 1-6                 [64, 64, 1]               --
├─Linear: 1-7                            [64, 1]                   65
Total params: 6,593
Trainable params: 6,593
Non-trainable params: 0
Total mult-adds (M): 127.75
Input size (MB): 0.61
Forward/backward pass size (MB): 58.39
Params size (MB): 0.03
Estimated Total Size (MB): 59.03

In [None]:
# Choose optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

In [None]:
# Choose loss function
criterion = nn.L1Loss()

In [None]:
enable_wandb = True

In [None]:
# Install wandb for model performance tracking
if enable_wandb:
  !pip install wandb -qU
  import wandb
  wandb.login()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Define configuration for the naive CNN model
if enable_wandb:
  wandb.init(
    # Set the project where this run will be logged
    project="project_CNN",
    # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
    name= "experiment1",
    # Track hyperparameters and run metadata
    config={
    "learning_rate": 0.001,
    "epochs": 10,
    "batch_size": 64,
    "weight_decay": 0.001,
    "notes for me": "This is a CNN"
    })

[34m[1mwandb[0m: Currently logged in as: [33mfreyaj[0m ([33mfreyajiang[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Define a function to track the loss of each epoch
def get_loss(loader, model):
  with torch.no_grad(): # Anything under torch.no_grad will be calculated with no gradients. Can only be used for testing, not training!

    loss = 0
    for i, (features, targets) in enumerate(loader): # The batches.
          if gpu_available and use_gpu:
              features = features.cuda()
              targets = targets.cuda()
          features = features.unsqueeze(1)
          targets = targets.unsqueeze(1)
          outputs = model(features)

          loss = loss + criterion(outputs, targets)
    return loss/ len(loader)

In [None]:
# Define early stop function
class EarlyStopper:
    def __init__(self, patience=1):
        self.patience = patience
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        # If the new loss is lower than the old loss, reset the counter!
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            # Keep track of the best model by saving it on the hard drive.
            torch.save(model, r"./best_model.pt")
        # otherwise, increment the counter.
        elif validation_loss > self.min_validation_loss:
            self.counter += 1
            # If there has been too many epochs with the loss being high, terminate.
            if self.counter >= self.patience:
                return True
        return False

In [None]:
# Instantiate early stopping
early_stopper = EarlyStopper(patience=3)

## Training Naive CNN Model

In [None]:
# Train the model
for epoch in range(10):
    for i, (features, targets) in enumerate(train_dataloader):
        model.train()
        optimizer.zero_grad()
        if gpu_available and use_gpu:
          features = features.cuda()
          targets = targets.cuda()
        features = features.unsqueeze(1)
        targets = targets.unsqueeze(1)
        outputs = model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        if i % 10000 == 0:
          print("Epoch", epoch + 1, " batch", i+1, ". Training Loss: ", loss.item())
          if enable_wandb:
            wandb.log({"loss": loss})

    training_loss = get_loss(train_dataloader, model)
    validation_loss = get_loss(val_dataloader, model)
    print(f'Epoch [{epoch + 1}/{10}], Train Loss: {training_loss.item()}, Validation Loss: {validation_loss.item()}')
    if enable_wandb:
      wandb.log({"epoch": epoch + 1, "train_loss": training_loss.item(), "val_loss": validation_loss.item()})

    if early_stopper.early_stop(validation_loss):
        print("Validation loss hasn't dropped. Early stopping!")
        break

Epoch 1  batch 1 . Training Loss:  0.535932183265686
Epoch 1  batch 10001 . Training Loss:  0.16022354364395142
Epoch 1  batch 20001 . Training Loss:  0.14875328540802002
Epoch [1/10], Train Loss: 0.1499340683221817, Validation Loss: 0.1498262882232666
Epoch 2  batch 1 . Training Loss:  0.15251871943473816
Epoch 2  batch 10001 . Training Loss:  0.17587080597877502
Epoch 2  batch 20001 . Training Loss:  0.11726409196853638
Epoch [2/10], Train Loss: 0.14977216720581055, Validation Loss: 0.14970773458480835
Epoch 3  batch 1 . Training Loss:  0.16417557001113892
Epoch 3  batch 10001 . Training Loss:  0.14081144332885742
Epoch 3  batch 20001 . Training Loss:  0.16035354137420654
Epoch [3/10], Train Loss: 0.14971254765987396, Validation Loss: 0.14962713420391083
Epoch 4  batch 1 . Training Loss:  0.1367497444152832
Epoch 4  batch 10001 . Training Loss:  0.14071042835712433
Epoch 4  batch 20001 . Training Loss:  0.11417432874441147
Epoch [4/10], Train Loss: 0.14974620938301086, Validation Los

In [None]:
# Once training is done, load the best model (might not be the last model due to early stopping)
model1 = torch.load(r"./best_model.pt")
test_loss = get_loss(test_dataloader, model1)
print("best model performance Test set:", test_loss)

best model performance Test set: tensor(0.1495, device='cuda:0')


In [None]:
if enable_wandb:
  wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▄▅▇█
loss,█▂▂▂▂▁▂▁▂▁▁▁▁▂▂▂▁▂
train_loss,█▃▁▂▁▄
val_loss,█▄▁▃▂▄

0,1
Test Loss,0.14952
epoch,6.0
loss,0.14468
train_loss,0.1498
val_loss,0.14972


## Training CNN Model with Hyperparameter Tuning

In [None]:
import torch
import torch.nn as nn
import wandb

wandb.login()

# Define sweep configuration
sweep_configuration = {
    "name": "sweep",
    "method": "grid",
    "metric": {"goal": "minimize", "name": "validation_loss"},
    "parameters": {
        "learning_rate": {"values": [1e-2, 1e-3]},
        "batch_size": {"values": [64, 128]},
    },
}

def train(config=None):
    # Initialize WandB
    run = wandb.init()

    # Your model, optimizer, loss function, and data loaders
    model = CNN()
    optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate, weight_decay=0.001)
    criterion = nn.L1Loss()
    early_stopping = EarlyStopper(patience=3)
    train_dataloader = DataLoader(train_loader, batch_size=wandb.config.batch_size, shuffle=True)
    val_dataloader = DataLoader(validation_loader, batch_size=wandb.config.batch_size, shuffle=False)

    # Training loop
    for epoch in range(10):
        for i, (features, targets) in enumerate(train_dataloader):
            model.train()
            optimizer.zero_grad()
            if gpu_available and use_gpu:
                model = model.cuda()
                features = features.cuda()
                targets = targets.cuda()
            features = features.unsqueeze(1)
            targets = targets.unsqueeze(1)
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if i % 10000 == 0:
                print("Epoch", epoch + 1, " batch", i+1, ". Training Loss: ", loss.item())
                if enable_wandb:
                    wandb.log({"loss": loss})

        # Calculate training and validation loss
        training_loss = get_loss(train_dataloader, model)
        validation_loss = get_loss(val_dataloader, model)
        print(f'Epoch [{epoch + 1}/{10}], Train Loss: {training_loss.item()}, Validation Loss: {validation_loss.item()}')

        # Log loss values to wandb
        if enable_wandb:
            wandb.log({"epoch": epoch + 1, "train_loss": training_loss.item(), "val_loss": validation_loss.item()})

        # Check for early stopping
        if early_stopping.early_stop(validation_loss):
            print("Validation loss hasn't dropped. Early stopping!")
            break

sweep_id = wandb.sweep(sweep=sweep_configuration, project="project-cnn-sweep")

# Start the sweep
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Currently logged in as: [33mfreyaj[0m ([33mfreyajiang[0m). Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: npg6olxv
Sweep URL: https://wandb.ai/freyajiang/project-cnn-sweep/sweeps/npg6olxv


[34m[1mwandb[0m: Agent Starting Run: ohaebdym with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.01


Epoch 1  batch 1 . Training Loss:  0.49134498834609985
Epoch 1  batch 10001 . Training Loss:  0.12915204465389252
Epoch 1  batch 20001 . Training Loss:  0.13301435112953186
Epoch [1/10], Train Loss: 0.15016739070415497, Validation Loss: 0.15006627142429352
Epoch 2  batch 1 . Training Loss:  0.1606120467185974
Epoch 2  batch 10001 . Training Loss:  0.15389466285705566
Epoch 2  batch 20001 . Training Loss:  0.17243734002113342
Epoch [2/10], Train Loss: 0.15421655774116516, Validation Loss: 0.15411049127578735
Epoch 3  batch 1 . Training Loss:  0.16409936547279358
Epoch 3  batch 10001 . Training Loss:  0.15718121826648712
Epoch 3  batch 20001 . Training Loss:  0.17316392064094543
Epoch [3/10], Train Loss: 0.15106455981731415, Validation Loss: 0.15096409618854523
Epoch 4  batch 1 . Training Loss:  0.13008925318717957
Epoch 4  batch 10001 . Training Loss:  0.14469817280769348
Epoch 4  batch 20001 . Training Loss:  0.12217612564563751
Epoch [4/10], Train Loss: 0.1500667929649353, Validation 

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▁▁▂▂▂▂▂▂▁▁▁▁▂▂▁▁▁▁▂▂▁▂▁▁▁▁▂▂▂
train_loss,▁▇▃▁▁█▁▃▃▂
val_loss,▁▇▃▁▁█▁▃▃▂

0,1
epoch,10.0
loss,0.15013
train_loss,0.15038
val_loss,0.15028


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: k41i0wb1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1  batch 1 . Training Loss:  0.4885199964046478
Epoch 1  batch 10001 . Training Loss:  0.18409141898155212
Epoch 1  batch 20001 . Training Loss:  0.16045378148555756
Epoch [1/10], Train Loss: 0.15046894550323486, Validation Loss: 0.15037208795547485
Epoch 2  batch 1 . Training Loss:  0.17643283307552338
Epoch 2  batch 10001 . Training Loss:  0.11759474873542786
Epoch 2  batch 20001 . Training Loss:  0.1683395653963089
Epoch [2/10], Train Loss: 0.1500239074230194, Validation Loss: 0.14992879331111908
Epoch 3  batch 1 . Training Loss:  0.14875829219818115
Epoch 3  batch 10001 . Training Loss:  0.17602869868278503
Epoch 3  batch 20001 . Training Loss:  0.14499790966510773
Epoch [3/10], Train Loss: 0.14996600151062012, Validation Loss: 0.1498720347881317
Epoch 4  batch 1 . Training Loss:  0.14871719479560852
Epoch 4  batch 10001 . Training Loss:  0.19150158762931824
Epoch 4  batch 20001 . Training Loss:  0.1452411413192749
Epoch [4/10], Train Loss: 0.14994069933891296, Validation Los

VBox(children=(Label(value='0.002 MB of 0.014 MB uploaded\r'), FloatProgress(value=0.12937358916478556, max=1.…

0,1
epoch,▁▂▃▄▅▆▇█
loss,█▂▂▂▁▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▂▁▁▂
train_loss,█▃▃▂▁▅▇▄
val_loss,█▃▃▂▁▅▇▃

0,1
epoch,8.0
loss,0.15281
train_loss,0.15004
val_loss,0.14994


[34m[1mwandb[0m: Agent Starting Run: 9ei5va6c with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.01


Epoch 1  batch 1 . Training Loss:  0.4485552906990051
Epoch 1  batch 10001 . Training Loss:  0.15033277869224548
Epoch [1/10], Train Loss: 0.1506001502275467, Validation Loss: 0.15049661695957184
Epoch 2  batch 1 . Training Loss:  0.16280457377433777
Epoch 2  batch 10001 . Training Loss:  0.15129302442073822
Epoch [2/10], Train Loss: 0.14987291395664215, Validation Loss: 0.14977458119392395
Epoch 3  batch 1 . Training Loss:  0.17202851176261902
Epoch 3  batch 10001 . Training Loss:  0.13614621758460999
Epoch [3/10], Train Loss: 0.149770587682724, Validation Loss: 0.14966534078121185
Epoch 4  batch 1 . Training Loss:  0.1484878957271576
Epoch 4  batch 10001 . Training Loss:  0.19167758524417877
Epoch [4/10], Train Loss: 0.15091216564178467, Validation Loss: 0.1508076786994934
Epoch 5  batch 1 . Training Loss:  0.13803544640541077
Epoch 5  batch 10001 . Training Loss:  0.16924616694450378
Epoch [5/10], Train Loss: 0.1499035656452179, Validation Loss: 0.1497960239648819
Epoch 6  batch 1 .

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▄▅▇█
loss,█▁▂▁▂▁▁▂▁▂▁▁
train_loss,▆▂▁█▂▁
val_loss,▆▂▁█▂▂

0,1
epoch,6.0
loss,0.15654
train_loss,0.14985
val_loss,0.14976


[34m[1mwandb[0m: Agent Starting Run: 99mtjyow with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1  batch 1 . Training Loss:  0.4454803466796875
Epoch 1  batch 10001 . Training Loss:  0.12356576323509216
Epoch [1/10], Train Loss: 0.1501377373933792, Validation Loss: 0.15003560483455658
Epoch 2  batch 1 . Training Loss:  0.1469590961933136
Epoch 2  batch 10001 . Training Loss:  0.15698452293872833
Epoch [2/10], Train Loss: 0.15009655058383942, Validation Loss: 0.1499951034784317
Epoch 3  batch 1 . Training Loss:  0.14101357758045197
Epoch 3  batch 10001 . Training Loss:  0.14296910166740417
Epoch [3/10], Train Loss: 0.14996588230133057, Validation Loss: 0.1498667597770691
Epoch 4  batch 1 . Training Loss:  0.1467316746711731
Epoch 4  batch 10001 . Training Loss:  0.1622784435749054
Epoch [4/10], Train Loss: 0.1498270183801651, Validation Loss: 0.14971651136875153
Epoch 5  batch 1 . Training Loss:  0.14073917269706726
Epoch 5  batch 10001 . Training Loss:  0.1585732400417328
Epoch [5/10], Train Loss: 0.14996489882469177, Validation Loss: 0.14986349642276764
Epoch 6  batch 1 . 

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▅▆▇█
loss,█▁▂▂▁▁▂▂▁▂▂▂▁▂
train_loss,▅▅▃▁▃█▄
val_loss,▅▅▃▁▃█▄

0,1
epoch,7.0
loss,0.15848
train_loss,0.15003
val_loss,0.14992


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
# Once training is done, load the best model (might not be the last model due to early stopping)
model2 = torch.load(r"./best_model.pt")
test_loss = get_loss(test_dataloader, model2)
print("best model performance Test set:", test_loss)

best model performance Test set: tensor(0.3975, device='cuda:0')
