# Problem 3 - Ray Tune for Hyperparameter Optimization

## 3.1

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from ray import tune
from ray.tune.schedulers import HyperBandScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from time import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential

import ray
from ray.tune.integration.keras import TuneReportCallback
from ray.tune.schedulers import ASHAScheduler
from ray.air import session
from ray.tune.search.bayesopt import BayesOptSearch

2023-12-05 11:52:12.760326: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 11:52:18.569590: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-05 11:52:18.569626: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-05 11:52:18.585610: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-05 11:52:18.627292: I tensorflow/core/platform/cpu_feature_guar

In [10]:
def train_mnist(config):
    # Load MNIST data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Add a channels dimension
    x_train = x_train[..., tf.newaxis].astype("float32")
    x_test = x_test[..., tf.newaxis].astype("float32")

    # Define the model
    model = Sequential([
        Conv2D(filters=config["conv_filters"], kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 1)),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(config["dropout"]),
        Dense(10, activation="softmax")
    ])

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    # Train the model
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        batch_size=config["batch_size"],
        epochs=10,
        callbacks=[TuneReportCallback({"accuracy": "accuracy"})]
    )

# Define the search space
search_space = {
    "conv_filters": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(0.001, 0.1),
    "batch_size": tune.choice([64, 128, 256]),
    "dropout": tune.uniform(0, 1)
}

# Initialize Ray
ray.init(ignore_reinit_error=True)

2023-12-05 12:14:55,573	INFO worker.py:1507 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.11.5
Ray version:,2.8.1


In [11]:
def train_mnist(config):
    num_classes = 10
    epochs = 12

    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train.reshape(-1, 28, 28, 1) / 255.0, x_test.reshape(-1, 28, 28, 1) / 255.0
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(filters=config["conv_filters"], kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(config["dropout"]),
        tf.keras.layers.Dense(num_classes, activation="softmax")
    ])

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["lr"]),
        metrics=["accuracy"])

    for epoch in range(epochs):
        model.fit(
            x_train,
            y_train,
            batch_size=config["batch_size"],
            epochs=1,
            verbose=0,
            validation_data=(x_test, y_test))

        # Evaluate the model
        i, accuracy = model.evaluate(x_test, y_test, verbose=0)
        session.report({"mean_accuracy": accuracy})

### Grid 

In [12]:
# Grid Search
start_time = time()

grid_analysis = tune.run(
    train_mnist,
    name="exp",
    metric="mean_accuracy",
    mode="max",
    stop={"mean_accuracy": 0.99},
    resources_per_trial={"gpu": 1},
    config={
        "conv_filters": tune.grid_search([64, 128, 256]),
        "lr": tune.grid_search([0.001, 0.01, 0.1]),
        "batch_size": tune.grid_search([64, 128, 256]),
        "dropout": tune.grid_search([0.0, 0.25, 0.5, 0.75, 0.9]),
    }
)

end_time = time()
grid_time = end_time - start_time

2023-12-05 12:14:58,081	INFO tune.py:586 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-12-05 14:13:29
Running for:,01:58:30.97
Memory:,60.0/377.3 GiB

Trial name,status,loc,batch_size,conv_filters,dropout,lr,acc,iter,total time (s)
train_mnist_d0fea_00000,TERMINATED,10.32.35.81:3179883,64,64,0.0,0.001,0.9875,12,62.2493
train_mnist_d0fea_00001,TERMINATED,10.32.35.81:3184307,128,64,0.0,0.001,0.9879,12,40.4822
train_mnist_d0fea_00002,TERMINATED,10.32.35.81:3186750,256,64,0.0,0.001,0.9875,12,29.8463
train_mnist_d0fea_00003,TERMINATED,10.32.35.81:3189247,64,128,0.0,0.001,0.988,12,61.7737
train_mnist_d0fea_00004,TERMINATED,10.32.35.81:3193022,128,128,0.0,0.001,0.9871,12,40.8085
train_mnist_d0fea_00005,TERMINATED,10.32.35.81:3195582,256,128,0.0,0.001,0.9862,12,33.7739
train_mnist_d0fea_00006,TERMINATED,10.32.35.81:3198748,64,256,0.0,0.001,0.9871,12,66.7068
train_mnist_d0fea_00007,TERMINATED,10.32.35.81:3202163,128,256,0.0,0.001,0.9862,12,50.9287
train_mnist_d0fea_00008,TERMINATED,10.32.35.81:3205785,256,256,0.0,0.001,0.9871,12,43.7762
train_mnist_d0fea_00009,TERMINATED,10.32.35.81:3208531,64,64,0.25,0.001,0.9883,12,64.5574


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

Trial name,mean_accuracy
train_mnist_d0fea_00000,0.9875
train_mnist_d0fea_00001,0.9879
train_mnist_d0fea_00002,0.9875
train_mnist_d0fea_00003,0.988
train_mnist_d0fea_00004,0.9871
train_mnist_d0fea_00005,0.9862
train_mnist_d0fea_00006,0.9871
train_mnist_d0fea_00007,0.9862
train_mnist_d0fea_00008,0.9871
train_mnist_d0fea_00009,0.9883


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

In [13]:
best_trial = grid_analysis.get_best_trial("mean_accuracy", "max", "last")
best_config = best_trial.config

print("Best trial config: {}".format(best_config))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["mean_accuracy"]))
print("Time taken for Grid Search: {} seconds".format(grid_time))
# Naming error; takes too long to run but fixed the error in the code

Best trial config: {'conv_filters': 64, 'lr': 0.001, 'batch_size': 256, 'dropout': 0.25}
Best trial final validation accuracy: 0.9894000291824341
Time taken for Grid Search: 7111.168742179871 seconds


### Bayesian

In [23]:
def train_mnist_bayes(config):
    # Convert continuous parameters back to categorical
    #conv_filters_map = {0: 64, 1: 128, 2: 256}
    batch_size_map = {0: 64, 1: 128, 2: 256}

    config["conv_filters"] = int(round(config["conv_filters"]))
    config["batch_size"] = batch_size_map[int(round(config["batch_size"]))]

    # Load MNIST data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train.reshape(-1, 28, 28, 1) / 255.0, x_test.reshape(-1, 28, 28, 1) / 255.0

    # Define the model
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(filters=config["conv_filters"], kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(config["dropout"]),
        tf.keras.layers.Dense(10, activation="softmax")  # Assuming 10 classes for MNIST
    ])

    # Compile the model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["lr"]),
        metrics=["accuracy"]
    )

    # Train the model
    model.fit(
        x_train,
        y_train,
        batch_size=config["batch_size"],
        epochs=12,  # or any other number of epochs you wish to use
        verbose=1,
        validation_data=(x_test, y_test)
    )

    # Evaluate the model
    i, accuracy = model.evaluate(x_test, y_test, verbose=0)
    session.report({"mean_accuracy": accuracy})

In [31]:
bayes_search_space = {
    "conv_filters": tune.uniform(64,256),
    "lr": tune.loguniform(0.001, 0.1),
    "batch_size": tune.uniform(0, 2),  # 0 for 64, 1 for 128, 2 for 256
    "dropout": tune.uniform(0, 1)
}

# Initialize Bayesian optimization search algorithm
bayesopt_search = BayesOptSearch()

start_time = time()

bayes_analysis = tune.run(
    train_mnist_bayes,
    name="exp_bayes",
    metric="mean_accuracy",
    mode="max",
    stop={"mean_accuracy": 0.99},
    resources_per_trial={"gpu": 1},
    config=search_space,
    search_alg=bayesopt_search,
    num_samples=10
)

end_time = time()
bayes_time = end_time - start_time

2023-12-05 09:44:19,935	INFO tune.py:586 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-12-05 09:51:25
Running for:,00:07:05.54
Memory:,40.6/377.3 GiB

Trial name,status,loc,batch_size,conv_filters,dropout,lr,acc,iter,total time (s)
train_mnist_a19e9cc9,TERMINATED,10.32.35.81:2617307,0.74908,246.537,0.731994,0.0602672,0.1135,1,56.3798
train_mnist_580f069e,TERMINATED,10.32.35.81:2621766,0.312037,93.9509,0.0580836,0.0867514,0.1135,1,54.36
train_mnist_27ad4a92,TERMINATED,10.32.35.81:2626033,1.20223,199.95,0.0205845,0.0970211,0.101,1,36.624
train_mnist_36566aaa,TERMINATED,10.32.35.81:2628692,1.66489,104.769,0.181825,0.019157,0.9587,1,21.6515
train_mnist_3d3d47c2,TERMINATED,10.32.35.81:2630824,0.608484,164.753,0.431945,0.0298317,0.9727,1,34.1948
train_mnist_0f0fb4fc,TERMINATED,10.32.35.81:2634070,1.22371,90.7828,0.292145,0.0372698,0.9589,1,30.6681
train_mnist_f53e8a34,TERMINATED,10.32.35.81:2636148,0.91214,214.754,0.199674,0.0519092,0.9516,1,38.052
train_mnist_050b7b5d,TERMINATED,10.32.35.81:2638393,1.18483,72.9185,0.607545,0.0178819,0.9649,1,30.3309
train_mnist_b010433f,TERMINATED,10.32.35.81:2641581,0.130103,246.186,0.965632,0.0810313,0.1032,1,58.6808
train_mnist_460491b1,TERMINATED,10.32.35.81:2645324,0.609228,82.753,0.684233,0.0445751,0.1135,1,30.4812


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2617307)[0m Epoch 1/12


[36m(train_mnist pid=2617307)[0m 2023-12-05 09:44:27.622406: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2617307)[0m 2023-12-05 09:44:38.600964: I external/local_xla/xla/service/service.cc:168] XLA service 0x14d3902f69f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2617307)[0m 2023-12-05 09:44:38.601007: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2617307)[0m 2023-12-05 09:44:38.610090: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2617307)[0m I0000 00:00:1701787478.845065 2617902 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  9/469 [..............................] - ETA: 3s - loss: 44.1109 - accuracy: 0.0972    
 27/469 [>.............................] - ETA: 2s - loss: 16.2429 - accuracy: 0.1062
 45/469 [=>............................] - ETA: 2s - loss: 10.6681 - accuracy: 0.1043
 63/469 [===>..........................] - ETA: 2s - loss: 8.2774 - accuracy: 0.1063
 81/469 [====>.........................] - ETA: 2s - loss: 6.9504 - accuracy: 0.1068
109/469 [=====>........................] - ETA: 2s - loss: 5.7573 - accuracy: 0.1059
[36m(train_mnist pid=2617307)[0m Epoch 2/12
  1/469 [..............................] - ETA: 3s - loss: 2.3082 - accuracy: 0.0703
 19/469 [>.............................] - ETA: 2s - loss: 2.3056 - accuracy: 0.1040
 38/469 [=>............................] - ETA: 2s - loss: 2.3076 - accuracy: 0.1026
 47/469 [==>...........................] - ETA: 2s - loss: 2.3073 - accuracy: 0.1016
 65/469 [===>..........................] - ETA: 2s - loss: 2.3065 - accuracy: 0.1025
 83/469 [===

Trial name,mean_accuracy
train_mnist_050b7b5d,0.9649
train_mnist_0f0fb4fc,0.9589
train_mnist_27ad4a92,0.101
train_mnist_36566aaa,0.9587
train_mnist_3d3d47c2,0.9727
train_mnist_460491b1,0.1135
train_mnist_580f069e,0.1135
train_mnist_a19e9cc9,0.1135
train_mnist_b010433f,0.1032
train_mnist_f53e8a34,0.9516


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2621766)[0m Epoch 1/12


[36m(train_mnist pid=2621766)[0m 2023-12-05 09:45:25.316133: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2621766)[0m 2023-12-05 09:45:26.460376: I external/local_xla/xla/service/service.cc:168] XLA service 0x1484018104b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2621766)[0m 2023-12-05 09:45:26.460404: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2621766)[0m 2023-12-05 09:45:26.467439: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2621766)[0m I0000 00:00:1701787526.581369 2621957 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


 13/938 [..............................] - ETA: 4s - loss: 33.9118 - accuracy: 0.1130  
 38/938 [>.............................] - ETA: 3s - loss: 13.1240 - accuracy: 0.1053
 51/938 [>.............................] - ETA: 3s - loss: 10.3684 - accuracy: 0.1032
 79/938 [=>............................] - ETA: 3s - loss: 7.5097 - accuracy: 0.1044
105/938 [==>...........................] - ETA: 3s - loss: 6.2209 - accuracy: 0.1018
132/938 [===>..........................] - ETA: 3s - loss: 5.4205 - accuracy: 0.1014
157/938 [====>.........................] - ETA: 3s - loss: 4.9252 - accuracy: 0.1012
182/938 [====>.........................] - ETA: 3s - loss: 4.5656 - accuracy: 0.1002
207/938 [=====>........................] - ETA: 2s - loss: 4.2931 - accuracy: 0.0992
[36m(train_mnist pid=2621766)[0m Epoch 2/12
  1/938 [..............................] - ETA: 3s - loss: 2.3370 - accuracy: 0.0625
 27/938 [..............................] - ETA: 3s - loss: 2.3093 - accuracy: 0.0938
 39/938 [>....

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2626033)[0m Epoch 1/12


[36m(train_mnist pid=2626033)[0m 2023-12-05 09:46:23.191254: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2626033)[0m 2023-12-05 09:46:24.204022: I external/local_xla/xla/service/service.cc:168] XLA service 0x15063553cbe0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2626033)[0m 2023-12-05 09:46:24.204075: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2626033)[0m 2023-12-05 09:46:24.212268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2626033)[0m I0000 00:00:1701787584.326535 2626358 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/469 [..............................] - ETA: 17:58 - loss: 2.3066 - accuracy: 0.1016
 20/469 [>.............................] - ETA: 2s - loss: 28.0685 - accuracy: 0.1066 
 40/469 [=>............................] - ETA: 2s - loss: 15.1894 - accuracy: 0.1023
 60/469 [==>...........................] - ETA: 2s - loss: 10.8950 - accuracy: 0.1007
 81/469 [====>.........................] - ETA: 2s - loss: 8.6671 - accuracy: 0.1059
 92/469 [====>.........................] - ETA: 1s - loss: 7.9065 - accuracy: 0.1052
[36m(train_mnist pid=2626033)[0m Epoch 2/12
 11/469 [..............................] - ETA: 2s - loss: 2.3079 - accuracy: 0.0966
 31/469 [>.............................] - ETA: 2s - loss: 2.3064 - accuracy: 0.0988
 51/469 [==>...........................] - ETA: 2s - loss: 2.3061 - accuracy: 0.1048
 61/469 [==>...........................] - ETA: 2s - loss: 2.3061 - accuracy: 0.1031
 82/469 [====>.........................] - ETA: 2s - loss: 2.3057 - accuracy: 0.1023
103/469 [===

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2628692)[0m Epoch 1/12


[36m(train_mnist pid=2628692)[0m 2023-12-05 09:47:03.163106: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2628692)[0m 2023-12-05 09:47:04.169716: I external/local_xla/xla/service/service.cc:168] XLA service 0x148b7dd153a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2628692)[0m 2023-12-05 09:47:04.169742: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2628692)[0m 2023-12-05 09:47:04.175343: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2628692)[0m I0000 00:00:1701787624.290322 2629239 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/235 [..............................] - ETA: 8:56 - loss: 2.3047 - accuracy: 0.0859
 29/235 [==>...........................] - ETA: 1s - loss: 2.0288 - accuracy: 0.4535
 48/235 [=====>........................] - ETA: 1s - loss: 1.4468 - accuracy: 0.5966
[36m(train_mnist pid=2628692)[0m Epoch 2/12
  1/235 [..............................] - ETA: 1s - loss: 0.2874 - accuracy: 0.9062
 20/235 [=>............................] - ETA: 1s - loss: 0.2498 - accuracy: 0.9223
 39/235 [===>..........................] - ETA: 1s - loss: 0.2500 - accuracy: 0.9197
[36m(train_mnist pid=2628692)[0m Epoch 3/12
 11/235 [>.............................] - ETA: 1s - loss: 0.2161 - accuracy: 0.9215
 31/235 [==>...........................] - ETA: 1s - loss: 0.2087 - accuracy: 0.9309
 51/235 [=====>........................] - ETA: 0s - loss: 0.2074 - accuracy: 0.9320
[36m(train_mnist pid=2628692)[0m Epoch 4/12
  1/235 [..............................] - ETA: 1s - loss: 0.1574 - accuracy: 0.9492
 21/235 [=

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2630824)[0m Epoch 1/12


[36m(train_mnist pid=2630824)[0m 2023-12-05 09:47:28.158403: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2630824)[0m 2023-12-05 09:47:29.161821: I external/local_xla/xla/service/service.cc:168] XLA service 0x14b6e86d0b10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2630824)[0m 2023-12-05 09:47:29.161864: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2630824)[0m 2023-12-05 09:47:29.168858: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2630824)[0m I0000 00:00:1701787649.283083 2630978 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/469 [..............................] - ETA: 18:05 - loss: 2.3033 - accuracy: 0.0859
 21/469 [>.............................] - ETA: 2s - loss: 3.5186 - accuracy: 0.5409 
 43/469 [=>............................] - ETA: 2s - loss: 1.9969 - accuracy: 0.6919
 65/469 [===>..........................] - ETA: 2s - loss: 1.4598 - accuracy: 0.7563
 87/469 [====>.........................] - ETA: 1s - loss: 1.1749 - accuracy: 0.7915
 99/469 [=====>........................] - ETA: 1s - loss: 1.0663 - accuracy: 0.8066
[36m(train_mnist pid=2630824)[0m Epoch 2/12
 12/469 [..............................] - ETA: 2s - loss: 0.1673 - accuracy: 0.9525
 47/469 [==>...........................] - ETA: 1s - loss: 0.1754 - accuracy: 0.9480
 70/469 [===>..........................] - ETA: 1s - loss: 0.1733 - accuracy: 0.9485
 91/469 [====>.........................] - ETA: 1s - loss: 0.1700 - accuracy: 0.9491
[36m(train_mnist pid=2630824)[0m Epoch 3/12
  1/469 [..............................] - ETA: 2s - l

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2634070)[0m Epoch 1/12


[36m(train_mnist pid=2634070)[0m 2023-12-05 09:48:06.210753: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2634070)[0m 2023-12-05 09:48:07.172116: I external/local_xla/xla/service/service.cc:168] XLA service 0x15393c6d1b50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2634070)[0m 2023-12-05 09:48:07.172161: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2634070)[0m 2023-12-05 09:48:07.184929: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2634070)[0m I0000 00:00:1701787687.303039 2634428 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/469 [..............................] - ETA: 17:47 - loss: 2.3120 - accuracy: 0.0859
 25/469 [>.............................] - ETA: 1s - loss: 3.1347 - accuracy: 0.4759 
 61/469 [==>...........................] - ETA: 1s - loss: 1.6042 - accuracy: 0.6887
 87/469 [====>.........................] - ETA: 1s - loss: 1.2495 - accuracy: 0.7443
[36m(train_mnist pid=2634070)[0m Epoch 2/12
  1/469 [..............................] - ETA: 2s - loss: 0.5610 - accuracy: 0.8828
 27/469 [>.............................] - ETA: 1s - loss: 0.2812 - accuracy: 0.9161
 39/469 [=>............................] - ETA: 1s - loss: 0.2652 - accuracy: 0.9211
 64/469 [===>..........................] - ETA: 1s - loss: 0.2425 - accuracy: 0.9276
 89/469 [====>.........................] - ETA: 1s - loss: 0.2476 - accuracy: 0.9257
[36m(train_mnist pid=2634070)[0m Epoch 3/12
  1/469 [..............................] - ETA: 2s - loss: 0.2284 - accuracy: 0.9375
 14/469 [..............................] - ETA: 1s - l

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2636148)[0m Epoch 1/12


[36m(train_mnist pid=2636148)[0m 2023-12-05 09:48:40.088592: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2636148)[0m 2023-12-05 09:48:41.057778: I external/local_xla/xla/service/service.cc:168] XLA service 0x14781830c0c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2636148)[0m 2023-12-05 09:48:41.057822: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2636148)[0m 2023-12-05 09:48:41.067655: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2636148)[0m I0000 00:00:1701787721.181994 2636467 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/469 [..............................] - ETA: 17:37 - loss: 2.3049 - accuracy: 0.1094
 19/469 [>.............................] - ETA: 2s - loss: 9.0065 - accuracy: 0.5058  
 39/469 [=>............................] - ETA: 2s - loss: 4.6923 - accuracy: 0.6705
 58/469 [==>...........................] - ETA: 2s - loss: 3.2901 - accuracy: 0.7380
 77/469 [===>..........................] - ETA: 2s - loss: 2.5732 - accuracy: 0.7751
 97/469 [=====>........................] - ETA: 2s - loss: 2.1118 - accuracy: 0.8012
106/469 [=====>........................] - ETA: 2s - loss: 1.9586 - accuracy: 0.8109
[36m(train_mnist pid=2636148)[0m Epoch 2/12
 11/469 [..............................] - ETA: 2s - loss: 0.2129 - accuracy: 0.9354
 30/469 [>.............................] - ETA: 2s - loss: 0.2311 - accuracy: 0.9328
 50/469 [==>...........................] - ETA: 2s - loss: 0.2306 - accuracy: 0.9291
 69/469 [===>..........................] - ETA: 2s - loss: 0.2218 - accuracy: 0.9310
 88/469 [====>

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2638393)[0m Epoch 1/12


[36m(train_mnist pid=2638393)[0m 2023-12-05 09:49:21.264479: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2638393)[0m 2023-12-05 09:49:22.258548: I external/local_xla/xla/service/service.cc:168] XLA service 0x152f05bfddc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2638393)[0m 2023-12-05 09:49:22.258591: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2638393)[0m 2023-12-05 09:49:22.269470: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2638393)[0m I0000 00:00:1701787762.391399 2638606 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


 12/469 [..............................] - ETA: 2s - loss: 3.0743 - accuracy: 0.2220   
 36/469 [=>............................] - ETA: 1s - loss: 1.8649 - accuracy: 0.4631
 49/469 [==>...........................] - ETA: 1s - loss: 1.5739 - accuracy: 0.5351
 73/469 [===>..........................] - ETA: 1s - loss: 1.2793 - accuracy: 0.6153
 97/469 [=====>........................] - ETA: 1s - loss: 1.1088 - accuracy: 0.6637
[36m(train_mnist pid=2638393)[0m Epoch 2/12
  1/469 [..............................] - ETA: 2s - loss: 0.2431 - accuracy: 0.9297
 28/469 [>.............................] - ETA: 1s - loss: 0.3722 - accuracy: 0.8823
 53/469 [==>...........................] - ETA: 1s - loss: 0.3785 - accuracy: 0.8818
 78/469 [===>..........................] - ETA: 1s - loss: 0.3808 - accuracy: 0.8815
105/469 [=====>........................] - ETA: 1s - loss: 0.3789 - accuracy: 0.8836
[36m(train_mnist pid=2638393)[0m Epoch 3/12
  1/469 [..............................] - ETA: 1s - lo

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2641581)[0m Epoch 1/12


[36m(train_mnist pid=2641581)[0m 2023-12-05 09:49:55.205359: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2641581)[0m 2023-12-05 09:49:56.172718: I external/local_xla/xla/service/service.cc:168] XLA service 0x145915ce0780 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2641581)[0m 2023-12-05 09:49:56.172760: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2641581)[0m 2023-12-05 09:49:56.179660: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2641581)[0m I0000 00:00:1701787796.293462 2641734 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


 11/938 [..............................] - ETA: 4s - loss: 68.8632 - accuracy: 0.1094  
 35/938 [>.............................] - ETA: 4s - loss: 23.2338 - accuracy: 0.1112
 59/938 [>.............................] - ETA: 4s - loss: 14.7219 - accuracy: 0.1110
 83/938 [=>............................] - ETA: 3s - loss: 11.1315 - accuracy: 0.1101
107/938 [==>...........................] - ETA: 3s - loss: 9.1512 - accuracy: 0.1097 
130/938 [===>..........................] - ETA: 3s - loss: 7.9430 - accuracy: 0.1053
154/938 [===>..........................] - ETA: 3s - loss: 7.0644 - accuracy: 0.1063
165/938 [====>.........................] - ETA: 3s - loss: 6.7478 - accuracy: 0.1065
189/938 [=====>........................] - ETA: 3s - loss: 6.1840 - accuracy: 0.1064
213/938 [=====>........................] - ETA: 3s - loss: 5.7473 - accuracy: 0.1055
[36m(train_mnist pid=2641581)[0m Epoch 2/12
 13/938 [..............................] - ETA: 4s - loss: 2.3094 - accuracy: 0.0962
 38/938 [>..

[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

[36m(train_mnist pid=2645324)[0m Epoch 1/12


[36m(train_mnist pid=2645324)[0m 2023-12-05 09:50:57.172051: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
[36m(train_mnist pid=2645324)[0m 2023-12-05 09:50:58.147996: I external/local_xla/xla/service/service.cc:168] XLA service 0x1499f830bc80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
[36m(train_mnist pid=2645324)[0m 2023-12-05 09:50:58.148045: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
[36m(train_mnist pid=2645324)[0m 2023-12-05 09:50:58.157088: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
[36m(train_mnist pid=2645324)[0m I0000 00:00:1701787858.281410 2645459 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1/469 [..............................] - ETA: 17:50 - loss: 2.2962 - accuracy: 0.1406
 24/469 [>.............................] - ETA: 1s - loss: 4.3634 - accuracy: 0.1175 
 49/469 [==>...........................] - ETA: 1s - loss: 3.3118 - accuracy: 0.1134
 75/469 [===>..........................] - ETA: 1s - loss: 2.9613 - accuracy: 0.1124
 88/469 [====>.........................] - ETA: 1s - loss: 2.8648 - accuracy: 0.1101
[36m(train_mnist pid=2645324)[0m Epoch 2/12
 15/469 [..............................] - ETA: 1s - loss: 2.3090 - accuracy: 0.1036
 42/469 [=>............................] - ETA: 1s - loss: 2.3056 - accuracy: 0.1031
 68/469 [===>..........................] - ETA: 1s - loss: 2.3051 - accuracy: 0.1043
 94/469 [=====>........................] - ETA: 1s - loss: 2.3036 - accuracy: 0.1098
[36m(train_mnist pid=2645324)[0m Epoch 3/12
 14/469 [..............................] - ETA: 1s - loss: 2.3068 - accuracy: 0.0887
 38/469 [=>............................] - ETA: 1s - l

2023-12-05 09:51:25,501	INFO tune.py:1047 -- Total run time: 425.57 seconds (425.53 seconds for the tuning loop).


In [33]:
best_trial = bayes_analysis.get_best_trial("mean_accuracy", "max", "last")
best_config = best_trial.config

print("Best trial config: {}".format(best_config))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["mean_accuracy"]))
print("Time taken for Bayesian Search: {} seconds".format(bayes_time))

Best trial config: {'conv_filters': 164.75323487338966, 'lr': 0.029831684879606152, 'batch_size': 0.6084844859190754, 'dropout': 0.43194501864211576}
Best trial final validation accuracy: 0.9726999998092651
Time taken for Bayesian Search: 425.58768105506897 seconds


### Hyperband 

In [4]:
hyperband_search_space = {
    "conv_filters": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(0.001, 0.1),
    "batch_size": tune.choice([64, 128, 256]),
    "dropout": tune.uniform(0, 1)
}

In [5]:
from ray.tune.schedulers import HyperBandScheduler

# Initialize Ray
ray.init(ignore_reinit_error=True)

# Start time
start_time = time()

# Define the HyperBandScheduler with metric and mode
hyperband_scheduler = HyperBandScheduler(metric="mean_accuracy", mode="max")

# Run Hyperband optimization without metric and mode in tune.run()
hyperband_analysis = tune.run(
    train_mnist,
    name="hyperband_exp",
    stop={"mean_accuracy": 0.99, "training_iteration": 10},
    resources_per_trial={"gpu": 1},
    config=hyperband_search_space,
    num_samples=20,  # Number of different hyperparameter combinations to try
    scheduler=hyperband_scheduler
)

# End time
end_time = time()
hyperband_time = end_time - start_time

2023-12-05 11:54:08,350	INFO worker.py:1507 -- Calling ray.init() again after it has already been called.
2023-12-05 11:54:08,354	INFO tune.py:586 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
2023-12-05 11:54:08,367	INFO tensorboardx.py:178 -- pip install "ray[tune]" to see TensorBoard files.


0,1
Current time:,2023-12-05 12:14:18
Running for:,00:20:10.30
Memory:,56.3/377.3 GiB

Trial name,status,loc,batch_size,conv_filters,dropout,lr,acc,iter,total time (s)
train_mnist_e8192_00000,TERMINATED,10.32.35.81:3103815,256,128,0.624152,0.00310254,0.9865,10,49.5617
train_mnist_e8192_00001,TERMINATED,10.32.35.81:3106917,256,256,0.31604,0.00198681,0.9887,10,37.5665
train_mnist_e8192_00002,TERMINATED,10.32.35.81:3109579,128,64,0.763749,0.0788753,0.098,10,35.3228
train_mnist_e8192_00003,TERMINATED,10.32.35.81:3111666,64,256,0.781244,0.00525434,0.9839,10,58.1243
train_mnist_e8192_00004,TERMINATED,10.32.35.81:3115598,64,64,0.550632,0.0263207,0.9527,10,54.4922
train_mnist_e8192_00005,TERMINATED,10.32.35.81:3119875,64,256,0.52224,0.0551661,0.1009,10,58.0458
train_mnist_e8192_00006,TERMINATED,10.32.35.81:3123659,128,64,0.373677,0.00210428,0.9888,10,35.4678
train_mnist_e8192_00007,TERMINATED,10.32.35.81:3126529,256,64,0.739291,0.0239357,0.9447,10,26.2396
train_mnist_e8192_00008,TERMINATED,10.32.35.81:3128058,64,128,0.293566,0.00765962,0.984,10,54.5348
train_mnist_e8192_00009,TERMINATED,10.32.35.81:3131728,64,128,0.343429,0.0293293,0.9528,10,54.5154


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

Trial name,mean_accuracy
train_mnist_e8192_00000,0.9865
train_mnist_e8192_00001,0.9887
train_mnist_e8192_00002,0.098
train_mnist_e8192_00003,0.9839
train_mnist_e8192_00004,0.9527
train_mnist_e8192_00005,0.1009
train_mnist_e8192_00006,0.9888
train_mnist_e8192_00007,0.9447
train_mnist_e8192_00008,0.984
train_mnist_e8192_00009,0.9528


[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in module_from_spec
[33m(raylet)[0m   AttributeError: 'NoneType' object has no attribute 'loader'
[33m(raylet)[0m 
[33m(raylet)[0m Remainder of file ignored
[33m(raylet)[0m Error processing line 3 of /home/gha2009/.local/lib/python3.11/site-packages/googleapis_common_protos-1.61.0-py3.9-nspkg.pth:
[33m(raylet)[0m 
[33m(raylet)[0m   Traceback (most recent call last):
[33m(raylet)[0m     File "<frozen site>", line 186, in addpackage
[33m(raylet)[0m     File "<string>", line 1, in <module>
[33m(raylet)[0m     File "<frozen importlib._bootstrap>", line 570, in modu

In [6]:
best_trial = hyperband_analysis.get_best_trial("mean_accuracy", "max", "last")
best_config = best_trial.config

print("Best trial config: {}".format(best_config))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["mean_accuracy"]))
print("Time taken for Hyperband Search: {} seconds".format(hyperband_time))

Best trial config: {'conv_filters': 64, 'lr': 0.0021042765420631734, 'batch_size': 128, 'dropout': 0.3736771928672201}
Best trial final validation accuracy: 0.9887999892234802
Time taken for Hyperband Search: 1210.3595538139343 seconds


## 3.2

In [8]:
def display_results(analysis, technique_name, time_taken):
    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_config = best_trial.config
    best_accuracy = best_trial.last_result["mean_accuracy"]

    print(f"{technique_name} Results:")
    print(f"Time Taken: {time_taken:.2f} seconds")
    print(f"Best Hyperparameters: {best_config}")
    print(f"Best Validation Accuracy: {best_accuracy:.4f}")
    print("-" * 30)

In [14]:
# Display results for Grid Search
display_results(grid_analysis, "Grid Search", grid_time)

Grid Search Results:
Time Taken: 7111.17 seconds
Best Hyperparameters: {'conv_filters': 64, 'lr': 0.001, 'batch_size': 256, 'dropout': 0.25}
Best Validation Accuracy: 0.9894
------------------------------


In [36]:
# Display results for Bayesian Search
display_results(bayes_analysis, "Bayesian Search", bayes_time)

Bayesian Search Results:
Time Taken: 425.59 seconds
Best Hyperparameters: {'conv_filters': 164.75323487338966, 'lr': 0.029831684879606152, 'batch_size': 0.6084844859190754, 'dropout': 0.43194501864211576}
Best Validation Accuracy: 0.9727
------------------------------


In [9]:
# Display results for Hyperband
display_results(hyperband_analysis, "Hyperband", hyperband_time)

Hyperband Results:
Time Taken: 1210.36 seconds
Best Hyperparameters: {'conv_filters': 64, 'lr': 0.0021042765420631734, 'batch_size': 128, 'dropout': 0.3736771928672201}
Best Validation Accuracy: 0.9888
------------------------------


## 3.3

**Answer:**

Analyzing your results, several key observations can be made regarding the time taken for hyperparameter optimization and the performance of the best model identified by each search method:

Grid Search took the longest time (7111.17 seconds) among the three methods. This is expected as grid search exhaustively explores the defined hyperparameter space, which can be time-consuming, especially for large search spaces. The best model achieved a high validation accuracy of 0.9894. This indicates that despite its time-consuming nature, grid search can be effective in finding well-performing hyperparameters.

Bayesian Search method was significantly faster (425.59 seconds), demonstrating Bayesian search's efficiency in navigating the hyperparameter space by building a probabilistic model and using it to select the most promising hyperparameters. The best model had a lower validation accuracy (0.9727) compared to grid search. This could be due to the Bayesian search exploring more diverse regions of the hyperparameter space, which might lead to finding good but not necessarily the best parameters within the limited number of trials.

Hyperband was faster than grid search but slower than Bayesian search (1210.36 seconds). Hyperband, being a bandit-based approach, efficiently allocates resources to promising configurations and quickly discards poor-performing ones, which can lead to time savings. The best model identified by Hyperband achieved a validation accuracy of 0.9888, which is very close to Grid Search yet better than Bayesian Search. This suggests a good balance between exploration and exploitation in Hyperband's approach.

In general, there is a clear trade-off between the time taken for hyperparameter optimization and the performance of the best model. Grid search, while time-consuming, found the most accurate model. Bayesian search, despite being much faster, compromised slightly on accuracy. Bayesian search and Hyperband offer more efficient hyperparameter tuning compared to grid search, though they may require careful tuning of their own parameters to achieve the best results. Overall, the choice of method depends on the specific requirements and constraints of the project goal. If computational resources and time are limited, Bayesian search or Hyperband might be more suitable. If the highest possible accuracy is paramount and resources are ample, grid search might be the better choice. Regardless, all accuracy scores are high and very close to each other, so the specific case in this scenario depends more. 