In [1]:
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")

# import the necessary packages
from sklearn.preprocessing import LabelBinarizer
from pyimagesearch.nn.conv import ResNet
# from pyimagesearch.callbacks import EpochCheckpoint
from pyimagesearch.callbacks import TrainingMonitor
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
import numpy as np
import argparse
import sys

2024-12-05 23:19:20.277088: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-05 23:19:20.374345: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733422760.411964    4355 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733422760.422531    4355 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 23:19:20.514697: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:

# set a high recursion limit so Theano doesn’t complain
sys.setrecursionlimit(5000)

In [3]:
# load the training and testing data, converting the images from
# integers to floats
print("[INFO] loading CIFAR-10 data...")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float")
testX = testX.astype("float")


[INFO] loading CIFAR-10 data...


In [4]:

# apply mean subtraction to the data
mean = np.mean(trainX, axis=0)
trainX -= mean
testX -= mean

# convert the labels from integers to vectors
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# construct the image generator for data augmentation
aug = ImageDataGenerator(width_shift_range=0.1,
                         height_shift_range=0.1,
                         horizontal_flip=True,
                         fill_mode="nearest")

In [5]:
args = {"model": None, "start_epoch": 1}
if args["model"] is None:
    print("[INFO] compiling model...")
    opt = SGD(learning_rate=1e-1)
    model = ResNet.build(32, 32, 3, 10, (9, 9, 9),
                         (64, 64, 128, 256), reg=0.0005)
    model.compile(loss="categorical_crossentropy", 
                  optimizer=opt,metrics=["accuracy"])


# otherwise, load the checkpoint from disk
else:
    print("[INFO] loading {}...".format(args["model"]))
    model = load_model(args["model"])

    # update the learning rate
    print("[INFO] old learning rate: {}".format( K.get_value(model.optimizer.lr)))
   
    K.set_value(model.optimizer.lr, 1e-5)
    print("[INFO] new learning rate: {}".format( K.get_value(model.optimizer.lr)))
   

[INFO] compiling model...


I0000 00:00:1733422765.464596    4355 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6070 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [6]:

# construct the set of callbacks
callbacks = [
    # EpochCheckpoint(args["checkpoints"], every=5,
    #                 startAt=args["start_epoch"]),
    TrainingMonitor("output/resnet56_cifar10.png",
                jsonPath="output/resnet56_cifar10.json",
                startAt=args.get("start_epoch", 0))  # Default to 0 if not set

                    ]


In [7]:
import os

# Create the directory if it doesn't exist
os.makedirs("output", exist_ok=True)

In [None]:
# train the network
print("[INFO] training network...")
model.fit(
    aug.flow(trainX, trainY, batch_size=64),
    validation_data=(testX, testY),
    steps_per_epoch=len(trainX) // 128, epochs=10,
    callbacks=callbacks, verbose=1)

[INFO] training network...
Epoch 1/10


  self._warn_if_super_not_called()
I0000 00:00:1733422773.818247    4762 service.cc:148] XLA service 0x7cb70c0036e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733422773.819999    4762 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2024-12-05 23:19:34.182228: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1733422775.220677    4762 cuda_dnn.cc:529] Loaded cuDNN version 90300
2024-12-05 23:19:35.790604: W external/local_xla/xla/service/gpu/nvptx_compiler.cc:930] The NVIDIA driver's CUDA version is 12.4 which is older than the PTX compiler version 12.5.82. Because the driver is older than the PTX compiler version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forw

[1m  3/390[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 61ms/step - accuracy: 0.0998 - loss: 2.8938 

I0000 00:00:1733422784.800039    4762 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 109ms/step - accuracy: 0.2536 - loss: 2.4461 - val_accuracy: 0.4034 - val_loss: 2.0992
Epoch 2/10
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 71ms/step - accuracy: 0.4176 - loss: 2.0427 - val_accuracy: 0.4496 - val_loss: 1.9742
Epoch 3/10
[1m  2/390[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m19s[0m 51ms/step - accuracy: 0.4570 - loss: 1.9089



[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.4609 - loss: 1.9033 - val_accuracy: 0.4806 - val_loss: 1.9089
Epoch 4/10
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 80ms/step - accuracy: 0.4864 - loss: 1.8644 - val_accuracy: 0.5006 - val_loss: 1.9145
Epoch 5/10
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 109ms/step - accuracy: 0.5429 - loss: 1.7165 - val_accuracy: 0.5768 - val_loss: 1.6299
Epoch 6/10
[1m  2/390[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m31s[0m 80ms/step - accuracy: 0.6602 - loss: 1.4018



[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6640 - loss: 1.4073 - val_accuracy: 0.5808 - val_loss: 1.6027
Epoch 7/10
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 120ms/step - accuracy: 0.5980 - loss: 1.5888 - val_accuracy: 0.5964 - val_loss: 1.6305
Epoch 8/10
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 93ms/step - accuracy: 0.6371 - loss: 1.4761 - val_accuracy: 0.6401 - val_loss: 1.5000
Epoch 9/10
[1m  2/390[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m39s[0m 101ms/step - accuracy: 0.6133 - loss: 1.4188

