In [1]:
# suppress tensorflow logging, usually not useful unless you are having problems with tensorflow or accessing the gpu
# it seems necessary to have this environment variable set before tensorflow is imported, or else it doesn't take effect
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Check GPU Status / Availability

On Linux systems you can and should use the `nvidia-smi` tool to check that the GPU is visible, is active, and has drivers installed.  You can run the command from a terminal, or from a notebook cell as shown in the following cell.

I also find the following commands useful for monitoring GPU performance from the command line:

```
# use watch so basic nvidia-smi redraws at top of screen each second
$ watch -n 1 nvidia-smi

# nvtop is basically like top for nvidia gpu
$ sudo apt install nvtop
$ nvtop

# nvitop is similar, gives about same information, but some may prefer this one
$ sudo apt install nvitop
$ nvitop
```

In [2]:
!nvidia-smi

Thu May 15 21:59:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07             Driver Version: 570.133.07     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro M2200                   Off |   00000000:01:00.0  On |                  N/A |
| N/A   63C    P0            N/A  /   80W |     590MiB /   4096MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# GPU to TF/Keras Availability

We can check that tensorflow recognizes the presence of a GPU device as follows.

In [3]:
# First line: every physical device TensorFlow can see (CPUs as well as GPUs).
# Second line: the number of those devices whose type is 'GPU'.
print('Available Devices : ', tf.config.list_physical_devices())
print('Num GPUs Available: ', len(tf.config.list_physical_devices('GPU')))

Available Devices :  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Num GPUs Available:  1


# Test Training Performance with only CPU

We can compare performance with and without the GPU.  First we will train a model using only the CPU and record the time it takes to train the model.  We just train a simple dense NN on the MNIST data to test
performance on fitting a model.

In [4]:
# handle on the MNIST handwritten-digit dataset that ships with keras
mnist = tf.keras.datasets.mnist

# load the predefined train / test split (images plus integer class labels)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# rescale the images by 1/255; MNIST pixels are 8-bit grayscale, so this
# brings the inputs into the [0, 1] range
x_train = x_train / 255.0
x_test = x_test / 255.0

# labels are plain integers and the models below end in a Dense layer with no
# activation, so the loss must treat the model outputs as raw logits
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Baseline network, created inside a CPU device scope.
# The input shape is declared with an explicit Input object as the first entry
# of the Sequential model; passing `input_shape` to the first layer instead
# makes current Keras versions emit a UserWarning.
with tf.device('/CPU:0'):
    model_cpu_base = tf.keras.models.Sequential([
        tf.keras.Input(shape=(28, 28)),                    # one 28x28 image per sample
        tf.keras.layers.Flatten(),                         # 28x28 image -> flat vector
        tf.keras.layers.Dense(4096, activation='relu'),    # single wide hidden layer
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Dense(10)                          # raw logits (no softmax), see loss_fn
    ])


  super().__init__(**kwargs)


In [5]:
def fit_cpu():
    """Train a fresh copy of `model_cpu_base` on the CPU for 5 epochs.

    Everything runs inside a `/CPU:0` device scope.  The base model is cloned,
    the clone is compiled with the adam optimizer, the shared `loss_fn` and an
    accuracy metric, and then fitted on `x_train` / `y_train`.  A 70 character
    separator line is printed once training finishes so that repeated runs
    are easy to tell apart in the cell output.

    Returns:
        None.  The trained clone is discarded when the function returns.
    """
    with tf.device('/CPU:0'):
        # clone_model rebuilds the layers, so (per the Keras docs) every call
        # starts from newly initialized weights rather than continuing from a
        # previous run
        cpu_clone = tf.keras.models.clone_model(model_cpu_base)

        compile_settings = dict(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
        cpu_clone.compile(**compile_settings)

        cpu_clone.fit(x_train, y_train, epochs=5)
        print('=' * 70)
        
%timeit fit_cpu()

Epoch 1/5


I0000 00:00:1747346397.538393    8462 service.cc:145] XLA service 0x78408c00a210 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747346397.538428    8462 service.cc:153]   StreamExecutor device (0): Host, Default Version


[1m   9/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m26s[0m 14ms/step - accuracy: 0.3885 - loss: 1.8857

I0000 00:00:1747346397.969420    8462 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 12ms/step - accuracy: 0.9083 - loss: 0.3050
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.9695 - loss: 0.0975
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.9789 - loss: 0.0684
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.9830 - loss: 0.0534
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.9867 - loss: 0.0438
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.9045 - loss: 0.3044
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 12ms/step - accuracy: 0.9699 - loss: 0.0942
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 12ms/step - accuracy: 0.9787 - loss: 0.0674
Epoch 4/5
[1m1875/1875[0

# Test Training Performance with GPU

Now build the same model and fit it on the GPU device.

In [6]:
# Recreate the same architecture as a base model inside a GPU device scope.
# It is not compiled here; fit_gpu() clones and compiles it on every run.
# The input shape is declared with an explicit Input object as the first entry
# of the Sequential model; passing `input_shape` to the first layer instead
# makes current Keras versions emit a UserWarning.
with tf.device('/GPU:0'):
    model_gpu_base = tf.keras.models.Sequential([
        tf.keras.Input(shape=(28, 28)),                    # one 28x28 image per sample
        tf.keras.layers.Flatten(),                         # 28x28 image -> flat vector
        tf.keras.layers.Dense(4096, activation='relu'),    # single wide hidden layer
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Dense(10)                          # raw logits (no softmax), see loss_fn
    ])


In [7]:
def fit_gpu():
    """Train a fresh copy of `model_gpu_base` on the GPU for 5 epochs.

    Everything runs inside a `/GPU:0` device scope.  The base model is cloned,
    the clone is compiled with the adam optimizer, the shared `loss_fn` and an
    accuracy metric, and then fitted on `x_train` / `y_train`.  A 70 character
    separator line is printed once training finishes so that repeated runs
    are easy to tell apart in the cell output.

    Returns:
        None.  The trained clone is discarded when the function returns.
    """
    with tf.device('/GPU:0'):
        # clone_model rebuilds the layers, so (per the Keras docs) every call
        # starts from newly initialized weights rather than continuing from a
        # previous run
        gpu_clone = tf.keras.models.clone_model(model_gpu_base)

        compile_settings = dict(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
        gpu_clone.compile(**compile_settings)

        gpu_clone.fit(x_train, y_train, epochs=5)
        print('=' * 70)

%timeit fit_gpu()

Epoch 1/5


I0000 00:00:1747347272.645629    8457 service.cc:145] XLA service 0x7840a1abd3f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747347272.645647    8457 service.cc:153]   StreamExecutor device (0): Quadro M2200, Compute Capability 5.2


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9036 - loss: 0.3053
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9695 - loss: 0.0963
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9773 - loss: 0.0722
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9820 - loss: 0.0591
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9857 - loss: 0.0457
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9049 - loss: 0.3106
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9683 - loss: 0.0998
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9781 - loss: 0.0676
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━