# Testing the speed of GPU compared to CPU for Conv nets

In [1]:
import os

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.client import device_lib
import tensorflow as tf
import os

import time

Using TensorFlow backend.


In [2]:
# Enumerate the compute devices TensorFlow can see (CPU and any GPUs)
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7146110755515451855
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3149044121
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9288180566016982544
physical_device_desc: "device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [3]:
# Load the MNIST train/test splits (image arrays plus integer class labels)
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Keep each split's dimensions around before the channel axis is added
n_train, ndim1_train, ndim2_train = X_train.shape
n_test, ndim1_test, ndim2_test = X_test.shape

# Append a single trailing channel axis so the arrays match the
# (28, 28, 1) input_shape declared on the first Conv2D layer
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

# One-hot encode the target labels
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

In [4]:
# Cast the pixel arrays to float32 and rescale them by 1/255 into the 0-1 range
# (Alternatively, we could apply BatchNorm).
# NOTE: this cell overwrites X_train / X_test, so it is not idempotent --
# re-running it without reloading the data rescales the arrays a second time.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

In [5]:
# Convolutional neural network, assembled as a Keras Sequential model.
# Every hidden layer shares the same activation and weight initializer.
hidden_kwargs = dict(activation='relu', kernel_initializer='he_uniform')

model = Sequential([
    # Feature extractor: conv -> pool -> conv -> conv -> pool
    Conv2D(32, (3, 3), input_shape=(28, 28, 1), **hidden_kwargs),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), **hidden_kwargs),
    Conv2D(64, (3, 3), **hidden_kwargs),
    MaxPooling2D((2, 2)),
    # Classifier head: flatten -> dense -> 10-way softmax
    Flatten(),
    Dense(100, **hidden_kwargs),
    Dense(10, activation='softmax'),
])

# SGD with momentum as the optimizer
opt = SGD(lr=0.01, momentum=0.9)

model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the per-layer output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 9, 9, 64)          36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 4, 4, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)              

# Training with GPU

In [10]:
# Time two epochs of training. No explicit device scope is used here, so
# TensorFlow decides where the ops are placed.
start_time = time.time()

model.fit(X_train, y_train, epochs=2, batch_size=32)

# Elapsed wall-clock time is measured from start_time, the only start
# timestamp this cell defines, so the cell also runs on a fresh kernel.
t2 = time.time()
total_time = t2 - start_time

print('Total running time: {}'.format(total_time))

Epoch 1/2
Epoch 2/2
Total running time: 25.25180220603943


In [30]:
# Evaluate on the held-out test set. The model was compiled with one loss and
# metrics=['accuracy'], so evaluate() returns [loss, accuracy].
results = model.evaluate(X_test, y_test, batch_size=128)

# Report the metrics so the notebook records them instead of discarding them
test_loss, test_accuracy = results
print('Test loss: {:.4f} - Test accuracy: {:.4f}'.format(test_loss, test_accuracy))



# Training with CPU

In [9]:
# Time two epochs of training inside an explicit CPU device scope.
# NOTE(review): this reuses the same `model` object as the other timed run, so
# whichever run executes second starts from already-trained weights. Also
# confirm that wrapping fit() in tf.device actually moves the training ops to
# the CPU for the Keras/TensorFlow version in use.
start_time = time.time()

with tf.device("cpu:0"):
    model.fit(X_train, y_train, epochs=2, batch_size=32)

# Elapsed wall-clock time is measured from start_time, the only start
# timestamp this cell defines, so the cell also runs on a fresh kernel.
t2 = time.time()
total_time = t2 - start_time

print('Total running time: {}'.format(total_time))

Epoch 1/2
Epoch 2/2
Total running time: 124.67148756980896


In this run, training on the GPU was around 5 times faster than on the CPU (about 25 s vs. about 125 s for 2 epochs).