## Ajuste del rendimiento de la CPU
Empezamos usando el modelo tal y como lo definimos y lo entrenamos en el código de referencia: `OneNodeResnet50TrainingTF.py`.

In [3]:
import tensorflow as tf
import os

# Start from a clean Keras state, then size TensorFlow's CPU thread pools.
# NOTE(review): TensorFlow rejects thread-pool changes once its runtime has
# been initialized, so keep these calls ahead of any op or model creation.
tf.keras.backend.clear_session()
# Inter-op pool: number of threads used to run independent ops concurrently.
tf.config.threading.set_inter_op_parallelism_threads(1)
# Intra-op pool: number of threads a single op may use internally.
# presumably 32 matches the core count of the benchmark machine -- confirm.
tf.config.threading.set_intra_op_parallelism_threads(32)
# Optional knobs left disabled: XLA JIT compilation and the OpenMP thread count.
#tf.config.optimizer.set_jit(True)
#os.environ["TF_XLA_FLAGS"]="--tf_xla_cpu_global_jit"
#os.environ["OMP_NUM_THREADS"]="32"

import tensorflow.keras as K

def preprocess_data(X, Y):
    """
    Prepare CIFAR 10 images and labels for a ResNet50-based classifier.

    :param X: X is a numpy.ndarray of shape (m, 32, 32, 3) containing the
    CIFAR 10 images, where m is the number of data points
    :param Y: Y is a numpy.ndarray containing the CIFAR 10 integer class
    labels for X
    :return: images, one_hot_labels
        images is X after the Keras ResNet50 input preprocessing
        one_hot_labels is Y one-hot encoded over the 10 CIFAR 10 classes
    """
    # The two conversions are independent of each other.
    one_hot_labels = K.utils.to_categorical(Y, 10)
    images = K.applications.resnet50.preprocess_input(X)
    return images, one_hot_labels

# Load the CIFAR-10 train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = K.datasets.cifar10.load_data()
# Debug aid: shapes before preprocessing.
#print((x_train.shape, y_train.shape))
# Apply the ResNet50 input preprocessing and one-hot encode the labels
# (10 classes) on both splits; the raw arrays are replaced in place.
x_train, y_train = preprocess_data(x_train, y_train)
x_test, y_test = preprocess_data(x_test, y_test)
# Debug aid: shapes after preprocessing.
#print((x_train.shape, y_train.shape))

# ImageNet-pretrained ResNet50 backbone without its classification head,
# attached to a 32x32 RGB input tensor.
input_t = K.Input(shape=(32, 32, 3))
res_model = K.applications.ResNet50(
    input_tensor=input_t,
    weights="imagenet",
    include_top=False,
)

# Freeze the first 143 layers of the backbone; the remaining ones stay
# trainable.
for frozen_layer in res_model.layers[:143]:
    frozen_layer.trainable = False
# Uncomment to check which layers ended up frozen:
#for i, layer in enumerate(res_model.layers):
#    print(i, layer.name, "-", layer.trainable)

# Target size for the resize layer. A (224, 224) alternative was left
# disabled by the original author:
# to_res = (224, 224)
to_res = (32, 32)

model = K.models.Sequential()
# Resize incoming images to `to_res` before handing them to the backbone.
model.add(K.layers.Lambda(lambda image: tf.image.resize(image, to_res)))
model.add(res_model)
model.add(K.layers.Flatten())
# Three hidden stages, each BatchNorm -> Dense(ReLU) -> Dropout(0.5) ...
for hidden_units in (256, 128, 64):
    model.add(K.layers.BatchNormalization())
    model.add(K.layers.Dense(hidden_units, activation='relu'))
    model.add(K.layers.Dropout(0.5))
# ... followed by a last BatchNorm and the 10-way softmax output.
model.add(K.layers.BatchNormalization())
model.add(K.layers.Dense(10, activation='softmax'))
# Keep only the best model seen so far, judged by validation accuracy.
# The monitored key must match the name Keras logs for the metric: with
# metrics=['accuracy'] under TF 2.x the validation key is "val_accuracy".
# The previous "val_acc" key is never logged, so with save_best_only=True
# the checkpoint was skipped (with only a warning) and no file was written.
check_point = K.callbacks.ModelCheckpoint(
    filepath="cifar10.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

# One-hot labels -> categorical cross-entropy; RMSprop with a small learning
# rate; 'accuracy' is the metric the checkpoint above monitors.
model.compile(
    loss='categorical_crossentropy',
    optimizer=K.optimizers.RMSprop(learning_rate=2e-5),
    metrics=['accuracy'],
)

Ahora probamos la influencia de ir variando el batch_size

In [4]:
    
    # Batch-size sweep: train for one epoch with the batch size given below.
    # Other values to try by editing batch_size (all remaining arguments
    # stay the same): 4, 8, 16, 32, 64.
    history = model.fit(
        x_train,
        y_train,
        batch_size=128,
        epochs=1,
        verbose=1,
        validation_data=(x_test, y_test),
        callbacks=[check_point],
    )
                    



Probamos a usar un número creciente de hilos, modificando su valor a través de la variable de entorno `OMP_NUM_THREADS`.

In [1]:
# Run the reference script once per OMP_NUM_THREADS value (4, 8, 16, 32).
# NOTE(review): the script may also size TensorFlow's own thread pools --
# confirm OMP_NUM_THREADS is what actually limits the threads used.
!OMP_NUM_THREADS=4 python OneNodeResnet50TrainingTF.py
!OMP_NUM_THREADS=8 python OneNodeResnet50TrainingTF.py
!OMP_NUM_THREADS=16 python OneNodeResnet50TrainingTF.py
!OMP_NUM_THREADS=32 python OneNodeResnet50TrainingTF.py

ERROR: ld.so: object ' ' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object ' ' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
2022-12-06 08:30:06.133032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
((50000, 32, 32, 3), (50000, 1))
((50000, 32, 32, 3), (50000, 10))
2022-12-06 08:30:08.964015: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow wi

Probamos a utilizar la variable `KMP_AFFINITY` para controlar la afinidad de los hilos a los núcleos (cores).

In [2]:
# Same run with OMP_NUM_THREADS=32, adding a KMP_AFFINITY setting for the
# thread-to-core binding.
# NOTE(review): KMP_AFFINITY is read by the Intel OpenMP runtime -- confirm
# this TensorFlow build actually uses it.
!KMP_AFFINITY="granularity=fine,compact,1,0" OMP_NUM_THREADS=32 python OneNodeResnet50TrainingTF.py

ERROR: ld.so: object ' ' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object ' ' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
2022-12-06 08:36:25.465648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
((50000, 32, 32, 3), (50000, 1))
((50000, 32, 32, 3), (50000, 10))
2022-12-06 08:36:27.792887: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow wi