In [1]:
def naive_relu(x):
    """Element-wise ReLU of a 2-D array, written with explicit Python loops.

    Parameters
    ----------
    x : array-like object with exactly two dimensions (must expose
        ``shape`` and ``copy``), e.g. a NumPy matrix.

    Returns
    -------
    A copy of ``x`` in which every negative entry has been replaced by 0.
    The argument itself is never modified.

    Raises
    ------
    AssertionError
        If ``x`` is not two-dimensional.
    """
    assert len(x.shape) == 2

    out = x.copy()  # work on a copy so the caller's array stays untouched
    n_rows, n_cols = out.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # Only strictly negative entries change; all others are kept.
            if out[row, col] < 0:
                out[row, col] = 0

    return out

In [2]:
def naive_add(x, y):
    """Element-wise sum of two equally shaped 2-D arrays, using explicit loops.

    Parameters
    ----------
    x : 2-D array (must expose ``shape`` and ``copy``).
    y : array with exactly the same shape as ``x``.

    Returns
    -------
    A copy of ``x`` whose entry ``[i, j]`` has had ``y[i, j]`` added to it.
    Because the result is stored in a copy of ``x``, it keeps ``x``'s dtype.
    ``x`` and ``y`` are not modified.

    Raises
    ------
    AssertionError
        If ``x`` is not two-dimensional or the shapes differ.
    """
    assert len(x.shape) == 2
    assert x.shape == y.shape

    total = x.copy()  # accumulate into a copy, leaving the inputs intact
    n_rows, n_cols = total.shape
    for row in range(n_rows):
        for col in range(n_cols):
            total[row, col] = total[row, col] + y[row, col]

    return total

In [3]:
def naive_add_matrix_and_vector(x, y):
    """Add a vector to every row of a matrix, using explicit loops.

    This spells out what broadcasting a 1-D array over the rows of a 2-D
    array does.

    Parameters
    ----------
    x : 2-D array (must expose ``shape`` and ``copy``).
    y : 1-D array whose length equals the number of columns of ``x``.

    Returns
    -------
    A copy of ``x`` in which ``y[j]`` has been added to every entry of
    column ``j``. ``x`` and ``y`` are not modified.

    Raises
    ------
    AssertionError
        If ``x`` is not 2-D, ``y`` is not 1-D, or the lengths do not match.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]

    shifted = x.copy()  # the caller's matrix must not be modified
    for row in range(shifted.shape[0]):
        # y has exactly one entry per column, so enumerate walks the columns.
        for col, offset in enumerate(y):
            shifted[row, col] = shifted[row, col] + offset

    return shifted

In [4]:
import numpy as np

# Broadcasting demo: x is a random 4-D tensor and y a random 2-D tensor whose
# shape (32, 10) matches the last two axes of x.
x = np.random.random((64, 3, 32, 10))
y = np.random.random((32, 10))
# np.maximum broadcasts y across the two leading axes of x, so z has the
# same shape as x: (64, 3, 32, 10).
z = np.maximum(x, y)
# Prints the (NumPy-abbreviated) array; no seed is set in this cell, so the
# values are expected to differ from run to run.
print(z)

[[[[0.26904389 0.8705948  0.4418309  ... 0.4599279  0.85260934
    0.84002684]
   [0.31166864 0.64759484 0.47019876 ... 0.55616407 0.63169478
    0.98993992]
   [0.7770974  0.62513907 0.92647312 ... 0.58971143 0.92650393
    0.99272361]
   ...
   [0.80005817 0.7262103  0.56048654 ... 0.67565854 0.76455534
    0.66494453]
   [0.80121628 0.74490646 0.51168756 ... 0.87525406 0.97515618
    0.41276969]
   [0.56951414 0.29062341 0.97026291 ... 0.898567   0.35458657
    0.98480159]]

  [[0.26904389 0.8705948  0.41585944 ... 0.4599279  0.44120435
    0.35115919]
   [0.50116676 0.17812621 0.65698579 ... 0.59260867 0.63169478
    0.55844427]
   [0.89524223 0.24437161 0.92647312 ... 0.79516633 0.90251856
    0.47235908]
   ...
   [0.17900692 0.80394267 0.84194667 ... 0.67565854 0.49133814
    0.97372693]
   [0.79498989 0.95689586 0.51168756 ... 0.87525406 0.31477705
    0.85818958]
   [0.87459097 0.2304525  0.41342798 ... 0.57823184 0.35458657
    0.92382938]]

  [[0.26904389 0.8705948  0.963255

In [5]:
def naive_vector_dot(x, y):
    """Dot product of two 1-D arrays of equal length, using an explicit loop.

    Parameters
    ----------
    x, y : 1-D arrays (must expose ``shape``) with the same number of
        elements.

    Returns
    -------
    The sum of the pairwise products ``x[i] * y[i]``, accumulated from left
    to right starting at ``0.``. For empty inputs the result is ``0.``.

    Raises
    ------
    AssertionError
        If either argument is not 1-D or the lengths differ.
    """
    assert len(x.shape) == 1
    assert len(y.shape) == 1
    assert x.shape[0] == y.shape[0]

    acc = 0.
    # The lengths are equal (asserted above), so zip visits every index once.
    for left, right in zip(x, y):
        acc = acc + left * right

    return acc

In [6]:
import numpy as np

def naive_matrix_vector_dot(x, y):
    """Matrix-vector product computed with explicit Python loops.

    Parameters
    ----------
    x : 2-D array of shape ``(rows, cols)``.
    y : 1-D array of length ``cols``.

    Returns
    -------
    A new array of length ``rows`` (created with ``np.zeros``) whose entry
    ``i`` is the sum over ``j`` of ``x[i, j] * y[j]``.

    Raises
    ------
    AssertionError
        If ``x`` is not 2-D, ``y`` is not 1-D, or the inner sizes differ.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]

    n_rows, n_cols = x.shape
    result = np.zeros(n_rows)
    for row in range(n_rows):
        for col in range(n_cols):
            # Accumulate directly in the output slot, one term at a time.
            result[row] = result[row] + x[row, col] * y[col]

    return result

In [7]:
def naive_matrix_dot(x, y):
    """Matrix-matrix product built from row-by-column dot products.

    Parameters
    ----------
    x : 2-D array of shape ``(n, k)``.
    y : 2-D array of shape ``(k, m)``.

    Returns
    -------
    A new ``(n, m)`` array (created with ``np.zeros``) whose entry
    ``[i, j]`` is ``naive_vector_dot`` applied to row ``i`` of ``x`` and
    column ``j`` of ``y``.

    Raises
    ------
    AssertionError
        If either argument is not 2-D or the inner dimensions differ.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 2
    assert x.shape[1] == y.shape[0]

    n_rows = x.shape[0]
    n_cols = y.shape[1]
    product = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        left_row = x[row, :]  # the same row is reused for every output column
        for col in range(n_cols):
            product[row, col] = naive_vector_dot(left_row, y[:, col])

    return product

In [8]:
# A small 3x2 float matrix; the following cells reshape it.
x = np.array([[0., 1.],
              [2., 3.],
              [4., 5.]])
print(x.shape)  # -> (3, 2)

(3, 2)


In [9]:
# Reshape the six elements into a single column of shape (6, 1).
# Note: this rebinds x, so the cell relies on x defined in an earlier cell.
x = x.reshape((6, 1))
print(x)

[[0.]
 [1.]
 [2.]
 [3.]
 [4.]
 [5.]]


In [10]:
# Reshape the same six elements into 2 rows of 3 columns.
# Note: this rebinds x again, so it relies on x from the preceding cells.
x = x.reshape((2, 3))
print(x)

[[0. 1. 2.]
 [3. 4. 5.]]


In [11]:
# Transposition swaps the two axes: a (300, 20) matrix becomes (20, 300).
x = np.zeros((300, 20))
x = np.transpose(x)
print(x.shape)  # -> (20, 300)

(20, 300)


In [12]:
# Load MNIST and flatten every 28x28 image into a 784-element float32 vector
# with values scaled into [0, 1].
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# `test_lables` is a legacy misspelling of `test_labels`; the alias is kept
# in case later cells still reference the misspelled name.
test_lables = test_labels

# Take the sample count from the loaded arrays instead of hard-coding
# 60000 / 10000, so the cell also works on a subset of the data.
train_images = train_images.reshape((train_images.shape[0], 28 * 28))
train_images = train_images.astype('float32') / 255  # pixel range 0..255 -> 0..1
test_images = test_images.reshape((test_images.shape[0], 28 * 28))
test_images = test_images.astype('float32') / 255

2024-02-13 11:33:19.380097: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-13 11:33:19.380125: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-13 11:33:19.381124: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-13 11:33:19.386153: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
from tensorflow.keras import layers

# Two-layer fully connected classifier:
# 784 inputs (flattened 28x28 image) -> 512 ReLU units -> 10-way softmax.
network = tf.keras.models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network.add(layers.Dense(10, activation='softmax'))

2024-02-13 11:33:20.914557: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-13 11:33:20.946818: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-13 11:33:20.947004: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [14]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss
# (the training labels are one-hot encoded with to_categorical before
# fitting), and accuracy reported as the metric.
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [15]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the training labels into 10 classes, matching the 10-unit
# softmax output layer and the categorical cross-entropy loss.
train_labels_one_hot = to_categorical(train_labels, num_classes=10)
# Train for 5 passes over the training set in mini-batches of 128 samples.
network.fit(train_images, train_labels_one_hot, epochs=5, batch_size=128)

Epoch 1/5


2024-02-13 11:33:21.763857: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-13 11:33:21.829670: I external/local_xla/xla/service/service.cc:168] XLA service 0x741de140d130 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-13 11:33:21.829686: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2024-02-13 11:33:21.833493: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-13 11:33:21.841847: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1707842001.882736  141273 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x741f0a290890>