## 6.1. **Layers and Modules**

In [1]:
import tensorflow as tf

In [2]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

X = tf.random.uniform((2, 20))
net(X).shape


TensorShape([2, 10])

TF 프레임워크의 keras Library로 모델을 쉽게 정의할 수 있다. d2l에서 Concise Implementation 하면 다 이걸 썼다.

\[2,10] => 20차원의 두 벡터에 대한 2개의 10차원 output

#### 6.1.1. A Custom Module

In [3]:
class MLP(tf.keras.Model):
    """Two-layer perceptron: a 256-unit ReLU hidden layer and a 10-unit output layer."""

    def __init__(self):
        # Run the tf.keras.Model constructor first so the parent class can
        # perform its own initialization.
        super().__init__()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, X):
        """Forward pass: apply the hidden layer to X, then the output layer."""
        hidden_out = self.hidden(X)
        return self.out(hidden_out)

In [4]:
net = MLP()
net(X).shape

TensorShape([2, 10])

위 절의 모델을 클래스화하여 정의

#### 6.1.2. Sequential Module

In [5]:
class MySequential(tf.keras.Model):
    """Minimal sequential container that chains the layers it is given."""

    def __init__(self, *args):
        super().__init__()
        # Keep the layers in the order they were passed in.
        self.modules = args

    def call(self, X):
        """Feed X through every stored layer in order and return the result."""
        output = X
        for layer in self.modules:
            output = layer(output)
        return output

In [6]:
net = MySequential(
    tf.keras.layers.Dense(units=256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10))
net(X).shape

TensorShape([2, 10])

Layer를 arg로 받아서 정의할 수도 있다.

#### 6.1.3. Executing Code in the Forward Propagation Method

In [7]:
class FixedHiddenMLP(tf.keras.Model):
    """Model mixing a constant random projection, a Dense layer and a while loop."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        # Built with tf.constant, so these random weights are not updated
        # during training (i.e., constant parameters).
        self.rand_weight = tf.constant(tf.random.uniform((20, 20)))
        self.dense = tf.keras.layers.Dense(20, activation=tf.nn.relu)

    def call(self, inputs):
        """Return a scalar: the sum of the rescaled Dense output."""
        flat = self.flatten(inputs)
        # Constant projection plus 1, passed through ReLU.
        projected = tf.matmul(flat, self.rand_weight) + 1
        X = self.dense(tf.nn.relu(projected))
        # Control flow: halve X until the sum of absolute values is at most 1.
        while tf.reduce_sum(tf.math.abs(X)) > 1:
            X = X / 2
        return tf.reduce_sum(X)

In [8]:
net = FixedHiddenMLP()
net(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.9346579>

rand_weight => 고정된 parameter

근데 이걸 왜 쓰지?

## 6.2. **Parameter Management**

In [9]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape

TensorShape([2, 1])

#### 6.2.1. Parameter Access

keras.sequential 모델은 layer를 인덱스로 접근하여 weights attribute를 통해 weight parameter를 얻을 수 있다.

In [10]:
net.layers[2].weights

[<tf.Variable 'dense_8/kernel:0' shape=(4, 1) dtype=float32, numpy=
 array([[ 0.6598879 ],
        [-0.33984178],
        [ 0.58537877],
        [-0.25434136]], dtype=float32)>,
 <tf.Variable 'dense_8/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [11]:
type(net.layers[2].weights[1]), tf.convert_to_tensor(net.layers[2].weights[1])

(tensorflow.python.ops.resource_variable_ops.ResourceVariable,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>)

weights\[1]은 bias를 의미한다. 현재 값은 0

In [12]:
net.get_weights()

[array([[-0.8287021 ,  0.63501054, -0.61087656,  0.74562865],
        [-0.5236827 ,  0.17749792,  0.7323863 ,  0.45467585],
        [ 0.35315686, -0.34327447, -0.7816826 ,  0.2980644 ],
        [ 0.18441182,  0.12961483,  0.51729864, -0.32771587]],
       dtype=float32),
 array([0., 0., 0., 0.], dtype=float32),
 array([[ 0.6598879 ],
        [-0.33984178],
        [ 0.58537877],
        [-0.25434136]], dtype=float32),
 array([0.], dtype=float32)]

모든 Layer의 값을 보고 싶을 때에는 get_weights() 메소드를 사용한다.

#### 6.2.2. Tied Parameters

In [13]:
# Tied parameters: the same Dense instance is placed at two positions of the
# model. tf.keras behaves a bit differently here: it removes the duplicate
# layer automatically.
shared = tf.keras.layers.Dense(4, activation=tf.nn.relu)
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    shared,
    shared,
    tf.keras.layers.Dense(1),
])

net(X)
# Four entries were passed in, but `shared` is counted once, so the model
# should report 3 layers (the recorded output of this check is True).
print(len(net.layers) == 3)

True


parameter의 공유도 가능하다. 위 net 상에서 두 번째 Layer와 세 번째 Layer는 완전히 같고, 값도 하나가 변하면 같이 변한다.

## 6.3. **Parameter Initialization**

Keras의 Dense layer는 기본적으로 weight를 Glorot(Xavier) 균등분포에 따라 초기화하고, bias는 모두 0으로 초기화한다. keras.initializers를 사용하면 다양한 초기화 방법을 사용할 수 있다.

In [14]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape, net.get_weights()

(TensorShape([2, 1]),
 [array([[ 0.738174  , -0.85181916,  0.69923896,  0.03848928],
         [ 0.45712692, -0.32807493,  0.23295754,  0.40230566],
         [ 0.6425889 ,  0.8027771 ,  0.8570635 ,  0.17281443],
         [ 0.5036543 , -0.37805462,  0.6669442 , -0.13839835]],
        dtype=float32),
  array([0., 0., 0., 0.], dtype=float32),
  array([[-0.8515353 ],
         [-0.21796656],
         [-0.94990987],
         [-0.34600186]], dtype=float32),
  array([0.], dtype=float32)])

#### 6.3.1. Built-in Initialization

위에서 언급했던 keras.initializer의 built-in 초기화 방법이다.

In [15]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])

net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_13/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[ 0.01197534,  0.01325932, -0.00242216, -0.01244132],
        [-0.00741366, -0.00666446,  0.00793555, -0.00231276],
        [-0.00587579, -0.00439682,  0.00777427,  0.01144099],
        [ 0.00412243,  0.02554751, -0.01060079,  0.00026228]],
       dtype=float32)>,
 <tf.Variable 'dense_13/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

In [16]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.Constant(1),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1),
])

net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_15/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], dtype=float32)>,
 <tf.Variable 'dense_15/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

In [17]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.GlorotUniform()),
    tf.keras.layers.Dense(
        1, kernel_initializer=tf.keras.initializers.Constant(42)),
])

net(X)
print(net.layers[1].weights[0])
print(net.layers[2].weights[0])

<tf.Variable 'dense_17/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[-0.4596232 , -0.86197513, -0.32248867, -0.60097724],
       [-0.4070367 ,  0.03779143,  0.5044492 ,  0.34535187],
       [-0.62128997, -0.5323502 ,  0.5089659 , -0.11214465],
       [-0.11870503,  0.12151003, -0.22808039, -0.73223376]],
      dtype=float32)>
<tf.Variable 'dense_18/kernel:0' shape=(4, 1) dtype=float32, numpy=
array([[42.],
       [42.],
       [42.],
       [42.]], dtype=float32)>


위에서 첫 번째 Dense에 사용한 GlorotUniform은 5장에서 언급했던 Xavier Initializer이다. 두 번째 Dense는 Constant(42)로 초기화했다.

##### 6.3.1.1. Custom Initialization

In [18]:
class MyInit(tf.keras.initializers.Initializer):
    """Initializer that draws from U(-10, 10) and zeroes values with |w| < 5.

    Each entry therefore falls in [5, 10) or (-10, -5] with probability 1/4
    each, and is exactly 0 with probability 1/2.
    """

    def __call__(self, shape, dtype=None):
        """Return a tensor of the given shape filled as described above.

        Args:
            shape: Shape of the tensor to create.
            dtype: Requested dtype. When None, the Keras default float type
                (`tf.keras.backend.floatx()`) is used.
        """
        # Resolve a missing dtype explicitly instead of forwarding None to
        # tf.random.uniform.
        if dtype is None:
            dtype = tf.keras.backend.floatx()
        data = tf.random.uniform(shape, -10, 10, dtype=dtype)
        # Cast the keep-mask to the dtype of `data` (not a hard-coded float32)
        # so the product below is valid for every requested dtype.
        keep = tf.cast(tf.abs(data) >= 5, data.dtype)
        return data * keep

net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=MyInit()),
    tf.keras.layers.Dense(1),
])

net(X)
print(net.layers[1].weights[0])

<tf.Variable 'dense_19/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 6.3823795, -8.453104 , -6.452868 , -6.4946675],
       [-8.785086 , -0.       , -9.939615 ,  7.583332 ],
       [-0.       , -5.453863 ,  8.878979 , -0.       ],
       [ 6.195122 , -0.       ,  7.169895 ,  0.       ]], dtype=float32)>


Initializer 클래스 상속으로 초기화 방법을 정의할 수도 있다.

$\begin{split}\begin{aligned}
    w \sim \begin{cases}
        U(5, 10) & \text{ with probability } \frac{1}{4} \\
            0    & \text{ with probability } \frac{1}{2} \\
        U(-10, -5) & \text{ with probability } \frac{1}{4}
    \end{cases}
\end{aligned}\end{split}$

In [19]:
# Overwrite the first Dense layer's kernel directly: add 1 to every entry...
net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1)
# ...then set the single entry at position [0, 0] to 42.
net.layers[1].weights[0][0, 0].assign(42)
# Display the kernel to confirm both assignments took effect.
net.layers[1].weights[0]

<tf.Variable 'dense_19/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[42.       , -7.453104 , -5.452868 , -5.4946675],
       [-7.7850857,  1.       , -8.939615 ,  8.583332 ],
       [ 1.       , -4.453863 ,  9.878979 ,  1.       ],
       [ 7.195122 ,  1.       ,  8.169895 ,  1.       ]], dtype=float32)>

직접 값을 때려박을 수도 있다.

## 6.4. **Lazy Initialization**

In [20]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

이렇게 net을 선언만 하면 input 차원을 모르기 때문에 아직 weight가 정의되지 않은 상태이다.

In [21]:
# Weights of every layer, in order (empty lists here: no input has been seen yet).
[layer.get_weights() for layer in net.layers]

[[], []]

아무것도 안 나온 것을 볼 수 있다.

In [22]:
X = tf.random.uniform((2, 20))
net(X)
[w.shape for w in net.get_weights()]

[(20, 256), (256,), (256, 10), (10,)]

input을 한번 넣으면 X의 벡터 차원에 따라 weight와 bias가 생성된 것을 볼 수 있다.

## 6.5. **Custom Layers**

#### 6.5.1. Layers without Parameters

In [23]:
class CenteredLayer(tf.keras.Model):
    """Parameter-free layer that subtracts the mean from its input."""

    def __init__(self):
        super().__init__()

    def call(self, X):
        """Return X shifted by its mean taken over all elements."""
        mean = tf.reduce_mean(X)
        return X - mean

parameter 없이 mean을 0으로 만들어주는 Layer이다.

In [24]:
layer = CenteredLayer()
layer(tf.constant([1.0, 2, 3, 4, 5]))

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([-2., -1.,  0.,  1.,  2.], dtype=float32)>

In [25]:
net = tf.keras.Sequential([tf.keras.layers.Dense(128), CenteredLayer()])

In [26]:
Y = net(tf.random.uniform((4, 8)))
tf.reduce_mean(Y)

<tf.Tensor: shape=(), dtype=float32, numpy=-3.4924597e-10>

output 평균이 0에 매우 가깝다. floating point 오차인 것이다.

#### 6.5.2. Layers with Parameters

In [27]:
class MyDense(tf.keras.Model):
    """Fully connected ReLU layer whose parameters are created in build()."""

    def __init__(self, units):
        super().__init__()
        # Number of output features.
        self.units = units

    def build(self, X_shape):
        """Create the weight and bias from the last dimension of X_shape."""
        in_features = X_shape[-1]
        self.weight = self.add_weight(
            name='weight',
            shape=[in_features, self.units],
            initializer=tf.random_normal_initializer())
        self.bias = self.add_weight(
            name='bias',
            shape=[self.units],
            initializer=tf.zeros_initializer())

    def call(self, X):
        """Return relu(X @ weight + bias)."""
        pre_activation = tf.matmul(X, self.weight) + self.bias
        return tf.nn.relu(pre_activation)

In [28]:
dense = MyDense(3)
dense(tf.random.uniform((2, 5)))
dense.get_weights()

[array([[ 0.00103554,  0.04050731, -0.00267062],
        [-0.07613688,  0.01083075, -0.00709246],
        [-0.04965787,  0.04494112,  0.0365766 ],
        [ 0.05885713,  0.01667314, -0.04696984],
        [ 0.05154731, -0.04224121, -0.01240787]], dtype=float32),
 array([0., 0., 0.], dtype=float32)]

In [29]:
dense(tf.random.uniform((2, 5)))

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.02439984, 0.02518006, 0.        ],
       [0.        , 0.04655208, 0.        ]], dtype=float32)>

밑 코드는 Custom Layer만을 사용하여 Sequential 모델을 정의한 것이다.

In [30]:
net = tf.keras.models.Sequential([MyDense(8), MyDense(1)])
net(tf.random.uniform((2, 64)))

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.02213669],
       [0.01589044]], dtype=float32)>

## 6.6. **File I/O**

In [31]:
import numpy as np

numpy의 load와 save를 이용하여 Parameter를 저장하고 불러올 수 있다.

In [32]:
x = tf.range(4)
np.save('x-file.npy', x)

In [33]:
x2 = np.load('x-file.npy', allow_pickle=True)
x2

array([0, 1, 2, 3], dtype=int32)

In [34]:
y = tf.zeros(4)
np.save('xy-files.npy', [x, y])
x2, y2 = np.load('xy-files.npy', allow_pickle=True)
(x2, y2)

(array([0., 1., 2., 3.]), array([0., 0., 0., 0.]))

#### 6.6.2. Loading and Saving Model Parameters

In [35]:
class MLP(tf.keras.Model):
    """Flatten -> 256-unit ReLU Dense -> 10-unit Dense; used for the save/load demo."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, inputs):
        """Forward pass: flatten the inputs, then apply hidden and output layers."""
        flat = self.flatten(inputs)
        return self.out(self.hidden(flat))

net = MLP()
X = tf.random.uniform((2, 20))
Y = net(X)

keras는 save_weights를 통해 Parameter를 저장할 수 있다.

In [36]:
net.save_weights('mlp.params')

In [37]:
clone = MLP()
clone.load_weights('mlp.params')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f4da01239d0>

In [38]:
Y_clone = clone(X)
Y_clone == Y

<tf.Tensor: shape=(2, 10), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])>

완전히 같은 Parameter이기 때문에, 같은 input을 넣으면 같은 output이 나온다.

## 6.7. **GPUs**

In [39]:
!nvidia-smi
!pip install d2l

Fri May 12 10:03:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    28W /  70W |    567MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

위 명령어(`nvidia-smi`)를 통해 GPU의 상태 정보를 확인할 수 있다.

In [40]:
import tensorflow as tf
from d2l import tensorflow as d2l

#### 6.7.1. Computing Devices

하드웨어 정보를 가져오는 여러 인터페이스 정의

In [41]:
def cpu():
    """Return the tf.device context for the first CPU."""
    device_name = '/CPU:0'
    return tf.device(device_name)
def gpu(i=0):
    """Return the tf.device context for GPU number i (default 0)."""
    device_name = f'/GPU:{i}'
    return tf.device(device_name)
cpu(), gpu(), gpu(1)

(<tensorflow.python.eager.context._EagerDeviceContext at 0x7f4da011acc0>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f4da00dfd00>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f4da00dc6c0>)

In [42]:
def num_gpus():
    """Count the physical GPU devices reported by TensorFlow."""
    gpus = tf.config.experimental.list_physical_devices('GPU')
    return len(gpus)
num_gpus()

1

In [43]:
def try_gpu(i=0):
    """Return gpu(i) when at least i + 1 GPUs are present, else cpu()."""
    return gpu(i) if num_gpus() >= i + 1 else cpu()

def try_all_gpus():
    """Return all available GPUs, or [cpu(),] if no GPU exists.

    Returns:
        A non-empty list of device contexts: one gpu(i) per detected GPU, or
        a single-element list holding cpu() when num_gpus() is 0.
    """
    devices = [gpu(i) for i in range(num_gpus())]
    # Without this fallback the function returned an empty list on CPU-only
    # machines, contradicting the documented contract.
    return devices or [cpu()]

try_gpu(), try_gpu(10), try_all_gpus()

(<tensorflow.python.eager.context._EagerDeviceContext at 0x7f4dae430ec0>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f4da00de080>,
 [<tensorflow.python.eager.context._EagerDeviceContext at 0x7f4da00de140>])

#### 6.7.2. Tensors and GPUs

device attribute로 텐서가 어느 하드웨어에 있는지 알 수 있다.

In [44]:
x = tf.constant([1, 2, 3])
x.device

'/job:localhost/replica:0/task:0/device:GPU:0'

##### 6.7.2.1. Storage on the GPU

In [45]:
with try_gpu():
    X = tf.ones((2, 3))
X

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1., 1., 1.],
       [1., 1., 1.]], dtype=float32)>

In [46]:
with try_gpu(1):
    Y = tf.random.uniform((2, 3))
Y

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.51801133, 0.34969187, 0.23566723],
       [0.66548526, 0.8027836 , 0.8519317 ]], dtype=float32)>

##### 6.7.2.2. Copying

In [47]:
with try_gpu(1):
    Z = X
print(X)
print(Z)

tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]], shape=(2, 3), dtype=float32)
tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]], shape=(2, 3), dtype=float32)


텐서가 같은 GPU에 있어야 연산이 가능하다

In [48]:
Y + Z

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1.5180113, 1.3496919, 1.2356672],
       [1.6654853, 1.8027836, 1.8519317]], dtype=float32)>

In [49]:
with try_gpu(1):
    Z2 = Z
Z2 is Z

True

#### 6.7.3. Neural Networks and GPUs

In [50]:
# Build the model inside a MirroredStrategy scope. In the recorded run, the
# layer's kernel and bias both report device GPU:0 once the model is called.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    net = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1)])

이 코드는 GPU에 Parameter를 저장한다.

In [51]:
net(X)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.07153553],
       [0.07153553]], dtype=float32)>

위에처럼 input이 들어오면, 같은 GPU 상에서 연산을 한다.

In [52]:
net.layers[0].weights[0].device, net.layers[0].weights[1].device

('/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:0')