<a href="https://colab.research.google.com/github/chiyeon01/Hands_On_Machine_Learning/blob/main/11.%20%EC%8B%AC%EC%B8%B5_%EC%8B%A0%EA%B2%BD%EB%A7%9D_%ED%9B%88%EB%A0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 그레디언트 소실과 폭주 문제

## 배치 정규화

In [1]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(300, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(10, activation="relu")
])

model.summary()

  super().__init__(**kwargs)


In [2]:
[(var.name, var.trainable) for var in model.layers[1].variables]

[('gamma', True),
 ('beta', True),
 ('moving_mean', False),
 ('moving_variance', False)]

In [3]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),

    tf.keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),

    tf.keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),

    tf.keras.layers.Dense(10, activation="softmax")
])

model.summary()

## 그레디언트 클리핑

In [4]:
# clipvalue를 설정하여, 방향성을 유지하며 그레디언트 폭주를 막아줌.
optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)

In [5]:
# clipnorm을 설정하여, l2 규제를 적용함. 하지만, 방향성은 유지됨.
optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)

# 사전 훈련된 층 재사용하기

## 케라스를 사용한 전이 학습

In [6]:
import numpy as np

class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
               "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]
X_train, X_valid, X_test = X_train / 255, X_valid / 255, X_test / 255

pos_class_id = class_names.index("Pullover")
neg_class_id = class_names.index("T-shirt/top")

def split_dataset(X, y):
    y_for_B = (y == pos_class_id) | (y == neg_class_id)
    y_A = y[~y_for_B]
    y_B = (y[y_for_B] == pos_class_id).astype(np.float32)
    old_class_ids = list(set(range(10)) - set([neg_class_id, pos_class_id]))
    for old_class_id, new_class_id in zip(old_class_ids, range(8)):
        y_A[y_A == old_class_id] = new_class_id  # A에 대한 클래스 ID 재정의
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

tf.random.set_seed(42)

model_A = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(8, activation="softmax")
])

model_A.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                metrics=["accuracy"])
history = model_A.fit(X_train_A, y_train_A, epochs=20,
                      validation_data=(X_valid_A, y_valid_A))
model_A.save("my_model_A.h5")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.5652 - loss: 1.3718 - val_accuracy: 0.7819 - val_loss: 0.6487
Epoch 2/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m



In [7]:
tf.random.set_seed(42)
model_B = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model_B.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                metrics=["accuracy"])
history = model_B.fit(X_train_B, y_train_B, epochs=20,
                      validation_data=(X_valid_B, y_valid_B))
model_B.evaluate(X_test_B, y_test_B)

Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.5871 - loss: 0.7256 - val_accuracy: 0.6073 - val_loss: 0.6906
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5774 - loss: 0.6935 - val_accuracy: 0.6528 - val_loss: 0.6636
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6271 - loss: 0.6672 - val_accuracy: 0.7270 - val_loss: 0.6415
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6561 - loss: 0.6449 - val_accuracy: 0.7606 - val_loss: 0.6227
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.7247 - loss: 0.6254 - val_accuracy: 0.7982 - val_loss: 0.6061
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.7678 - loss: 0.6082 - val_accuracy: 0.8200 - val_loss: 0.5914
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

[0.4564388394355774, 0.9024999737739563]

In [8]:
model_A = tf.keras.models.load_model("my_model_A.h5")
model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))



In [9]:
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [10]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [11]:
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4, validation_data=(X_valid_B, y_valid_B))

for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16, validation_data=(X_valid_B, y_valid_B))

Epoch 1/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.4301 - loss: 4.7075 - val_accuracy: 0.4847 - val_loss: 3.5249
Epoch 2/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.4301 - loss: 3.6313 - val_accuracy: 0.4847 - val_loss: 2.5665
Epoch 3/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.4301 - loss: 2.5907 - val_accuracy: 0.4837 - val_loss: 1.7059
Epoch 4/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4350 - loss: 1.6859 - val_accuracy: 0.4352 - val_loss: 1.1490
Epoch 1/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - accuracy: 0.3523 - loss: 1.0799 - val_accuracy: 0.3917 - val_loss: 0.8453
Epoch 2/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.3707 - loss: 0.8210 - val_accuracy: 0.4659 - val_loss: 0.7408
Epoch 3/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [12]:
model_B_on_A.evaluate(X_test_B, y_test_B)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9096 - loss: 0.3341


[0.33761435747146606, 0.9045000076293945]

# 고속 옵티마이저

## 모멘텀 최적화

In [13]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

## 네스테로프 가속 경사

In [14]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)

## AdaGrad

In [15]:
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)

## RMSProp

In [16]:
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

## Adam

In [17]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

## AdaMax

In [18]:
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

## Nadam

In [19]:
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

## AdamW

In [20]:
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

# 학습률 스케줄링

## 거듭제곱 기반 스케줄링

In [21]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, weight_decay=1e-4)

## 지수 기반 스케줄링

In [22]:
def exponential_decay_fn(epoch):
    return 0.01 * 0.1**(epoch / 20)

def exponential_decay(lr0, s):
    def exponential_decay_fn(epoch):
        return lr0 * 0.1**(epoch / s)

    return exponential_decay_fn

exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

In [23]:
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16, validation_data=(X_valid_B, y_valid_B), callbacks=[lr_scheduler])

Epoch 1/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.9364 - loss: 0.3029 - val_accuracy: 0.6499 - val_loss: 0.6039 - learning_rate: 0.0100
Epoch 2/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.7790 - loss: 0.4348 - val_accuracy: 0.9090 - val_loss: 0.2762 - learning_rate: 0.0089
Epoch 3/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.9205 - loss: 0.2226 - val_accuracy: 0.9347 - val_loss: 0.2270 - learning_rate: 0.0079
Epoch 4/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9638 - loss: 0.1908 - val_accuracy: 0.9397 - val_loss: 0.2055 - learning_rate: 0.0071
Epoch 5/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.9569 - loss: 0.1748 - val_accuracy: 0.9436 - val_loss: 0.1937 - learning_rate: 0.0063
Epoch 6/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - 

In [24]:
# LearningRateScheduler 함수는 현재 learning rate도 입력받을 수 있음.
def exponential_decay_fn(epoch, lr):
    return lr * 0.1**(1 / 20)

## 구간별 고정 스케줄링

In [25]:
def piecewise_constant_fn(epoch):
    if epoch < 5:
        return 0.01
    elif epoch < 15:
        return 0.005
    else:
        return 0.001

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(piecewise_constant_fn)

## 성능 기반 스케줄링

In [26]:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)

## tf.keras.optimizers.optimizers.schedules 활용

In [27]:
import math

batch_size = 32
n_epochs = 25
n_steps = n_epochs * math.ceil(len(X_train) / batch_size)

scheduled_learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01, decay_steps=n_epochs, decay_rate=0.1,
)

optimizer = tf.keras.optimizers.SGD(learning_rate=scheduled_learning_rate)

# 규제를 사용해 과대적합 피하기

## l1과 l2 규제

In [28]:
# l2 규제
layer = tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal",
                              kernel_regularizer=tf.keras.regularizers.l2(0.01))

In [29]:
# l1 규제
layer = tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal",
                              kernel_regularizer=tf.keras.regularizers.l1(0.01))

In [30]:
# l1과 l2 규제 동시 적용
layer = tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal",
                              kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01))

## 드롭아웃

In [31]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),

    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu",
                          kernel_initializer="he_normal"),

    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(10, activation="softmax")
])

In [50]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets.mnist import load_data
from tensorflow.keras.utils import to_categorical

(X_train, y_train), (X_test, y_test) = load_data()
y_train, y_test = to_categorical(y_train), to_categorical(y_test)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.4088 - loss: 15.6816 - val_accuracy: 0.5962 - val_loss: 1.1977
Epoch 2/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.5419 - loss: 1.4763 - val_accuracy: 0.7130 - val_loss: 0.8977
Epoch 3/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.6547 - loss: 1.1687 - val_accuracy: 0.8342 - val_loss: 0.6273
Epoch 4/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.7448 - loss: 0.8999 - val_accuracy: 0.8867 - val_loss: 0.4687
Epoch 5/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.7983 - loss: 0.7235 - val_accuracy: 0.9087 - val_loss: 0.3734
Epoch 6/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8352 - loss: 0.5926 - val_accuracy: 0.9192 - val_loss: 0.3313
Epoch 7/20


## 몬테 카를로 드롭아웃

In [51]:
import numpy as np

y_probas = np.stack([model(X_test, training=True) for sample in range(100)])
y_proba = y_probas.mean(axis=0)

In [52]:
y_proba.shape

(10000, 10)

In [53]:
model.predict(X_test[:1]).round(3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

In [54]:
y_proba[0].round(3)

array([0.   , 0.   , 0.004, 0.003, 0.001, 0.   , 0.   , 0.986, 0.002,
       0.004], dtype=float32)

In [55]:
y_std = y_probas.std(axis=0)
y_std[0].round(3)

array([0.002, 0.001, 0.026, 0.018, 0.004, 0.004, 0.   , 0.071, 0.01 ,
       0.019], dtype=float32)

In [57]:
y_pred = y_proba.argmax(axis=1)
accuracy = (y_pred == y_test.argmax(axis=1)).sum() / len(y_test)
accuracy

np.float64(0.9404)

## 맥스-노름 규제

In [58]:
dense = tf.keras.layers.Dense(
    100, activation="relu", kernel_initializer="he_normal",
    kernel_constraint=tf.keras.constraints.max_norm(1.)
)