# Assignment on Regularization and Optimization of Deep Learning

이번 과제에서는 reagularization과 optimization에서 배운 내용들을 이용해 최대한 Deep Models의 성능을 높여보고자 합니다. Layer 4개짜리 MLP (각 hidden layer는 512개의 unit을 가짐) 상황에서 정규화와 최적화 방법론들을 총 동원해 성능을 높여주시면 됩니다.

먼저, 아래 코드는 데이터 셋을 셋팅하는 부분입니다. 이 부분은 건드리시면 안됩니다. 이 부분을 건드리시면 0점 처리 됩니다. 외부 데이터 사용하셔도 안됩니다.

In [1]:
%matplotlib inline
import math
import random 

import tensorflow as tf
import numpy as np 
import matplotlib.pyplot as plt

seed = 1
random.seed(seed)
np.random.seed(seed=seed)
tf.random.set_random_seed(seed)

(x_1, y_1), (x_2, y_2) = tf.keras.datasets.cifar100.load_data()
x_total = np.concatenate([x_1, x_2], axis=0).astype(np.float64)
y_total = np.concatenate([y_1, y_2], axis=0)

n_output = 10

valid_index, _ = np.where(y_total < n_output)
y_total = y_total[valid_index].reshape([-1])
x_total = x_total[valid_index]

i = np.arange(x_total.shape[0])
np.random.shuffle(i)
x_total = x_total[i]
y_total = y_total[i]

train_size = 100 * n_output
x_train = x_total[:train_size]
y_train = y_total[:train_size]
x_test = x_total[train_size:]
y_test = y_total[train_size:]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1000, 32, 32, 3)
(1000,)
(5000, 32, 32, 3)
(5000,)


validation set을 나눕니다. 
- 실습시간에 배웠던 것처럼 Validation set 비율은 조정하셔도 됩니다. 

In [2]:
split = x_train.shape[0] // 5
x_valid = x_train[:split]
y_valid = y_train[:split]

x_train = x_train[split:]
y_train = y_train[split:]

이미지를 greyscale로 변경합니다. 
1. RGB 값을 고려한 코드로 변경하셔도 됩니다. 
2. Augmentation을 고려해보세요.

In [3]:
# x_train = np.mean(x_train, axis=3, keepdims=True)
# x_valid = np.mean(x_valid, axis=3, keepdims=True)
# x_test = np.mean(x_test, axis=3, keepdims=True)
# Use RGB and normalize pixel values by training data-channel
mean = np.zeros(3)
std = np.ones(3)
for i in range(3):
    mean[i] = np.mean(x_train[:, :, :, i])
    std[i] = np.std(x_train[:, :, :, i])    
x_train = (x_train - mean) / std
x_valid = (x_valid - mean) / std
x_test = (x_test - mean) / std

epoch = 10000
batch_size = 64

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)

(800, 32, 32, 3)
(200, 32, 32, 3)
(5000, 32, 32, 3)


## Make Data Generator
for augmentation

In [4]:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    width_shift_range=5,
    height_shift_range=5,
    fill_mode='constant',
    horizontal_flip=True,
    vertical_flip=False,
    rotation_range=15,
    zoom_range=[0.8, 1.2]
    )

In [5]:
n_input = 32 * 32 * 3   # H * W * C

# x_train = x_train.reshape([-1, n_input]) This will be done after augmentation.
x_valid = x_valid.reshape([-1, n_input])
x_test = x_test.reshape([-1, n_input])

print(x_valid.shape)
print(x_test.shape)

(200, 3072)
(5000, 3072)


이제 모델을 만듭니다.

1. Optimizer를 다른 걸로 바꿔보세요
2. Learning Rate를 바꿔보세요. Learning Rate Scheduling도 고려해보세요.
3. Activation Function을 바꿔보세요. 
4. Dropout, DropConnect, Gaussian Dropout 을 고려해보세요.
5. Augmentation을 고려해보세요. 

In [6]:
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.int32, [None])
training = tf.placeholder(tf.bool)

n_units = [n_input, 512, 512, 512, n_output]

weights, biases = [], []
for i, (n_in, n_out) in enumerate(zip(n_units[:-1], n_units[1:])):
    stddev = math.sqrt(2 / n_in) # Kaiming He Initialization
    weight = tf.Variable(tf.random.truncated_normal([n_in, n_out], mean=0, stddev=stddev))
    bias = tf.Variable(tf.zeros([n_out]))
    weights.append(weight)
    biases.append(bias)
    
layer = x
for i, (weight, bias) in enumerate(zip(weights, biases)):
    if i < len(weights) - 1:
        rate = tf.cond(training, lambda: 0.5, lambda: 0.0)
        weight = tf.nn.dropout(weight, rate=rate) * (1 - rate) # DropConnect
        layer = tf.matmul(layer, weight) + bias
        layer = tf.nn.relu(layer)
    else:
        layer = tf.matmul(layer, weight) + bias
y_hat = layer

y_hot = tf.one_hot(y, n_output)
costs = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y_hot, logits=y_hat)
cross_entropy_loss = tf.reduce_mean(costs)
# L2-regularization
l2_loss = 0.0
for weight in weights:
    l2_loss = l2_loss + tf.nn.l2_loss(weight)
loss = tf.reduce_mean(cross_entropy_loss + 0.001 * l2_loss)

y_label = tf.argmax(y_hat, 1)
accuracy = tf.count_nonzero(
        tf.cast(tf.equal(tf.argmax(y_hot, 1), y_label),
                tf.int64)) / tf.cast(tf.shape(y_hot)[0], tf.int64)

extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_ops):
    global_step = tf.Variable(1, trainable=False)
    decay_steps = 50000
    learning_rate = 0.0001
    lr_decayed = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
    optimizer = tf.train.AdamOptimizer(lr_decayed)
    train_op = optimizer.minimize(loss, global_step=global_step)

In [7]:
gpu_options = tf.GPUOptions()
gpu_options.allow_growth = True
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())

In [8]:
max_valid_epoch_idx = 0
max_valid_accuracy = 0.0
final_test_accuracy = 0.0
for epoch_idx in range(1, 10000 + 1):
    batches = 0
    for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=batch_size):
        x_batch = x_batch.reshape([-1, n_input])
        session.run(
                train_op,
                feed_dict={
                    x: x_batch,
                    y: y_batch,
                    training: True
                })
        batches += 1
        if batches >= len(x_train) / batch_size:
            break

    if epoch_idx % 10 == 0:
        train_loss_value, train_accuracy_value, lr = session.run(
            [loss, accuracy, lr_decayed],
            feed_dict={
                x: x_train.reshape([-1, n_input]),
                y: y_train,
                training: False
            })
        
        valid_loss_value, valid_accuracy_value = session.run(
            [loss, accuracy],
            feed_dict={
                x: x_valid,
                y: y_valid,
                training: False
            })
            
        test_loss_value, test_accuracy_value = session.run(
            [loss, accuracy],
            feed_dict={
                x: x_test,
                y: y_test,
                training: False
            })

        print(epoch_idx, '%.4f' % train_loss_value, '%.4f' % valid_loss_value, '%.4f' % test_loss_value, '%.4f' % train_accuracy_value, '%.4f' % valid_accuracy_value, '%.4f' % test_accuracy_value)
        print('%.5f' % lr)
        if max_valid_accuracy < valid_accuracy_value:
            max_valid_accuracy = valid_accuracy_value 
            max_valid_epoch_idx = epoch_idx
            final_test_accuracy = test_accuracy_value
            
    # Early Stop
    if max_valid_epoch_idx + 100 < epoch_idx: # Increase Early Stop bound
        break
        
print(final_test_accuracy)

10 5.1349 5.4907 5.3698 0.3225 0.2900 0.2942
0.00010
20 6.5508 7.4678 7.1394 0.3812 0.3100 0.3400
0.00010
30 6.7322 7.9138 7.7379 0.4163 0.3350 0.3718
0.00010
40 6.9494 8.0710 8.2941 0.4450 0.3700 0.3932
0.00010
50 7.0061 8.2878 8.6709 0.4662 0.3850 0.4036
0.00010
60 6.8437 8.5515 8.8495 0.4788 0.3800 0.4084
0.00010
70 6.5268 8.5816 8.9344 0.4950 0.3850 0.4076
0.00010
80 6.4803 8.4495 9.0404 0.5038 0.4350 0.4218
0.00010
90 6.7956 9.0715 9.7076 0.5138 0.4100 0.4146
0.00010
100 6.0640 8.2558 9.1370 0.5262 0.4500 0.4252
0.00010
110 6.1583 8.7615 9.4902 0.5300 0.4450 0.4226
0.00010
120 5.9347 8.6623 9.5268 0.5575 0.4600 0.4294
0.00010
130 5.7374 8.9362 9.4924 0.5637 0.4550 0.4316
0.00010
140 5.5874 8.6649 9.6636 0.5800 0.4500 0.4362
0.00010
150 5.3928 8.6939 9.6157 0.5975 0.4700 0.4424
0.00010
160 5.4659 9.0024 9.9041 0.5850 0.4450 0.4348
0.00010
170 5.0970 8.7022 9.7021 0.6200 0.4600 0.4482
0.00010
180 5.2280 9.2686 10.1750 0.6050 0.4450 0.4466
0.00010
190 4.9346 8.9417 9.8379 0.6212 0.47

32.88% 의 성능을 확인할 수 있습니다. 실습시간 배운 몇 가지 정규화와 최적화 과정을 동원하면 50% 정도의 성능까지는 쉽게 달성할 수 있음을 확인했습니다. 수업시간에 배운 내용들을 사용해 최대한 높은 성능을 나타내는 모델을 만들어보세요! 
주피터 노트북 파일을 제출해주시면 되며, 성능을 기준으로 점수를 매길 예정입니다. (상대평가)