# Assignment on Regularization and Optimization of Deep Learning

이번 과제에서는 regularization과 optimization에서 배운 내용들을 이용해 최대한 Deep Models의 성능을 높여보고자 합니다. Layer 4개짜리 MLP (각 hidden layer는 512개의 unit을 가짐) 상황에서 정규화와 최적화 방법론들을 총 동원해 성능을 높여주시면 됩니다.

먼저, 아래 코드는 데이터 셋을 셋팅하는 부분입니다. 이 부분은 건드리시면 안됩니다. 이 부분을 건드리시면 0점 처리 됩니다. 외부 데이터 사용하셔도 안됩니다.

In [1]:
%matplotlib inline
import math
import random 

import tensorflow as tf
import numpy as np 
import matplotlib.pyplot as plt

# Seed the Python, NumPy and TensorFlow random number generators with the
# same value before any random operation below runs.
seed = 1
random.seed(seed)
np.random.seed(seed=seed)
tf.random.set_random_seed(seed)

# Load CIFAR-100 and pool the two splits returned by Keras into one array;
# a new train/test split is made further down.
(x_1, y_1), (x_2, y_2) = tf.keras.datasets.cifar100.load_data()
x_total = np.concatenate([x_1, x_2], axis=0).astype(np.float64)
y_total = np.concatenate([y_1, y_2], axis=0)

# Number of classes kept for this task.
n_output = 10

# Keep only the samples whose label is below n_output. np.where returns a
# pair of index arrays here; only the first (sample index) is used, and the
# selected labels are flattened to 1-D.
valid_index, _ = np.where(y_total < n_output)
y_total = y_total[valid_index].reshape([-1])
x_total = x_total[valid_index]

# Shuffle images and labels with one shared permutation so they stay aligned.
i = np.arange(x_total.shape[0])
np.random.shuffle(i)
x_total = x_total[i]
y_total = y_total[i]

# The first 100 * n_output shuffled samples form the training pool; all
# remaining samples become the test set.
train_size = 100 * n_output
x_train = x_total[:train_size]
y_train = y_total[:train_size]
x_test = x_total[train_size:]
y_test = y_total[train_size:]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1000, 32, 32, 3)
(1000,)
(5000, 32, 32, 3)
(5000,)


validation set을 나눕니다. 
- 실습시간에 배웠던 것처럼 Validation set 비율은 조정하셔도 됩니다. 

In [2]:
# Hold out the leading sixth of the training pool as the validation set and
# keep the remainder for training. Both right-hand sides are evaluated
# before either name is rebound, so the slices refer to the original arrays.
split = x_train.shape[0] // 6
x_valid, x_train = x_train[:split], x_train[split:]
y_valid, y_train = y_train[:split], y_train[split:]

이미지를 greyscale로 변경합니다. 
1. RGB 값을 고려한 코드로 변경하셔도 됩니다. 
2. Augmentation을 고려해보세요.

In [3]:
# Keep all three colour channels (the greyscale averaging from the assignment
# template is not used) and standardize each channel with the mean and
# standard deviation of the training images only; the same statistics are
# then applied to the validation and test images.
channels = range(3)
mean = np.array([np.mean(x_train[..., c]) for c in channels])
std = np.array([np.std(x_train[..., c]) for c in channels])
x_train, x_valid, x_test = (
    (images - mean) / std for images in (x_train, x_valid, x_test)
)

# Training hyper-parameters.
epoch = 10000
batch_size = 64

for images in (x_train, x_valid, x_test):
    print(images.shape)

(834, 32, 32, 3)
(166, 32, 32, 3)
(5000, 32, 32, 3)


## Make Data Generator
for augmentation

In [4]:
# Random augmentation settings for the training images. How each option is
# interpreted (e.g. integer shift ranges, fill behaviour) is defined by
# Keras' ImageDataGenerator -- see its documentation.
augmentation_options = {
    # Geometric jitter.
    'width_shift_range': 5,
    'height_shift_range': 5,
    'rotation_range': 10,
    'zoom_range': [0.8, 1.2],
    # Mirroring: left/right only.
    'horizontal_flip': True,
    'vertical_flip': False,
    # How pixels uncovered by a transform are filled.
    'fill_mode': 'constant',
}
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    **augmentation_options)

In [5]:
# Length of one flattened image: height * width * channels.
n_input = 32 * 32 * 3

# Only the validation and test images are flattened here. The training
# images keep their image shape for the augmentation generator and are
# flattened batch by batch in the training loop.
x_valid = x_valid.reshape(-1, n_input)
x_test = x_test.reshape(-1, n_input)

for flattened in (x_valid, x_test):
    print(flattened.shape)

(166, 3072)
(5000, 3072)


이제 모델을 만듭니다.

1. Optimizer를 다른 걸로 바꿔보세요
2. Learning Rate를 바꿔보세요. Learning Rate Scheduling도 고려해보세요.
3. Activation Function을 바꿔보세요. 
4. Dropout, DropConnect, Gaussian Dropout 을 고려해보세요.
5. Augmentation을 고려해보세요. 

In [6]:
# Graph inputs: flattened images, integer class labels, and a boolean flag
# that is fed as True for training steps and False for evaluation.
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.int32, [None])
training = tf.placeholder(tf.bool)

# Layer widths of the MLP: input, three hidden layers of 512 units, output.
n_units = [n_input, 512, 512, 512, n_output]

# One weight matrix and one bias vector per pair of consecutive layers.
weights, biases = [], []
for n_in, n_out in zip(n_units[:-1], n_units[1:]):
    stddev = math.sqrt(2 / n_in)  # Kaiming He initialization
    weights.append(tf.Variable(
        tf.random.truncated_normal([n_in, n_out], mean=0, stddev=stddev)))
    biases.append(tf.Variable(tf.zeros([n_out])))

# A single max-norm constraint object is reused for every hidden layer. It
# is applied to the weights inside the forward pass (not as a variable
# constraint), so the stored variables themselves are left unclipped.
maxnorm = tf.keras.constraints.MaxNorm(2)

# Hidden layers: max-norm weights -> affine -> ReLU -> Gaussian dropout.
layer = x
for weight, bias in zip(weights[:-1], biases[:-1]):
    layer = tf.matmul(layer, maxnorm(weight)) + bias
    layer = tf.nn.relu(layer)
    layer = tf.keras.layers.GaussianDropout(0.5)(layer, training)

# Output layer: plain affine map producing the logits (no constraint,
# activation or dropout).
y_hat = tf.matmul(layer, weights[-1]) + biases[-1]

y_hot = tf.one_hot(y, n_output)
costs = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y_hot, logits=y_hat)
cross_entropy_loss = tf.reduce_mean(costs)

# L2 regularization over the raw (unclipped) weight matrices; biases are
# not penalized. Note that `loss` therefore includes the penalty wherever
# it is reported.
l2_loss = 0.0
for weight in weights:
    l2_loss = l2_loss + tf.nn.l2_loss(weight)
loss = cross_entropy_loss + 0.009 * l2_loss

# Accuracy: number of predictions equal to the label, divided by the
# batch size.
y_label = tf.argmax(y_hat, 1)
accuracy = tf.count_nonzero(
        tf.cast(tf.equal(tf.argmax(y_hot, 1), y_label),
                tf.int64)) / tf.cast(tf.shape(y_hot)[0], tf.int64)

# Everything below is created under a control dependency on the ops
# registered in the UPDATE_OPS collection.
# NOTE(review): nothing visible in this cell registers update ops, so the
# collection is presumably empty -- confirm.
extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_ops):
    # Step counter incremented by the optimizer on every training step.
    global_step = tf.Variable(1, trainable=False)
    decay_steps = 20000
    learning_rate = 0.00011
    # Cosine-annealed learning rate driven by global_step.
    # NOTE(review): per the TensorFlow docs the rate stays at its final
    # value once global_step passes decay_steps (zero with the default
    # alpha) -- confirm that 20000 steps matches the intended training
    # length.
    lr_decayed = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr_decayed, beta1=0.85)
    train_op = optimizer.minimize(loss, global_step=global_step)

W0728 07:01:30.473685  2244 deprecation.py:323] From c:\users\ironm\tf-nightly\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Create the session with GPU memory growth enabled (allow_growth), then
# initialize every variable in the graph.
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
session = tf.Session(config=session_config)
session.run(tf.global_variables_initializer())

In [8]:
def _evaluate(features, labels):
    """Return [loss, accuracy] for the given data with `training` fed as False."""
    return session.run(
        [loss, accuracy],
        feed_dict={x: features, y: labels, training: False})


eval_interval = 10         # evaluate and log every this many epochs
early_stop_patience = 500  # epochs without a new best validation accuracy

# Loop invariants: number of batches that make up one epoch, and the
# flattened (un-augmented) training images used for evaluation.
steps_per_epoch = math.ceil(len(x_train) / batch_size)
x_train_flat = x_train.reshape([-1, n_input])

max_valid_epoch_idx = 0
max_valid_accuracy = 0.0
final_test_accuracy = 0.0
for epoch_idx in range(1, epoch + 1):
    # datagen.flow yields augmented batches indefinitely, so the epoch is
    # ended explicitly after steps_per_epoch batches.
    batch_iter = datagen.flow(x_train, y_train, batch_size=batch_size)
    for step, (x_batch, y_batch) in enumerate(batch_iter, start=1):
        session.run(
            train_op,
            feed_dict={
                x: x_batch.reshape([-1, n_input]),
                y: y_batch,
                training: True,
            })
        if step >= steps_per_epoch:
            break

    if epoch_idx % eval_interval == 0:
        train_loss_value, train_accuracy_value = _evaluate(x_train_flat, y_train)
        valid_loss_value, valid_accuracy_value = _evaluate(x_valid, y_valid)
        test_loss_value, test_accuracy_value = _evaluate(x_test, y_test)
        lr = session.run(lr_decayed)

        # Log line: epoch, train/valid/test loss, train/valid/test accuracy,
        # followed by the current learning rate on its own line.
        metrics = (
            train_loss_value, valid_loss_value, test_loss_value,
            train_accuracy_value, valid_accuracy_value, test_accuracy_value,
        )
        print(epoch_idx, *('%.4f' % value for value in metrics))
        print('%.5f' % lr)

        # Model selection uses validation accuracy only; the test accuracy
        # recorded is the one measured at the best validation epoch.
        if max_valid_accuracy < valid_accuracy_value:
            max_valid_accuracy = valid_accuracy_value
            max_valid_epoch_idx = epoch_idx
            final_test_accuracy = test_accuracy_value

    # Early stop once validation accuracy has not improved for more than
    # early_stop_patience epochs.
    if max_valid_epoch_idx + early_stop_patience < epoch_idx:
        break

print(final_test_accuracy)

10 12.2947 12.4052 12.3542 0.3405 0.2892 0.3104
0.00011
20 11.8864 12.0211 11.9620 0.3909 0.3614 0.3580
0.00011
30 11.5158 11.6776 11.6101 0.3993 0.3855 0.3584
0.00011
40 11.1815 11.3188 11.2715 0.4388 0.4277 0.3796
0.00011
50 10.8461 11.0050 10.9452 0.4197 0.3614 0.3740
0.00011
60 10.5164 10.7004 10.6270 0.4448 0.3976 0.3958
0.00011
70 10.2051 10.3966 10.3274 0.4508 0.3675 0.3968
0.00011
80 9.8795 10.0613 10.0049 0.4736 0.3916 0.4092
0.00011
90 9.5851 9.7945 9.7250 0.4652 0.3735 0.4038
0.00011
100 9.2810 9.4849 9.4253 0.4940 0.4096 0.4114
0.00011
110 8.9952 9.2144 9.1514 0.4736 0.3855 0.4030
0.00011
120 8.7099 8.9320 8.8749 0.4928 0.4096 0.4100
0.00011
130 8.4335 8.6834 8.6145 0.4808 0.4036 0.4046
0.00011
140 8.1734 8.4473 8.3602 0.4760 0.3855 0.4012
0.00011
150 7.8847 8.1611 8.0851 0.4904 0.4157 0.4094
0.00011
160 7.6465 7.8905 7.8394 0.4856 0.4458 0.4088
0.00011
170 7.3789 7.6397 7.5835 0.5060 0.4277 0.4264
0.00011
180 7.1295 7.4000 7.3460 0.5108 0.4398 0.4214
0.00011
190 6.8912 7.1

    32.88%의 성능을 확인할 수 있습니다. 실습시간에 배운 몇 가지 정규화와 최적화 과정을 동원하면 50% 정도의 성능까지는 쉽게 달성할 수 있음을 확인했습니다. 수업시간에 배운 내용들을 사용해 최대한 높은 성능을 나타내는 모델을 만들어보세요!
주피터 노트북 파일을 제출해주시면 되며, 성능을 기준으로 점수를 매길 예정입니다. (상대평가)