# Assignment on Regularization and Optimization of Deep Learning

이번 과제에서는 regularization과 optimization에서 배운 내용들을 이용해 Deep Model의 성능을 최대한 높여보고자 합니다. Layer 4개짜리 MLP (각 hidden layer는 512개의 unit을 가짐) 상황에서 정규화와 최적화 방법론들을 총동원해 성능을 높여주시면 됩니다.

먼저, 아래 코드는 데이터셋을 세팅하는 부분입니다. 이 부분은 건드리시면 안 됩니다. 이 부분을 건드리시면 0점 처리됩니다. 외부 데이터를 사용하셔도 안 됩니다.

In [1]:
%matplotlib inline
import math
import random 

import tensorflow as tf
import numpy as np 
import matplotlib.pyplot as plt

# Fixed dataset setup for the assignment: seeds every RNG in use, builds a
# 10-class subset of CIFAR-100 and splits it into a small training pool and a
# large test set. The statement order matters because the shuffle below
# consumes the NumPy RNG state seeded here.
seed = 1
random.seed(seed)
np.random.seed(seed=seed)
tf.random.set_random_seed(seed)

# Merge the two splits returned by load_data() into one pool; images are
# promoted to float64.
(x_1, y_1), (x_2, y_2) = tf.keras.datasets.cifar100.load_data()
x_total = np.concatenate([x_1, x_2], axis=0).astype(np.float64)
y_total = np.concatenate([y_1, y_2], axis=0)

n_output = 10

# Keep only samples whose label is below n_output. np.where yields a
# (row, column) index pair here, so only the row indices are kept and the
# labels are flattened to 1-D.
valid_index, _ = np.where(y_total < n_output)
y_total = y_total[valid_index].reshape([-1])
x_total = x_total[valid_index]

# Shuffle images and labels with the same permutation.
i = np.arange(x_total.shape[0])
np.random.shuffle(i)
x_total = x_total[i]
y_total = y_total[i]

# The first 100 * n_output shuffled samples form the training pool; everything
# after that is the test set.
train_size = 100 * n_output
x_train = x_total[:train_size]
y_train = y_total[:train_size]
x_test = x_total[train_size:]
y_test = y_total[train_size:]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1000, 32, 32, 3)
(1000,)
(5000, 32, 32, 3)
(5000,)


validation set을 나눕니다. 
- 실습시간에 배웠던 것처럼 Validation set 비율은 조정하셔도 됩니다. 

In [2]:
# Hold out the leading fifth of the training pool for validation; the
# remaining samples stay in the training set. The right-hand sides are
# evaluated before any name is rebound, so both slices see the full pool.
split = len(x_train) // 5
x_valid, x_train = x_train[:split], x_train[split:]
y_valid, y_train = y_train[:split], y_train[split:]

이미지 전처리를 수행합니다. 원래 템플릿은 이미지를 greyscale로 변경하지만, 아래 코드는 RGB 3채널을 그대로 유지하고 학습 데이터의 채널별 평균과 표준편차로 정규화합니다.
1. RGB 값을 고려한 코드로 변경하셔도 됩니다. 
2. Augmentation을 고려해보세요.

In [3]:
# Keep all three RGB channels (the template's greyscale conversion is not
# used) and standardise every channel to zero mean / unit variance.
# The statistics come from the training split only and are then applied
# unchanged to the validation and test splits, so no information from the
# held-out data leaks into preprocessing.
# Reducing over (sample, height, width) leaves one value per channel, which
# broadcasts against the trailing channel axis of each image array.
mean = x_train.mean(axis=(0, 1, 2))
std = x_train.std(axis=(0, 1, 2))
x_train = (x_train - mean) / std
x_valid = (x_valid - mean) / std
x_test = (x_test - mean) / std

# Training-schedule settings shared with the later cells.
epoch = 10000
batch_size = 64

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)

(800, 32, 32, 3)
(200, 32, 32, 3)
(5000, 32, 32, 3)


## Make Data Generator
for augmentation

In [4]:
# Random augmentation applied to every training batch: small shifts,
# rotations of up to 10 degrees, zoom between 0.8x and 1.2x, and horizontal
# (but not vertical) flips. Pixels exposed by a transform are filled with a
# constant value.
augmentation_options = dict(
    rotation_range=10,
    zoom_range=[0.8, 1.2],
    width_shift_range=3,
    height_shift_range=3,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='constant',
)
datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)

In [5]:
# Size of one flattened image: height * width * channels.
n_input = 32 * 32 * 3

# The MLP consumes flat vectors. Only the evaluation splits are flattened
# here; x_train keeps its image shape because the augmentation generator
# works on images, and each training batch is flattened after augmentation.
x_valid = np.reshape(x_valid, (-1, n_input))
x_test = np.reshape(x_test, (-1, n_input))

for flattened in (x_valid, x_test):
    print(flattened.shape)

(200, 3072)
(5000, 3072)


이제 모델을 만듭니다.

1. Optimizer를 다른 걸로 바꿔보세요
2. Learning Rate를 바꿔보세요. Learning Rate Scheduling도 고려해보세요.
3. Activation Function을 바꿔보세요. 
4. Dropout, DropConnect, Gaussian Dropout 을 고려해보세요.
5. Augmentation을 고려해보세요. 

In [6]:
# ---------------------------------------------------------------------------
# Graph inputs: flattened images, integer class labels, and a flag that turns
# the stochastic regulariser on (training) or off (evaluation).
# ---------------------------------------------------------------------------
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.int32, [None])
training = tf.placeholder(tf.bool)

# Four weight layers: input -> 512 -> 512 -> 512 -> n_output.
n_units = [n_input, 512, 512, 512, n_output]

weights, biases = [], []
for n_in, n_out in zip(n_units[:-1], n_units[1:]):
    stddev = math.sqrt(2 / n_in)  # Kaiming He initialization (suits ReLU)
    weight = tf.Variable(
        tf.random.truncated_normal([n_in, n_out], mean=0, stddev=stddev))
    bias = tf.Variable(tf.zeros([n_out]))
    weights.append(weight)
    biases.append(bias)

# ---------------------------------------------------------------------------
# Forward pass. Every hidden layer uses max-norm-limited weights, ReLU and
# Gaussian dropout; the output layer is a plain affine map producing logits.
# (Plain dropout and DropConnect were tried as alternatives to Gaussian
# dropout and are intentionally not part of the graph.)
# ---------------------------------------------------------------------------
maxnorm = tf.keras.constraints.MaxNorm(2)  # stateless, so one instance is shared

layer = x
for weight, bias in zip(weights[:-1], biases[:-1]):
    # The constraint is applied inside the forward pass, so the layer always
    # multiplies by the norm-limited version of the stored variable.
    layer = tf.matmul(layer, maxnorm(weight)) + bias
    layer = tf.nn.relu(layer)
    layer = tf.keras.layers.GaussianDropout(0.5)(layer, training)
layer = tf.matmul(layer, weights[-1]) + biases[-1]
y_hat = layer

# ---------------------------------------------------------------------------
# Loss: mean softmax cross-entropy plus an L2 penalty on every weight matrix
# (the penalty uses the raw variables, not the norm-limited copies).
# ---------------------------------------------------------------------------
y_hot = tf.one_hot(y, n_output)
costs = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y_hot, logits=y_hat)
cross_entropy_loss = tf.reduce_mean(costs)
l2_loss = tf.add_n([tf.nn.l2_loss(weight) for weight in weights])
loss = cross_entropy_loss + 0.009 * l2_loss

# Accuracy: fraction of samples whose arg-max logit equals the label.
# tf.argmax returns int64, so the int32 labels are cast before comparing.
y_label = tf.argmax(y_hat, 1)
accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.cast(y, tf.int64), y_label), tf.float32))

# ---------------------------------------------------------------------------
# Optimiser: Adam with a cosine-decayed learning rate.
# NOTE(review): the schedule is driven by optimiser steps, not epochs, and
# decays over `decay_steps` steps — confirm that horizon matches the intended
# training length.
# ---------------------------------------------------------------------------
global_step = tf.Variable(1, trainable=False)
decay_steps = 20000
learning_rate = 0.0002
lr_decayed = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
optimizer = tf.train.AdamOptimizer(learning_rate=lr_decayed, beta1=0.85)

# Only the training op needs to wait for pending update ops (e.g. from layers
# that register moving-average updates); variable creation stays outside.
extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_ops):
    train_op = optimizer.minimize(loss, global_step=global_step)

W0727 00:13:14.106511  4652 deprecation.py:323] From c:\users\ironm\tf-nightly\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Create the session with allow_growth enabled in its GPU options, then
# initialise every variable defined in the graph.
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
session = tf.Session(config=session_config)

init_op = tf.global_variables_initializer()
session.run(init_op)

In [8]:
def _evaluate(features, labels, fetches):
    """Run `fetches` on one whole split with the training flag switched off.

    Args:
        features: flattened inputs of shape (num_samples, n_input).
        labels: integer class labels, one per sample.
        fetches: list of graph tensors to evaluate.

    Returns:
        The evaluated values, in the same order as `fetches`.
    """
    return session.run(
        fetches,
        feed_dict={x: features, y: labels, training: False})


# Best validation accuracy seen so far, the epoch it occurred at, and the test
# accuracy measured at that same epoch (model selection uses validation only).
max_valid_epoch_idx = 0
max_valid_accuracy = 0.0
final_test_accuracy = 0.0

# One epoch is one pass over the training set; the last batch may be partial.
steps_per_epoch = math.ceil(len(x_train) / batch_size)
# Un-augmented, flattened training set used only for monitoring.
x_train_flat = x_train.reshape([-1, n_input])

for epoch_idx in range(1, epoch + 1):
    # The generator yields augmented batches forever, so cap it at one epoch.
    # zip() stops as soon as range() is exhausted and therefore never pulls an
    # extra batch from the generator.
    batch_stream = datagen.flow(x_train, y_train, batch_size=batch_size)
    for _, (x_batch, y_batch) in zip(range(steps_per_epoch), batch_stream):
        session.run(
                train_op,
                feed_dict={
                    x: x_batch.reshape([-1, n_input]),
                    y: y_batch,
                    training: True
                })

    # Evaluate every 10th epoch.
    if epoch_idx % 10 == 0:
        train_loss_value, train_accuracy_value, lr = _evaluate(
            x_train_flat, y_train, [loss, accuracy, lr_decayed])
        valid_loss_value, valid_accuracy_value = _evaluate(
            x_valid, y_valid, [loss, accuracy])
        test_loss_value, test_accuracy_value = _evaluate(
            x_test, y_test, [loss, accuracy])

        print(epoch_idx, '%.4f' % train_loss_value, '%.4f' % valid_loss_value, '%.4f' % test_loss_value, '%.4f' % train_accuracy_value, '%.4f' % valid_accuracy_value, '%.4f' % test_accuracy_value)
        print('%.5f' % lr)
        if max_valid_accuracy < valid_accuracy_value:
            max_valid_accuracy = valid_accuracy_value
            max_valid_epoch_idx = epoch_idx
            final_test_accuracy = test_accuracy_value

    # Early stop: give up once validation accuracy has not improved for more
    # than 500 epochs.
    if max_valid_epoch_idx + 500 < epoch_idx:
        break

print(final_test_accuracy)

10 11.8350 12.0090 11.9367 0.4425 0.3550 0.3572
0.00020
20 11.0438 11.2382 11.1845 0.4738 0.3750 0.3954
0.00020
30 10.2703 10.5014 10.4570 0.4775 0.4000 0.4058
0.00020
40 9.5445 9.7873 9.7458 0.5262 0.3750 0.4212
0.00020
50 8.8365 9.1099 9.0674 0.5350 0.4000 0.4274
0.00020
60 8.1561 8.4662 8.4269 0.5463 0.4100 0.4354
0.00020
70 7.5125 7.8560 7.8238 0.5575 0.4100 0.4360
0.00020
80 6.9075 7.2777 7.2448 0.5763 0.4450 0.4452
0.00020
90 6.3273 6.7353 6.7236 0.6050 0.4100 0.4466
0.00020
100 5.7958 6.2456 6.2157 0.6188 0.4550 0.4560
0.00020
110 5.3106 5.7818 5.7659 0.6250 0.4700 0.4528
0.00020
120 4.8512 5.3553 5.3582 0.6462 0.4600 0.4570
0.00020
130 4.4351 4.9843 4.9934 0.6562 0.4650 0.4568
0.00020
140 4.0800 4.6306 4.6422 0.6550 0.4650 0.4616
0.00020
150 3.7341 4.3514 4.3605 0.6713 0.4750 0.4690
0.00020
160 3.4367 4.0737 4.0946 0.6800 0.4650 0.4724
0.00019
170 3.1413 3.7668 3.8184 0.7137 0.4700 0.4760
0.00019
180 2.8775 3.5351 3.6079 0.7325 0.4850 0.4752
0.00019
190 2.6909 3.3406 3.4393 0.7

32.88%의 성능을 확인할 수 있습니다. 실습시간에 배운 몇 가지 정규화와 최적화 과정을 동원하면 50% 정도의 성능까지는 쉽게 달성할 수 있음을 확인했습니다. 수업시간에 배운 내용들을 사용해 최대한 높은 성능을 나타내는 모델을 만들어보세요!
주피터 노트북 파일을 제출해주시면 되며, 성능을 기준으로 점수를 매길 예정입니다. (상대평가)