## Q1

In [2]:
import numpy as np
import pickle
import mxnet as mx

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
from mxnet import gluon, init, npx, autograd
from mxnet.gluon import nn
from mxnet.gluon.data.dataset import ArrayDataset
from mxnet.gluon.data import DataLoader
from mxnet.gluon.loss import L2Loss

In [5]:
file = open('msd_full.pickle', 'rb')
raw_data = pickle.load(file)

In [6]:
x_train = raw_data['X_train'].astype('float32')
y_train = raw_data['Y_train'].astype('float32')
x_test = raw_data['X_test'].astype('float32')
y_test = raw_data['Y_test'].astype('float32')

In [7]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
x_train_10000 = x_train[0:10000]
y_train_10000 = y_train[0:10000]
x_subtrain = x_train[0:int(len(x_train)*0.9)]
y_subtrain = y_train[0:int(len(y_train)*0.9)]
x_tune = x_train[int(len(x_train)*0.9):]
y_tune = y_train[int(len(y_train)*0.9):]

In [14]:
y_train_mean = y_train.mean()

### 1. OLS
- y: keep
- num of obs.: 10,000
- model: linear regression
- regularization: none

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [10]:
regressor = LinearRegression()  
regressor.fit(x_train_10000, y_train_10000)
y_pred = regressor.predict(x_test)
print('test rmse', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

test rmse 9.550725


### 2. MLP_0_dm
- y: de-mean
- num of obs: 10,000
- model: MLP, no hidden layers
- regularization: None

In [39]:
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()
y_train_demean = y_train_10000 - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_train_10000, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [40]:
# construct and initialize network.
net = nn.Sequential()
net.add(nn.Dense(1))

# define loss and trainer.
criterion = gluon.loss.L2Loss()

epochs = 1000
lr = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

In [42]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)
    if epoch%100 == 0:
        print('epoch:', epoch)
    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
Early Stopped 63
epoch: 0
Early Stopped 72
epoch: 0
epoch: 100
Early Stopped 172
epoch: 0
epoch: 100
epoch: 200
Early Stopped 277
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
best lr: 0.001
epoch: 0
epoch: 100
epoch: 200
Early Stopped 226
test rmse: 6.724543103699715


### 3. MLP_1_dm
- y: de-mean
- num of obs: 10,000
- model: MLP, one hidden layer + ReLU
- regularization: None

In [43]:
y_train_demean = y_train_10000 - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_train_10000, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [44]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(1, activation="relu"))

# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
# lr = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

In [45]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
Early Stopped 63
epoch: 0
Early Stopped 66
epoch: 0
Early Stopped 100
epoch: 0
epoch: 100
Early Stopped 141
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
Early Stopped 484
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
best lr: 0.001
test rmse: 7.167127556584797


### 4. MLP_2_dm
- y: de-mean
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: None

In [46]:
y_train_demean = y_train_10000 - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_train_10000, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [47]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(1, activation="relu"))

# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
# lr = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

In [48]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
Early Stopped 64
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
Early Stopped 113
epoch: 0
epoch: 100
Early Stopped 137
epoch: 0
epoch: 100
epoch: 200
epoch: 300
Early Stopped 335
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
Early Stopped 596
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
best lr: 0.001
Early Stopped 430
test rmse: 7.166892780058871


### 5. MLP_2_dm_L2
- y: de-mean
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: Weight Decay (L2)

In [49]:
y_train_demean = y_train_10000 - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_train_10000, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [50]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

weight_decay = [0.1, 0.001, 0.0001]
# epochs = 1000
# lr = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

In [51]:
result_lr = {}
result_wd = {}
for wd in weight_decay:
    for index in lr:
        net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index, 'wd': wd})
        train_history = []

        for epoch in range(epochs):
            cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
            training_samples = 0
            for x, y in train_data_loader:
                x = x.as_in_context(ctx)
                y = y.as_in_context(ctx)
                with autograd.record():
                    y_train_pred = net(x)
                    loss = criterion(y_train_pred, y)
                loss.backward()
                trainer.step(512)
                cumulative_train_loss += loss.sum()
                training_samples += x.shape[0]
            train_loss = cumulative_train_loss.asscalar()/training_samples
            train_rmse = np.sqrt(train_loss)
            if epoch%100 == 0:
                print('epoch:', epoch)
            # validation loop
            cumulative_valid_loss = mx.nd.zeros(1, ctx)
            valid_samples = 0
            for x, y in tune_data_loader:
                x = x.as_in_context(ctx)
                y = y.as_in_context(ctx)
                y_tune_pred = net(x)
                loss = criterion(y_tune_pred, y)
                cumulative_valid_loss += loss.sum()
                valid_samples += x.shape[0]
            valid_loss = cumulative_valid_loss.asscalar()/valid_samples
            valid_rmse = np.sqrt(valid_loss)
    #         if epoch%20 == 0:
    #             print('epoch:', epoch, 'loss:', valid_rmse)
            train_history.append(valid_rmse)
            if len(train_history) > 50:
                if train_history[-51] < min(train_history[-50:]):
                    print("Early Stopped", len(train_history))
                    break

        result_lr[index] = valid_rmse
        result_wd[wd] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
best_wd, _ = sorted(result_wd.items(), key = lambda item: item[1])[0]

print("best lr:", best_lr)
print("best wd:", best_wd)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr, 'wd': best_wd})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
Early Stopped 65
epoch: 0
epoch: 100
Early Stopped 105
epoch: 0
epoch: 100
Early Stopped 150
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
Early Stopped 519
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
Early Stopped 856
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
Early Stopped 87
epoch: 0
epoch: 100
Early Stopped 126
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
Early Stopped 404
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
Early Stopped 763
epoch: 0
epoch: 100
epoch: 200

### 6. MLP_2_dm_dropout
- y: de-mean
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: Dropout

In [52]:
y_train_demean = y_train_10000 - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_train_10000, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [53]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
# lr = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

In [54]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx) 
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
Early Stopped 761
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
Early Stopped 51
epoch: 0
Early Stopped 56
best lr: 0.005
Early Stopped 362
test rmse: 7.1594137408943075


### 7. MLP_2_ykeep
- y: keep
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: None

In [55]:
dataset = ArrayDataset(x_train_10000, y_train_10000)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [56]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
lr = [0.00001, 0.000005, 0.000001]

In [57]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
best lr: 1e-05
Early Stopped 916
test rmse: 6.665264112583238


### 8. MLP_2_ykeep_L2
- y: keep
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: Weight Decay (L2)

In [58]:
dataset = ArrayDataset(x_train_10000, y_train_10000)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [59]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(45, activation="relu"))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
lr = [0.00001, 0.000005, 0.000001]

In [None]:
result_lr = {}
result_wd = {}

for wd in weight_decay:
    for index in lr:
        net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index, 'wd': wd})
        train_history = []

        for epoch in range(epochs):
            cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
            training_samples = 0
            for x, y in train_data_loader:
                x = x.as_in_context(ctx)
                y = y.as_in_context(ctx)
                with autograd.record():
                    y_train_pred = net(x)
                    loss = criterion(y_train_pred, y)
                loss.backward()
                trainer.step(512)
                cumulative_train_loss += loss.sum()
                training_samples += x.shape[0]
            train_loss = cumulative_train_loss.asscalar()/training_samples
            train_rmse = np.sqrt(train_loss)
            if epoch%100 == 0:
                print('epoch:', epoch)
            # validation loop
            cumulative_valid_loss = mx.nd.zeros(1, ctx)
            valid_samples = 0
            for x, y in tune_data_loader:
                x = x.as_in_context(ctx)
                y = y.as_in_context(ctx)
                y_tune_pred = net(x)
                loss = criterion(y_tune_pred, y)
                cumulative_valid_loss += loss.sum()
                valid_samples += x.shape[0]
            valid_loss = cumulative_valid_loss.asscalar()/valid_samples
            valid_rmse = np.sqrt(valid_loss)
    #         if epoch%20 == 0:
    #             print('epoch:', epoch, 'loss:', valid_rmse)
            train_history.append(valid_rmse)
            if len(train_history) > 50:
                if train_history[-51] < min(train_history[-50:]):
                    print("Early Stopped", len(train_history))
                    break

        result_lr[index] = valid_rmse
        result_wd[wd] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
best_wd, _ = sorted(result_wd.items(), key = lambda item: item[1])[0]

print("best lr:", best_lr)
print("best wd:", best_wd)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr, 'wd': best_wd})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
epoch: 0
epoch: 100
epoch: 200
epoch: 300
epoch: 400
epoch: 500
epoch: 600
epoch: 700
epoch: 800
epoch: 900
best lr: 1e-05
best wd: 0.00

### 9. MLP_2_ykeep_dropout
- y: keep
- num of obs: 10,000
- model: MLP, two hidden layers + ReLu
- regularization: Dropout

In [None]:
dataset = ArrayDataset(x_train_10000, y_train_10000)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [None]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

# epochs = 1000
lr = [0.00001, 0.000005, 0.000001]

In [None]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)         
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%100 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

### 10. MLP_2_dm_dropout_full
- y: de-mean
- num of obs: All
- model: MLP, two hidden layers + ReLu
- regularization: Dropout

In [None]:
y_train_demean = y_subtrain - y_train_mean
y_tune_demean = y_tune - y_train_mean
y_test_demean = y_test - y_train_mean
y_train_fortest = y_train - y_train_mean

dataset = ArrayDataset(x_subtrain, y_train_demean)
train_data_loader = mx.gluon.data.DataLoader(dataset, batch_size = 512)
dataset_tune = ArrayDataset(x_tune, y_tune_demean)
tune_data_loader = mx.gluon.data.DataLoader(dataset_tune, batch_size = 512)

## for test
dataset_fortest = ArrayDataset(x_train, y_train_fortest)
train_dara_loader_fortest = mx.gluon.data.DataLoader(dataset_fortest, batch_size = 512)
dataset_test = ArrayDataset(x_test, y_test_demean)
test_data_loader = mx.gluon.data.DataLoader(dataset_test, batch_size = 512)

In [None]:
# construct and initialize network.
ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()

net = nn.Sequential()
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(45, activation="relu"))
net.add(gluon.nn.Dropout(.5))
net.add(nn.Dense(1, activation="relu"))
# define loss and trainer.
criterion = gluon.loss.L2Loss()

epochs = 50
lr = [0.00001, 0.000005, 0.000001]

In [None]:
result_lr = {}

for index in lr:
    net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': index})
    train_history = []
    
    for epoch in range(epochs):
        cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
        training_samples = 0
        for x, y in train_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_train_pred = net(x)
                loss = criterion(y_train_pred, y)
            loss.backward()
            trainer.step(512)
            cumulative_train_loss += loss.sum()
            training_samples += x.shape[0]
        train_loss = cumulative_train_loss.asscalar()/training_samples
        train_rmse = np.sqrt(train_loss)
        if epoch%10 == 0:
            print('epoch:', epoch)
        # validation loop
        cumulative_valid_loss = mx.nd.zeros(1, ctx)
        valid_samples = 0
        for x, y in tune_data_loader:
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            y_tune_pred = net(x)
            loss = criterion(y_tune_pred, y)
            cumulative_valid_loss += loss.sum()
            valid_samples += x.shape[0]
        valid_loss = cumulative_valid_loss.asscalar()/valid_samples
        valid_rmse = np.sqrt(valid_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
        train_history.append(valid_rmse)
        if len(train_history) > 50:
            if train_history[-51] < min(train_history[-50:]):
                print("Early Stopped", len(train_history))
                break
        
    result_lr[index] = valid_rmse
            
## use the best lr for testing
best_lr, _ = sorted(result_lr.items(), key = lambda item: item[1])[0]
print("best lr:", best_lr)

##########

net.initialize(init.Normal(sigma=0.28993590), force_reinit = True, ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': best_lr})
train_history = []

for epoch in range(epochs):
    cumulative_train_loss = mx.nd.zeros(1, ctx=ctx)
    training_samples = 0
    for x, y in train_dara_loader_fortest:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            y_train_pred = net(x)
            loss = criterion(y_train_pred, y)
        loss.backward()
        trainer.step(512)
        cumulative_train_loss += loss.sum()
        training_samples += x.shape[0]
    train_loss = cumulative_train_loss.asscalar()/training_samples
    train_rmse = np.sqrt(train_loss)

    # test loop
    cumulative_test_loss = mx.nd.zeros(1, ctx)
    test_samples = 0
    for x, y in test_data_loader:
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        y_test_pred = net(x)
        loss = criterion(y_test_pred, y)
        cumulative_test_loss += loss.sum()
        test_samples += x.shape[0]
    test_loss = cumulative_test_loss.asscalar()/test_samples
    test_rmse = np.sqrt(test_loss)
#         if epoch%20 == 0:
#             print('epoch:', epoch, 'loss:', valid_rmse)
    train_history.append(test_rmse)
    if len(train_history) > 50:
        if train_history[-51] < min(train_history[-50:]):
            print("Early Stopped", len(train_history))
            break
print('test rmse:', test_rmse)

## Q2

| Model  | y  | # of obs. in subtraining   | Regularization  | test RMSE |
|:---:|:---:|:---:|:---:|:---:|
| Linea Regression  | keep  | 10,000    | None  | 9.550725 |
| MLP, 0 hidden layer  | de-mean  | 10,000  | None  |6.724543|
| MLP, 1 hidden layer + ReLu  | de-mean  | 10,000    | None  | 7.161275|
| MLP, 2 hidden layers + ReLu  | de-mean  | 10,000    | None  | 7.166892 |
| MLP, 2 hidden layers + ReLu | de-mean  | 10,000    | Weight Decay(L2)  | 7.166494 |
| MLP, 2 hidden layers + ReLu  | de-mean  | 10,000   | Dropout  | 7.159413 |
| MLP, 2 hidden layers + ReLu  | keep  | 10,000   | None  | 6.665264 |
| MLP, 2 hidden layers + ReLu  | keep  | 10,000    | Weight Decay(L2) | |
| MLP, 2 hidden layers + ReLu  | keep  | 10,000    | Dropout  | |
| MLP, 2 hidden layers + ReLu  | de-mean  | All    | Dropout  | |