Note: If you are having difficulty installing the tensorflow, keras and pytorch libraries, use google colab!


# Library imports

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# Keras implementations

In [None]:
from keras.datasets import boston_housing
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()

# 404 training samples and 102 test samples, 
# each with 13 numerical feature
print("train_data.shape", train_data.shape)

# normalize the data
mean = train_data.mean(axis=0)

train_data -= mean
std = train_data.std(axis=0)
train_data /= std

test_data -= mean
test_data /= std

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz
train_data.shape (404, 13)


In [None]:
# few samples are available, use a very small network 
# with two hidden layers, each with 64 units. 
# In general, the less training data you have, 
# the worse overfitting will be, and using a small network 
# is one way to mitigate overfitting.

from keras import Sequential, layers
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu',
            input_shape = (train_data.shape[1],)))
    
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(64,activation='relu'))
    model.add(layers.Dense(32,activation = 'relu'))

    # network ends with a single unit and no activation. 
    # This is a typical setup for scalar regression 
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model

In [None]:
model = build_model()

model.fit(train_data, train_targets, epochs=80, batch_size=16, verbose=0)

test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)

print("test_mae_score", np.round(test_mae_score,3)) 

# mae value around 2.54 -> \$2,540 
# (house price range \$10,000-\$50,000)

test_mae_score 2.712


# PyTorch implementations

In [None]:
#Define the model 
import torch
import torch.nn as nn
import torch.nn.functional as F

## data preprocessing
- data house pricing data is downloaded from another source in this exercise

In [None]:
#From sklearn tutorial.
from sklearn.datasets import load_boston
boston = load_boston()
# print( "Type of boston dataset:", type(boston))


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [None]:
#A bunch is you remember is a dictionary based dataset.  Dictionaries are addressed by keys. 
#Let's look at the keys. 
# print(boston.keys())

#DESCR sounds like it could be useful. Let's print the description.
# print(boston['DESCR'])

In [None]:
# Let's change the data to a Panda's Dataframe
boston_df = pd.DataFrame(boston['data'] )
boston_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [None]:
#Now add the column names.
boston_df.columns = boston['feature_names']
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [None]:
#Add the target as PRICE. 
boston_df['PRICE']= boston['target']
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [None]:
#This will throw and error at import if haven't upgraded. 
# from sklearn.cross_validation  import train_test_split  
from sklearn.model_selection  import train_test_split
#y is the dependent variable.
y = boston_df['PRICE']
#As we know, iloc is used to slice the array by index number. Here this is the matrix of 
#independent variables.
X = boston_df.iloc[:,0:13]

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(354, 13) (152, 13) (354,) (152,)


In [None]:
#Change to numpy array. 
X_train=X_train.values
y_train=y_train.values
X_test=X_test.values
y_test=y_test.values

In [None]:
#Define training hyperprameters.
batch_size = 50
num_epochs = 200
learning_rate = 0.01
size_hidden= 100

#Calculate some other hyperparameters based on data.  
batch_no = len(X_train) // batch_size  #batches
cols = X_train.shape[1] #Number of columns in input matrix
n_output=1


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assume that we are on a CUDA machine, then this should print a CUDA device:
print("Executing the model on :",device)

#Create the model object
class Net(torch.nn.Module):
    def __init__(self, n_feature, size_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(cols, size_hidden)   # hidden layer
        self.predict = torch.nn.Linear(size_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

model_pytorch = Net(cols, size_hidden, n_output)

Executing the model on : cpu


In [None]:
#Adam is a specific flavor of gradient decent which is typically better
optimizer = torch.optim.Adam(model_pytorch.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(net.parameters(), lr=0.2)
criterion = torch.nn.MSELoss(size_average=False)  # this is for regression mean squared loss



In [None]:
from sklearn.utils import shuffle
from torch.autograd import Variable
running_loss = 0.0
for epoch in range(num_epochs):
    
    #Shuffle just mixes up the dataset between epochs
    X_train, y_train = shuffle(X_train, y_train)
    
    # Mini batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        inputs = Variable(torch.FloatTensor(X_train[start:end]))
        labels = Variable(torch.FloatTensor(y_train[start:end]))
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model_pytorch(inputs)
        #print("outputs",outputs)
        #print("outputs",outputs,outputs.shape,"labels",labels, labels.shape)
        
        loss = criterion(outputs, torch.unsqueeze(labels,dim=1))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

    # print progress every 20th epoch
    if (epoch+1) % 20 == 0:
        print('Epoch {}'.format(epoch+1), "loss: ",running_loss)
    running_loss = 0.0



Epoch 20 loss:  3090.8970336914062
Epoch 40 loss:  3142.290237426758
Epoch 60 loss:  2969.9196166992188
Epoch 80 loss:  2844.443878173828
Epoch 100 loss:  4001.4786376953125
Epoch 120 loss:  2559.8975219726562
Epoch 140 loss:  2969.085174560547
Epoch 160 loss:  3262.7972106933594
Epoch 180 loss:  2391.389617919922
Epoch 200 loss:  2381.66455078125


In [None]:
#This is a little bit tricky to get the resulting prediction.  
def calculate_r2_mae(x,y=[]):
    """
    This function will return the r2 if passed x and y or return predictions if just passed x. 
    """

    # Evaluate the model with the test set. 
    X = Variable(torch.FloatTensor(x))  
    
    result = model_pytorch(X) #This outputs the value for regression
    result = result.data[:,0].numpy()
  
    if len(y) != 0:
        r2 = r2_score(result, y)
        mae = mean_absolute_error(result, y)
        print("R-Squared: %.3f, MAE: %.2f" %(r2, mae))
        
        #print('Accuracy {:.2f}'.format(num_right / len(y)), "for a total of ", len(y), "records")
        return pd.DataFrame(data= {'actual': y, 'predicted': result})
    else:
        print("returning predictions")
        return result

In [None]:
result1 = calculate_r2_mae(X_train,y_train)
result2 = calculate_r2_mae(X_test,y_test)

R-Squared: 0.905, MAE: 1.89
R-Squared: 0.678, MAE: 2.74


# Exercises:
- modify above NN model with different number of dense layers, hidden units, loss function, # of training epochs etc to identify a prediction model with better performance (i.e., lower MAE value and higher r2 value) 

In [None]:
import numpy as np
import pandas as pd

# %matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from keras.datasets import boston_housing
from keras import Sequential, layers

class Foo:
    def __init__(self):
        (self.train_data, self.train_targets), (self.test_data, self.test_targets) = boston_housing.load_data()
        # 404 training samples and 102 test samples,
        # each with 13 numerical feature
        print("train_data.shape", self.train_data.shape)

        # normalize the data
        mean = self.train_data.mean(axis=0)
        self.train_data -= mean
        std = self.train_data.std(axis=0)
        self.train_data /= std
        self.test_data -= mean
        self.test_data /= std

    # few samples are available, use a very small network
    # with two hidden layers, each with 64 units.
    # In general, the less training data you have,
    # the worse overfitting will be, and using a small network
    # is one way to mitigate overfitting.

    def build_model(self, l1, a1, l2=128, a2='relu', l3=64, a3='relu', l4=32, a4='relu'):
        model = Sequential()
        model.add(layers.Dense(l1, activation=a1, input_shape=(self.train_data.shape[1],)))
        model.add(layers.Dense(l2, activation=a2))
        model.add(layers.Dense(l3, activation=a3))
        model.add(layers.Dense(l4, activation=a4))

        # network ends with a single unit and no activation.
        # This is a typical setup for scalar regression
        model.add(layers.Dense(1))
        model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
        return model

    def eval(self, l1=128, a1='relu', l2=128, a2='relu', l3=64, a3='relu', l4=32, a4='relu', epoc=80):
        model = self.build_model(l1, a1, l2, a2, l3, a3, l4, a4)
        model.fit(self.train_data, self.train_targets, epochs=epoc, batch_size=16, verbose=0)
        test_mse_score, test_mae_score = model.evaluate(self.test_data, self.test_targets)
        print("test_mae_score", np.round(test_mae_score, 3))
        # mae value around 2.54 -> \$2,540 
        # (house price range \$10,000-\$50,000)

foo = Foo()
foo.eval(l1=128, a1='relu', l2=128, a2='relu', l3=64, a3='relu', l4=32, a4='relu', epoc=80)

train_data.shape (404, 13)
test_mae_score 3.071


In [168]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.datasets import load_boston
from sklearn.utils import shuffle
from torch.autograd import Variable
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


class Net(torch.nn.Module):
    def __init__(self, input_features, l1=128, l2=128, l3=16, l4=16, l5=4, l6=4, output=1):
        super(Net, self).__init__()
        self.h0 = torch.nn.Linear(input_features, l1)
        self.h1 = torch.nn.Linear(l1, l2)
        self.h2 = torch.nn.Linear(l2, l3)
        self.h3 = torch.nn.Linear(l3, l4)
        self.h4 = torch.nn.Linear(l4, l5)
        self.h5 = torch.nn.Linear(l5, l6)
        self.output_layer = torch.nn.Linear(l6, output)

    def forward(self, x):
        x = F.relu(self.h0(x))
        x = F.relu(self.h1(x))
        x = F.relu(self.h2(x))
        x = F.relu(self.h3(x))
        x = F.relu(self.h4(x))
        x = F.relu(self.h5(x))
        # x = F.rrelu(self.h0(x))
        # x = F.rrelu(self.h1(x))
        # x = F.rrelu(self.h2(x))
        # x = F.rrelu(self.h3(x))
        # x = F.rrelu(self.h4(x))
        # x = F.rrelu(self.h5(x))
        x = self.output_layer(x)
        return x


class Foo:
    def __init__(self):
        boston = load_boston()
        boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
        boston_df['PRICE'] = boston['target']
        self.X = boston_df.iloc[:, 0:13]
        self.y = boston_df['PRICE']
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, test_size=0.3, random_state=0)

        print(self.X_train.shape, self.X_test.shape, self.y_train.shape, self.y_test.shape)

        self.X_train = self.X_train.values
        self.y_train = self.y_train.values
        self.X_test = self.X_test.values
        self.y_test = self.y_test.values
        self.cols = self.X_train.shape[1]  # Number of columns in input matrix
        self.model_pytorch = None

    def eval(self, batch_size=50, num_epochs=200, learning_rate=0.01, criterion=torch.nn.HuberLoss(reduction='sum')):
        batch_no = len(self.X_train) // batch_size  # batches

        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # # Assume that we are on a CUDA machine, then this should print a CUDA device:
        # print("Executing the model on :", device)

        # self.model_pytorch = Net(self.cols, size_hidden, self.n_output)

        # Adam is a specific flavor of gradient decent which is typically better
        optimizer = torch.optim.Adam(self.model_pytorch.parameters(), lr=learning_rate)
        # optimizer = torch.optim.SGD(net.parameters(), lr=0.2)

        running_loss = 0.0
        for epoch in range(num_epochs):

            # Shuffle just mixes up the dataset between epochs
            X_train, y_train = shuffle(self.X_train, self.y_train)

            # Mini batch learning
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = Variable(torch.FloatTensor(X_train[start:end]))
                labels = Variable(torch.FloatTensor(y_train[start:end]))

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.model_pytorch(inputs)
                # print("outputs",outputs)
                # print("outputs",outputs,outputs.shape,"labels",labels, labels.shape)

                loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()

            # print progress every 20th epoch
            if (epoch + 1) % 1000 == 0:
                print('Epoch {}'.format(epoch + 1), "loss: ", running_loss)
            running_loss = 0.0
        result1 = self.calculate_r2_mae(self.X_train, self.y_train)
        result2 = self.calculate_r2_mae(self.X_test, self.y_test)
        pass

    def calculate_r2_mae(self, x, y=None):
        """
        This function will return the r2 if passed x and y or return predictions if just passed x.
        """

        # Evaluate the model with the test set.
        if y is None:
            y = []
        X = Variable(torch.FloatTensor(x))

        result = self.model_pytorch(X)  # This outputs the value for regression
        result = result.data[:, 0].numpy()

        if len(y) != 0:
            r2 = r2_score(result, y)
            mae = mean_absolute_error(result, y)
            print("R-Squared: %.3f, MAE: %.2f" % (r2, mae))

            # print('Accuracy {:.2f}'.format(num_right / len(y)), "for a total of ", len(y), "records")
            return pd.DataFrame(data={'actual': y, 'predicted': result})
        else:
            print("returning predictions")
            return result


foo = Foo()

(354, 13) (152, 13) (354,) (152,)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

##### Round One ：random tuning

In [122]:
foo.model_pytorch = Net(foo.cols, l1=256, l2=256, l3=16, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=80, num_epochs=3000, learning_rate=0.002, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  4.399343550205231
Epoch 2000 loss:  1.8710781633853912
Epoch 3000 loss:  2.811720758676529
R-Squared: 0.984, MAE: 0.89
R-Squared: 0.736, MAE: 3.19


In [126]:
# candidate
foo.model_pytorch = Net(foo.cols, l1=32, l2=32, l3=16, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.001, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  2.36799955368042
Epoch 2000 loss:  1.702791452407837
Epoch 3000 loss:  1.538000226020813
Epoch 4000 loss:  1.2366721630096436
Epoch 5000 loss:  1.1024945974349976
Epoch 6000 loss:  1.203973412513733
Epoch 7000 loss:  1.1298693418502808
Epoch 8000 loss:  1.0961107015609741
Epoch 9000 loss:  0.8285300731658936
Epoch 10000 loss:  0.9039367437362671
R-Squared: 0.936, MAE: 1.24
R-Squared: 0.732, MAE: 2.74


In [121]:
foo.model_pytorch = Net(foo.cols, l1=32, l2=32, l3=16, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=80, num_epochs=3000, learning_rate=0.002, criterion=torch.nn.MSELoss())

Epoch 1000 loss:  28.672372817993164
Epoch 2000 loss:  10.635968804359436
Epoch 3000 loss:  5.3301162123680115
R-Squared: 0.984, MAE: 0.85
R-Squared: 0.670, MAE: 2.92


In [106]:
# candidate
foo.model_pytorch = Net(foo.cols, l1=32, l2=32, l3=16, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=80, num_epochs=3000, learning_rate=0.002, criterion=torch.nn.L1Loss())

Epoch 1000 loss:  8.045549631118774
Epoch 2000 loss:  6.6873239278793335
Epoch 3000 loss:  5.609277009963989
R-Squared: 0.959, MAE: 1.20
R-Squared: 0.686, MAE: 2.76


In [112]:
foo.model_pytorch = Net(foo.cols, l1=32, l2=32, l3=16, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=80, num_epochs=3000, learning_rate=0.002, criterion=torch.nn.SmoothL1Loss())

Epoch 1000 loss:  6.032895803451538
Epoch 2000 loss:  3.2592626214027405
Epoch 3000 loss:  3.3008354902267456
R-Squared: 0.952, MAE: 1.41
R-Squared: 0.616, MAE: 2.95


##### Round Two

In [131]:
# candidate
foo.model_pytorch = Net(foo.cols, l1=40, l2=40, l3=20, l4=20, l5=5, l6=5, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  3.0773916244506836
Epoch 2000 loss:  1.939092755317688
Epoch 3000 loss:  1.6914304494857788
Epoch 4000 loss:  1.4642313718795776
Epoch 5000 loss:  1.327182650566101
Epoch 6000 loss:  1.3554364442825317
Epoch 7000 loss:  1.2726833820343018
Epoch 8000 loss:  1.2102246284484863
Epoch 9000 loss:  1.153406023979187
Epoch 10000 loss:  1.1837083101272583
R-Squared: 0.907, MAE: 1.57
R-Squared: 0.727, MAE: 2.81


In [136]:
foo.model_pytorch = Net(foo.cols, l1=40, l2=40, l3=20, l4=20, l5=5, l6=5, output=1)
foo.eval(batch_size=50, num_epochs=2000, learning_rate=0.0025, criterion=torch.nn.L1Loss())

Epoch 1000 loss:  11.214197635650635
Epoch 2000 loss:  5.816370368003845
R-Squared: 0.986, MAE: 0.73
R-Squared: 0.702, MAE: 2.95


In [132]:
foo.model_pytorch = Net(foo.cols, l1=40, l2=40, l3=20, l4=20, l5=5, l6=5, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.L1Loss())

Epoch 1000 loss:  3.4325881004333496
Epoch 2000 loss:  2.1185545921325684
Epoch 3000 loss:  2.0576887130737305
Epoch 4000 loss:  1.8347935676574707
Epoch 5000 loss:  1.881821870803833
Epoch 6000 loss:  1.7643747329711914
Epoch 7000 loss:  1.715798258781433
Epoch 8000 loss:  1.703466534614563
Epoch 9000 loss:  1.5681095123291016
Epoch 10000 loss:  1.8093498945236206
R-Squared: 0.903, MAE: 1.60
R-Squared: 0.701, MAE: 2.81


In [137]:
foo.model_pytorch = Net(foo.cols, l1=40, l2=40, l3=20, l4=20, l5=5, l6=5, output=1)
foo.eval(batch_size=50, num_epochs=2000, learning_rate=0.0025, criterion=torch.nn.L1Loss())

Epoch 1000 loss:  10.722804069519043
Epoch 2000 loss:  6.662779927253723
R-Squared: 0.973, MAE: 1.01
R-Squared: 0.676, MAE: 3.10


##### Round Three

In [139]:
foo.model_pytorch = Net(foo.cols, l1=80, l2=80, l3=10, l4=10, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  3.0664167404174805
Epoch 2000 loss:  2.0225892066955566
Epoch 3000 loss:  1.9428519010543823
Epoch 4000 loss:  1.618125081062317
Epoch 5000 loss:  1.5389405488967896
Epoch 6000 loss:  1.4338011741638184
Epoch 7000 loss:  1.4214279651641846
Epoch 8000 loss:  1.2357490062713623
Epoch 9000 loss:  1.3781185150146484
Epoch 10000 loss:  1.203445553779602
R-Squared: 0.892, MAE: 1.66
R-Squared: 0.673, MAE: 2.75


In [140]:
# candidate
foo.model_pytorch = Net(foo.cols, l1=128, l2=128, l3=10, l4=10, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  3.0862627029418945
Epoch 2000 loss:  1.7049287557601929
Epoch 3000 loss:  1.6207152605056763
Epoch 4000 loss:  1.2779059410095215
Epoch 5000 loss:  1.2047287225723267
Epoch 6000 loss:  1.1051175594329834
Epoch 7000 loss:  1.0643014907836914
Epoch 8000 loss:  1.0528960227966309
Epoch 9000 loss:  0.9085228443145752
Epoch 10000 loss:  0.8846355080604553
R-Squared: 0.915, MAE: 1.35
R-Squared: 0.719, MAE: 2.76


In [141]:
# overfitting
foo.model_pytorch = Net(foo.cols, l1=256, l2=256, l3=10, l4=10, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  2.173978090286255
Epoch 2000 loss:  1.5364869832992554
Epoch 3000 loss:  1.2361032962799072
Epoch 4000 loss:  1.07366943359375
Epoch 5000 loss:  1.2258808612823486
Epoch 6000 loss:  2.56320858001709
Epoch 7000 loss:  0.8220564126968384
Epoch 8000 loss:  0.6012448668479919
Epoch 9000 loss:  0.5728933811187744
Epoch 10000 loss:  0.5458008050918579
R-Squared: 0.975, MAE: 0.89
R-Squared: 0.701, MAE: 2.91


##### Round Four

In [146]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=1, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  2.1545987129211426
Epoch 2000 loss:  1.9172662496566772
Epoch 3000 loss:  1.6381511688232422
Epoch 4000 loss:  1.0400948524475098
Epoch 5000 loss:  1.1169161796569824
Epoch 6000 loss:  1.1672966480255127
Epoch 7000 loss:  1.0325645208358765
Epoch 8000 loss:  0.9722885489463806
Epoch 9000 loss:  0.8285640478134155
Epoch 10000 loss:  0.8253082633018494
R-Squared: 0.950, MAE: 1.20
R-Squared: 0.691, MAE: 2.78


In [147]:
# candidate - current best 
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  2.1702609062194824
Epoch 2000 loss:  1.7178232669830322
Epoch 3000 loss:  1.5234358310699463
Epoch 4000 loss:  1.3641221523284912
Epoch 5000 loss:  1.1590920686721802
Epoch 6000 loss:  1.0540438890457153
Epoch 7000 loss:  1.054677963256836
Epoch 8000 loss:  0.8930981159210205
Epoch 9000 loss:  0.9064716100692749
Epoch 10000 loss:  0.6914174556732178
R-Squared: 0.945, MAE: 1.11
R-Squared: 0.766, MAE: 2.78


In [148]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0003, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  3.3903427124023438
Epoch 2000 loss:  2.213636875152588
Epoch 3000 loss:  1.8323246240615845
Epoch 4000 loss:  1.6143088340759277
Epoch 5000 loss:  1.3897013664245605
Epoch 6000 loss:  1.3323924541473389
Epoch 7000 loss:  1.1653300523757935
Epoch 8000 loss:  1.1334720849990845
Epoch 9000 loss:  1.0416122674942017
Epoch 10000 loss:  1.1418401002883911
R-Squared: 0.925, MAE: 1.40
R-Squared: 0.711, MAE: 2.72


In [151]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0003, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  3.217740058898926
Epoch 2000 loss:  2.14133882522583
Epoch 3000 loss:  1.804438591003418
Epoch 4000 loss:  1.6042677164077759
Epoch 5000 loss:  1.4114569425582886
Epoch 6000 loss:  1.4309982061386108
Epoch 7000 loss:  1.301927089691162
Epoch 8000 loss:  1.1724416017532349
Epoch 9000 loss:  1.245043158531189
Epoch 10000 loss:  1.11747407913208
R-Squared: 0.911, MAE: 1.52
R-Squared: 0.701, MAE: 2.96


##### Round Five - final round

In [157]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=250, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  2.6856229305267334
Epoch 2000 loss:  1.8495086431503296
Epoch 3000 loss:  1.8609514236450195
Epoch 4000 loss:  1.566051959991455
Epoch 5000 loss:  1.6632754802703857
Epoch 6000 loss:  1.2772905826568604
Epoch 7000 loss:  1.1303715705871582
Epoch 8000 loss:  1.168574571609497
Epoch 9000 loss:  0.8516449928283691
Epoch 10000 loss:  0.8078053593635559
R-Squared: 0.949, MAE: 1.27
R-Squared: 0.719, MAE: 2.75


In [158]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=100, num_epochs=10000, learning_rate=0.0005, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  5.113327741622925
Epoch 2000 loss:  3.880610227584839
Epoch 3000 loss:  3.9647854566574097
Epoch 4000 loss:  3.136811673641205
Epoch 5000 loss:  3.622177839279175
Epoch 6000 loss:  2.1241248846054077
Epoch 7000 loss:  2.478312373161316
Epoch 8000 loss:  2.261405050754547
Epoch 9000 loss:  1.9742512106895447
Epoch 10000 loss:  1.709864854812622
R-Squared: 0.975, MAE: 0.87
R-Squared: 0.757, MAE: 2.63


In [163]:
foo.model_pytorch = Net(foo.cols, l1=128, l2=32, l3=64, l4=16, l5=4, l6=4, output=1)
foo.eval(batch_size=50, num_epochs=10000, learning_rate=0.0001, criterion=torch.nn.HuberLoss())

Epoch 1000 loss:  18.70051610469818
Epoch 2000 loss:  13.241290926933289
Epoch 3000 loss:  11.263829112052917
Epoch 4000 loss:  10.161400437355042
Epoch 5000 loss:  9.912397384643555
Epoch 6000 loss:  8.679338991641998
Epoch 7000 loss:  9.112246870994568
Epoch 8000 loss:  7.307150781154633
Epoch 9000 loss:  6.745783269405365
Epoch 10000 loss:  6.2382559180259705
R-Squared: 0.931, MAE: 1.25
R-Squared: 0.708, MAE: 2.75
