In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

c:\Users\zhufe\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\zhufe\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
c:\Users\zhufe\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate = 0.01, momentum = 0.9, dropout_rate = 0.05, lambda_l2 = 0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.dropout_rate = dropout_rate
        self.lambda_l2 = lambda_l2
        
        self.weights1 = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(2 / (self.input_size+ self.hidden_size))
        self.biases1 = np.random.randn(self.hidden_size, 1)
        self.weights2 = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(2 / (self.hidden_size+ self.output_size))
        self.biases2 = np.random.randn(self.output_size, 1)

        self.prev_weights1_update = np.zeros_like(self.weights1)
        self.prev_biases1_update = np.zeros_like(self.biases1)
        self.prev_weights2_update = np.zeros_like(self.weights2)
        self.prev_biases2_update = np.zeros_like(self.biases2)

        self.prev_error = np.inf
        
    def forward(self, X, training=True):
        self.z1 = self.weights1 @ X.T + self.biases1
        self.a1 = self.sigmoid(self.z1)
        if training:
            self.dropout_mask1 = np.random.rand(*self.a1.shape) > self.dropout_rate
            self.a1 *= self.dropout_mask1
        self.z2 = self.weights2 @ self.a1 + self.biases2
        self.a2 = self.sigmoid(self.z2)
        return self.a2
    
    def sigmoid(self, s):
        return 1/(1+np.exp(-s))

    def sigmoid_derivative(self, s):
        return s * (1 - s)

    def mse_derivative(self, y_true, y_pred):
        return -2*(y_true - y_pred)

    def backward(self, X, y):
        m = X.shape[0]
        y = y.reshape(-1, 1)
    
        self.error_delta = self.mse_derivative(y.T, self.a2)
        self.z2_delta = self.sigmoid_derivative(self.a2) * self.error_delta
        self.z1_delta = self.sigmoid_derivative(self.a1) * (self.weights2.T @ self.z2_delta)

        # Apply dropout mask during backpropagation
        self.z1_delta *= self.dropout_mask1

        self.weights2_delta = self.z2_delta @ self.a1.T / m
        self.biases2_delta = np.sum(self.z2_delta, axis=1, keepdims=True) / m

        self.weights1_delta = self.z1_delta @ X / m
        self.biases1_delta = np.sum(self.z1_delta, axis=1, keepdims=True) / m

    def mse(self, y_true, y_pred):
        return np.mean((y_true - y_pred)**2)

    def print_mse(self, X, y):
        print(f"MSE: {self.mse(y.T, self.forward(X, training=False))}")

    def update_learning_rate(self, current_error):
        if current_error < self.prev_error:
            self.learning_rate *= 1.1
        else:
            self.learning_rate *= 0.5
        self.prev_error = current_error

    def update(self, y):
        current_error = self.mse(self.a2, y)
        self.update_learning_rate(current_error)

        weights1_update = self.learning_rate * (self.weights1_delta + self.lambda_l2 * self.weights1) + self.momentum * self.prev_weights1_update
        biases1_update = self.learning_rate * self.biases1_delta + self.momentum * self.prev_biases1_update
        weights2_update = self.learning_rate * (self.weights2_delta + self.lambda_l2 * self.weights2) + self.momentum * self.prev_weights2_update
        biases2_update = self.learning_rate * self.biases2_delta + self.momentum * self.prev_biases2_update

        self.weights1 -= weights1_update
        self.biases1 -= biases1_update
        self.weights2 -= weights2_update
        self.biases2 -= biases2_update

        self.prev_weights1_update = weights1_update
        self.prev_biases1_update = biases1_update
        self.prev_weights2_update = weights2_update
        self.prev_biases2_update = biases2_update
   
    def batch_gradient_descent(self, X, y):
        self.forward(X)
        self.backward(X, y)
        self.update(y)
        self.print_mse(X, y)

    def mini_batch_gradient_descent(self, X, y, batch_size):
        num_samples = X.shape[0]
        num_batches = int(np.ceil(num_samples / batch_size))
        indices = np.random.permutation(num_samples)
        X = X[indices]
        y = y[indices]
        
        for i in range(num_batches):
            start = i * batch_size
            end = min((i + 1) * batch_size, num_samples)
            
            X_batch = X[start:end]
            y_batch = y[start:end]
            
            y_pred = self.forward(X_batch)
            self.backward(X_batch, y_batch)
            self.update(y_batch)

        self.print_mse(X, y)

    def stochastic_gradient_descent(self, X,y):
        self.mini_batch_gradient_descent(X, y, 1)


In [3]:
X = datasets.load_boston()['data']
y = datasets.load_boston()['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
self = NeuralNetwork(input_size = X_train.shape[1], hidden_size = 50, output_size = 1, learning_rate = 0.001)
for _ in range(100):
    self.batch_gradient_descent(X_train, y_train)

print("Test Results")
self.print_mse(X_test, y_test)

MSE: 576.0711191770384
MSE: 573.1106425736015
MSE: 569.6802690008059
MSE: 566.6984524344721
MSE: 564.6423891515038
MSE: 563.4276800526508
MSE: 562.7661061474814
MSE: 562.4140508710731
MSE: 562.2252139177251
MSE: 562.121557407253
MSE: 562.0627788884988
MSE: 562.0282559884657
MSE: 562.0072453145442
MSE: 561.9940183437078
MSE: 561.9854195563944
MSE: 561.9796614743186
MSE: 561.9756963868073
MSE: 561.9728958636574
MSE: 561.9708721383365
MSE: 561.9693786864989
MSE: 561.9682553022717
MSE: 561.9673954145884
MSE: 561.9667271940069
MSE: 561.9662003605976
MSE: 561.9657797595901
MSE: 561.9654403573894
MSE: 561.9651637642285
MSE: 561.9649365147477
MSE: 561.964748467636
MSE: 561.9645918712611
MSE: 561.9644607977184
MSE: 561.9643506808811
MSE: 561.9642580054887
MSE: 561.9641800833709
MSE: 561.9641147428007
MSE: 561.9640602545662
MSE: 561.9640152538127
MSE: 561.9639786894508
MSE: 561.9639478647601
MSE: 561.9639221782438
MSE: 561.9639011392322
MSE: 561.9638832955775
MSE: 561.9638683884571
MSE: 561.9638

In [5]:
self = NeuralNetwork(input_size = X_train.shape[1], hidden_size = 50, output_size = 1, learning_rate = 0.001)
for _ in range(10):
    self.mini_batch_gradient_descent(X_train, y_train, batch_size = 32)

print("Test Results")
self.print_mse(X_test, y_test)

MSE: 584.5009085624326
MSE: 574.9632273575268
MSE: 572.8346051942736
MSE: 572.3292400385624
MSE: 572.203346520549
MSE: 572.1715165906535
MSE: 572.163436922428
MSE: 572.1613838955531
MSE: 572.1608620899655
MSE: 572.1607294569781
Test Results
MSE: 502.8275707505013


In [6]:
self = NeuralNetwork(input_size = X_train.shape[1], hidden_size = 50, output_size = 1, learning_rate = 0.001)
for _ in range(10):
    self.stochastic_gradient_descent(X_train, y_train)
    
print("Test Results")
self.print_mse(X_test, y_test)

MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486769
MSE: 562.5574240486768
MSE: 562.5574240486769
Test Results
MSE: 493.67076173894776
