In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

X = adult.data.features
y = adult.data.targets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:

def prep_data(X_train, X_test, y_train, y_test):
    # Combine train and test set to ensure same dummy variables
    X_combined = pd.concat([X_train, X_test])
    
    # Apply get_dummies to the combined dataset
    X_combined = pd.get_dummies(X_combined, drop_first=True)
    
    # Split the combined dataset back into train and test sets
    X_train = X_combined.iloc[:len(X_train)]
    X_test = X_combined.iloc[len(X_train):]
    
    # Z-score normalization
    X_train = (X_train - X_train.mean()) / X_train.std()
    X_test = (X_test - X_test.mean()) / X_test.std()

    # Handle potential NaN values resulting from normalization
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    # Convert categorical target variable y to binary (0/1)
    y_train = (y_train == '>50K').astype(int)
    y_test = (y_test == '>50K').astype(int)

    # Convert data to np arrays
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    return X_train, X_test, y_train, y_test



In [3]:
X_train, X_test, y_train, y_test = prep_data(X_train, X_test, y_train, y_test)

In [11]:
# how many of y_train are classified as 1
adults_over_50k = sum(y_train)
# how many of y_Train are classified as 0 
adults_under_50k = len(y_train) - adults_over_50k

print(f"Number of adults making over 50k: {adults_over_50k}")
print(f"Number of adults making under 50k: {adults_under_50k}")

Number of adults making over 50k: [6279]
Number of adults making under 50k: [32794]


In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39073, 100), (9769, 100), (39073, 1), (9769, 1))

In [14]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(x <= 0, 0, 1)

def softmax(x):
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

def softmax_prime(x):
    return softmax(x) * (1 - softmax(x))

class Layer():
    def __init__(self, n_inputs, n_neurons, activation_function=None, activation_prime=None):
        self.weights = np.random.randn(n_inputs, n_neurons) # Initialize weights randomly from -.5 to .5
        self.biases = np.zeros((1, n_neurons)) # Initialize biases to 0
        self.activation_function = activation_function
        self.activation_prime = activation_prime 
    
    def forward(self, inputs):
        """
        Calculates the layer's output for a given input.
        :param inputs: The input to the layer.
        :return: The activated output of the layer.
        """
        self.inputs = inputs
        self.z = np.dot(inputs, self.weights) + self.biases
        self.output = (
            self.z
            if self.activation_function is None
            else self.activation_function(self.z)
        )
        return self.output
        
    def backward(self, dL_dA):
        """
        Calculates the gradient of the loss with respect to weights, biases, and the previous layer's activations.
        :param dL_dA: The gradient of the loss with respect to the layer's output.
        :return dL_dA_prev: The gradient of the loss with respect to the previous layer's activations.
        """
        if self.activation_function is None:
            self.dL_dz = dL_dA
        else:
            self.dL_dz = dL_dA * self.activation_prime(self.z)
        # Remember, z = w * a + b, so dz/dw = a
        self.dL_dW = np.dot(self.inputs.T, self.dL_dz) # dL/dw = dL/da * da/dz * dz/dw
        self.dL_dB = np.sum(self.dL_dz, axis=0, keepdims=True) # dL/db = dL/da * da/dz * dz/db
        
        self.dL_dA_prev = np.dot(self.dL_dz, self.weights.T) # dL/dz * dz/dA[L-1]
        
        return self.dL_dA_prev
    
    def update(self, learning_rate):
        """
        Updates the weights and biases of the layer by moving in the opposite direction of the stored gradients.
        :param learning_rate: The size of the step to take.
        """
        self.weights -= learning_rate * self.dL_dW
        self.biases -= learning_rate * self.dL_dB
        
    def clip_gradients(self, max_norm):
        """
        Clips the gradients to prevent exploding gradients using L2 norm clipping.
        :param max_norm: The maximum allowable norm for the gradients.
        """
        total_norm = np.linalg.norm(self.dL_dW) # Calculate the L2 norm of the gradients
        if total_norm > max_norm:
            self.dL_dW = self.dL_dW * (max_norm / total_norm) # Rescale the gradients



In [8]:
layer1 = Layer(X_train.shape[1], 64, relu, relu_prime)
layer2 = Layer(64, 32, relu, relu_prime)
layer3 = Layer(32, 1, sigmoid, sigmoid_prime)
num_epochs = 100
batch_size = 64
epsilon = 1e-15
l2_lambda = 0.01
initial_learning_rate = 0.01

for epoch in range(num_epochs):
    total_loss = 0
    learning_rate = initial_learning_rate * (1 / (1 + 0.01 * epoch))  # Learning rate decay

    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        # Forward pass
        A1 = layer1.forward(X_batch)
        A2 = layer2.forward(A1)
        A3 = layer3.forward(A2)
        
        # Binary cross-entropy loss
        A3 = np.clip(A3, epsilon, 1 - epsilon) # Ensure that A3 is > 0 and < 1 but not exactly 0 or 1
        loss = -np.mean(y_batch * np.log(A3) + (1 - y_batch) * np.log(1 - A3)) # Calculate loss for tracking
        total_loss += loss
        
        dL_dA3 = -(y_batch / A3) + (1 - y_batch) / (1 - A3) 
        
        # Backward pass
        dL_dA2 = layer3.backward(dL_dA3)
        dL_dA1 = layer2.backward(dL_dA2)
        dL_dA0 = layer1.backward(dL_dA1)
        
        # Gradient clipping
        for layer in [layer1, layer2, layer3]:
            layer.clip_gradients(5.0) # Range of the gradients that we are allowing
            layer.update(learning_rate) 
            layer.weights -= learning_rate * l2_lambda * layer.weights  # L2 regularization
    
    
    if epoch % 1 == 0:
        print(f"Epoch: {epoch}, Loss: {total_loss / (len(X_train) / batch_size)}")

  return 1 / (1 + np.exp(-x))


Epoch: 0, Loss: 5.678217000074567
Epoch: 1, Loss: 1.5586221620675547
Epoch: 2, Loss: 0.47156914718768017
Epoch: 3, Loss: 0.3982361344784691
Epoch: 4, Loss: 0.37312888737075406
Epoch: 5, Loss: 0.3585898627314312
Epoch: 6, Loss: 0.34221082344148146
Epoch: 7, Loss: 0.3317781559526788
Epoch: 8, Loss: 0.32625520797964036
Epoch: 9, Loss: 0.32213068325532757
Epoch: 10, Loss: 0.3184936686820696
Epoch: 11, Loss: 0.31531577031425
Epoch: 12, Loss: 0.3125699771062825
Epoch: 13, Loss: 0.3102219053066221
Epoch: 14, Loss: 0.3078886513885265
Epoch: 15, Loss: 0.30622102830618414
Epoch: 16, Loss: 0.30426073582177293
Epoch: 17, Loss: 0.3026634291600917
Epoch: 18, Loss: 0.3012468732021221
Epoch: 19, Loss: 0.3002182275538207
Epoch: 20, Loss: 0.29947395703003576
Epoch: 21, Loss: 0.2984477045580124
Epoch: 22, Loss: 0.2972821025609699
Epoch: 23, Loss: 0.29654885591768676
Epoch: 24, Loss: 0.29570166342497
Epoch: 25, Loss: 0.29525345915611473
Epoch: 26, Loss: 0.29500784669629543
Epoch: 27, Loss: 0.2944404287762

In [9]:
# Forward pass on test set
A1 = layer1.forward(X_test)
A2 = layer2.forward(A1)
A3 = layer3.forward(A2)

# Convert probabilities to binary predictions
predictions = (A3 > 0.5).astype(int)

# Calculate accuracy
accuracy = np.mean(predictions == y_test)
accuracy

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


income    0.839595
dtype: float64