In [321]:
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# EDA

First, we read the data and choose the first column as index.

In [322]:
df = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/AER/BankWages.csv', index_col=0)
df.head()

Unnamed: 0,job,education,gender,minority
1,manage,15,male,no
2,admin,16,male,no
3,admin,12,female,no
4,admin,8,female,no
5,admin,15,male,no


It is useful to know some properties of data so we will use `info()` function on the dataframe.

In [323]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 1 to 474
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job        474 non-null    object
 1   education  474 non-null    int64 
 2   gender     474 non-null    object
 3   minority   474 non-null    object
dtypes: int64(1), object(3)
memory usage: 18.5+ KB


Let's look for any missing value:

In [324]:
sum(df.isnull().values)

array([0, 0, 0, 0])

As we can see, there is no missing values and everything seems fine!

Next, we will going to look at some statistical information using `describe()` function:

In [325]:
df.describe(include = 'number').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
education,474.0,13.491561,2.884846,8.0,12.0,12.0,15.0,21.0


In [326]:
df.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
job,474,3,admin,363
gender,474,2,male,258
minority,474,2,no,370


What are possible values for the categorical features?

In [327]:
print(df['job'].unique())
print(df['gender'].unique())
print(df['minority'].unique())

['manage' 'admin' 'custodial']
['male' 'female']
['no' 'yes']


## Preprocessing

In [328]:
# Choose target and make encode it manually
y = df[['minority']].copy()
y = y.replace({'yes':1, 'no':0})

# Choose features
X = df.drop(['minority'], axis=1)

# Split data into test and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print('Shape of X_train:', X_train.shape)
print('Shape of X_valid:', X_valid.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_valid:', y_valid.shape)

Shape of X_train: (379, 3)
Shape of X_valid: (95, 3)
Shape of y_train: (379, 1)
Shape of y_valid: (95, 1)


In [329]:
# Select categorical features
object_cols = X.select_dtypes(include='object').columns.to_list()
# Select numerical features
num_cols = X.select_dtypes(include='number').columns.to_list()

# Normalize numerical data
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])

# Encode categorical data
encoder = OrdinalEncoder()
X_train[object_cols] = encoder.fit_transform(X_train[object_cols])
X_valid[object_cols] = encoder.fit_transform(X_valid[object_cols])

X_train.head()

Unnamed: 0,job,education,gender
156,0.0,0.478244,1.0
454,2.0,1.835955,1.0
23,0.0,0.478244,0.0
311,0.0,-0.54004,0.0
47,0.0,-0.54004,0.0


## Model Implementation

Next thing to do is implementation of four classes:
1. `NeuralNetwork`: It basically hold everything related to the network.
> + `add_layer()`: Gets a `Layer` object and add it to layers of the network
> + `fit()`: Gets X, y, epochs and use sequential method to update weights
> + `predict()`: Gets X and predict y based on the current weights of the network
> + `__feed_forward__()`: Calculates each layer output
> + `__backpropagation__()`: Calculates local gradients and update weights

2. `Layer`: Each layer of a network should be a object of this class.
> + `untis`: Number of untis of layers
> + `activation`: activation function of the layer (should be `linear`/`sigmoid`)
> + `use_bias`: Wheater the layer use bias unit or not (should be `True`/`False`)
> + `get_activation()`: returns activation function of the layer
> + `get_derivation()`: returns derivation function of the layer

3. `InputLayer`: Child of `Layer` class and specifically is used for input layer
> Activation of this layer should be linear.

4. `OutputLayer`: Child of `Layer` class and specfically is used for output layer
> This layer should not use bias unit.



In [386]:
class NeuralNetwork:
    def __init__(
        self,
        learning_rate,
    ):
        self.layers = []
        self.weights = []
        self.learning_rate = learning_rate

    def add_layer(self, layer):
        self.layers.append(layer)
        if len(self.layers) > 1:
            # Find two last layers
            last_layer = self.layers[-1]
            second_last_layer = self.layers[-2]
            # Rows of weights matrix
            rows = last_layer.units
            # Columns of weights matrix
            columns = second_last_layer.units
            if (second_last_layer.use_bias):
                columns += 1
            # Randomly initialize weights
            layer_weights = np.random.rand(rows, columns)
            self.weights.append(layer_weights)

    def fit(self, X, y, epochs=1):
        if len(X) != len(y):
            raise ValueError("Inputs are not compatible.")
        # Iterate over epochs
        for epoch in range(epochs):
            msg = 'Start of epoch #{}...'.format(epoch)
            print(msg)
            # Iterate over samples (Sequential Mode)
            for i in range(len(X)):
                self.__feed_forward__(X, i)
            sys.stdout.write((b'\x08' * (len(msg)+1)).decode())


    def predict(self, X):
        predictions = []
        # Iterate over samples
        for i in range(len(X)):
            # Calculate output
            self.__feed_forward__(X, i)
            predictions.append(self.a[-1])
        return np.array(predictions)

    def __feed_forward__(self, X, i):
        # Initialize activated values of each layer
        self.a = [_ for i in range(len(self.layers))]
        for j in range(len(self.layers)):
            units = self.layers[j].units
            self.a[j] = np.ones((self.layers[j].units, 1))
            # Add bias unit if needed
            if (self.layers[j].use_bias):
                self.a[j] = np.append(1,self.a[j])

        # Initialize induced local fields of each layer
        self.v = [_ for i in range(len(self.layers))]
        for j in range(len(self.layers)):
            self.v[j] = np.ones((self.layers[j].units, 1))


        # Iterate over layers until the last one
        for j in range(len(self.layers) - 1):
            # Check for first layer
            if (j == 0):
                if (self.layers[j].use_bias):
                    self.a[j][1:] = X.iloc[i, :].to_numpy()
                else:
                    self.a[j] = X.iloc[i, :].to_numpy()

            # Calculate induced local fields
            self.v[j + 1] = self.weights[j] @ self.a[j]
            # Get activation function
            activation_func = np.vectorize(self.layers[j].get_activation())
            # Calculate activated values
            if (self.layers[j+1].use_bias):   
                self.a[j + 1][1:] = activation_func(self.v[j + 1])
            else:
                self.a[j + 1] = activation_func(self.v[j + 1])
        


    def __backpropagation__(self, X, y, i):
        # Initialize local gradients like induced local fields
        self.grad = [np.zeros_like(v_i) for v_i in self.v]
        
        # Iterate from last layer to the second one
        for j in range(len(self.layers)-1,0, -1):
            # Last layer
            if (j == len(self.layers)-1):
                # Calculate error
                self.grad[j] = np.array(y.iloc[i,:] - self.a[-1])
            else:
                if self.layers[j].use_bias:
                    # Discard local gradient of bias unit
                    self.grad[j] = (self.weights[j].T @ self.grad[j+1])[1:]
                else:
                    self.grad[j] = self.weights[j].T @ self.grad[j+1]

            derivation_func = np.vectorize(self.layers[j].get_derivation())
            phi = derivation_func(self.v[j])
            self.grad[j] = self.grad[j] * phi
            
            # Calculate delta_w based on delta rule
            delta_w = self.learning_rate * (self.grad[j].reshape(-1,1) @ self.a[j-1].reshape(1, -1))

            # Update weights
            self.weights[j-1] += delta_w
            


class Layer:
    def __init__(self, units, activation, use_bias=True):
        self.units = units
        self.use_bias = use_bias
        self.activation = activation

    def get_activation(self):
        if self.activation.lower() == "linear":
            return lambda x: x
        elif self.activation.lower() == "sigmoid":
            return self.sigmoid

    def sigmoid(self, x):
        x = np.array(x).clip(-50, 50)
        return 1 / (1 + np.exp(-x))

    def get_derivation(self):
        if self.activation.lower() == "linear":
            return lambda x: 1
        elif self.activation.lower() == "sigmoid":
            return lambda x: self.sigmoid(x) * (1 - self.sigmoid(x))

class InputLayer(Layer):
    def __init__(self, units, use_bias, activation='linear'):
        Layer.__init__(self, units, activation, use_bias)

class OutputLayer(Layer):
    def __init__(self, units, activation='sigmoid'):
        Layer.__init__(self, units, activation, False)

## Model Training

So we will create a network using above classes

In [441]:
model = NeuralNetwork(learning_rate=0.003)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(2, 'linear', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.7578947368421053


## Model Examination

Using sigmoid activation funciton in hidden layer insted of sigmoid

In [434]:
model = NeuralNetwork(learning_rate=0.003)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(3, 'sigmoid', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.4


The accuracy for this particular problem is worse using sigmoid activation function of hidden layer. We can not generalize this result to all problems. For some problems, sigmoid or any non-linear funciton such as relu, leaky rel, elu, or softmax could perform better than linear.


Change in the number of hidden layers:

In [436]:
model = NeuralNetwork(learning_rate=0.003)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(7, 'sigmoid', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.6


Generally, we expect the network to do better on training set when it has more neurons, but as we can see, in this example it seems like the model has high bias and closer to overfit. Therefore its accuracy on validaiton set has been reduced.

Change in the hidden layer units:

In [438]:
model = NeuralNetwork(learning_rate=0.003)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(4, 'sigmoid', use_bias=False))
model.add_layer(Layer(4, 'sigmoid', use_bias=False))
model.add_layer(Layer(4, 'sigmoid', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.7578947368421053


The accuracy remains slighly the same in this case. Generally, we could consider that networks with more layers be more accurate on training set.
It is possible that because or dataset is not large, it performed like previous models.

Change in learning rate:

In [444]:
model = NeuralNetwork(learning_rate=0.00001)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(2, 'linear', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.6421052631578947


In [445]:
model = NeuralNetwork(learning_rate=0.3)
model.add_layer(InputLayer(3, use_bias=False))
model.add_layer(Layer(2, 'linear', use_bias=False))
model.add_layer(OutputLayer(1))

model.fit(X_train.iloc[:,:], y_train.iloc[:,:], epochs=10)

preds = model.predict(X_valid.iloc[:,:])
categorical_preds = np.where(preds < 0.5, 0, 1)
accuracy = (categorical_preds == y_valid)['minority'].sum()/y_valid.size

print('Accuracy: {}'.format(accuracy))

Start of epoch #0...
Start of epoch #1...
Start of epoch #2...
Start of epoch #3...
Start of epoch #4...
Start of epoch #5...
Start of epoch #6...
Start of epoch #7...
Start of epoch #8...
Start of epoch #9...
Accuracy: 0.6


From above cells, we can conclude that when learning rate is very small, the network can not find the optimum point and it will be stopped in its way.

On the other hand, when learning rate is too large, it tends to walk slightly random and diverges. So it missed the optimum point and has lower accuracy.