**Goal: Create a network with one hidden layer using vectorization as possible**

The setup for this network is naive. It uses sigmoid activation functions in the hidden layer. Improvements could likely be acheived by replacing the sigmoid with tanh or ReLU.

In [154]:
import numpy as np

In [155]:
def initialize(X, layer_size=2):
    '''Returns initial parameters for a neural network with the given number of nodes in the hidden layer.
    
        X = n_x by m array of training examples
        layer_size = number of nodes in hidden layer
        
    Returns four objects:
    
        w1 = n_x by layer_size array
        w2 = layer_size by 1 array
        b1 = 1 by layer_size array
        b2 = float'''
    
    n_x = X.shape[0]
    
    w1 = np.random.randn(n_x, layer_size)
    w2 = np.random.randn(layer_size, 1)
    b1 = np.random.randn(1, layer_size)
    b2 = np.random.randn()
    
    return w1, w2, b1, b2

In [156]:
def sigmoid(x):
    '''Sigmoid/logistic function.
    
        x = numpy array or float'''
    
    return 1/(1+np.exp(-x))

In [157]:
def logloss(y, yhat):
    '''Loss function for logistic regression.
    
        y = true (binary) class
        yhat = predicted class probability'''
    
    return -(y*np.log(yhat)+(1-y)*np.log(1-yhat))

In [158]:
def propogate(X, Y, w1, w2, b1, b2, layer_size=2):
    '''Implement forward and backward propogation for simple logistic regression.
    
        X = n_x by m numpy array containg training data where n_x is the number of features and m is the number of samples
        Y = 1 by m numpy array containing true sample classes
        w = numpy array of model parameters
        b = model bias array'''
    
    m = X.shape[1]
    one_vec = np.full((m,1), 1)
    
    # forward propogation
    # hidden layer
    A1 = sigmoid(np.dot(X.T, w1)+b1) # m by layer_size array
    
    # output layer
    A2 = sigmoid(np.dot(A1, w2)+b2) # m by 1 array
    #print('A1 ', A1.shape)
    #print('w2 ', w2.shape)
    #print(A2)
    
    # backward propogation
    dz1 = (1/m)*np.dot(w2, A2.T-Y)*(A1.T*(1-A1.T))
    dw1 = np.dot(dz1, X.T)
    dw2 = (1/m)*np.dot(A2.T-Y, A1)
    db1 = np.dot(dz1, one_vec)
    #db1 = (1/m)*np.dot(sum_vec, (A2-Y.T)*A1)
    db2 = (1/m)*np.sum(A2-Y)
    #print('db2 ', db2)
    
    gradient = dict({'dw1': dw1, 'dw2': dw2, 'db1': db1, 'db2': db2})
    
    return gradient

In [159]:
def fit(X, Y, w1, w2, b1, b2, layer_size=2, iterations=2000, learning_rate=0.01):
    '''Implement gradient descent fitting procedure.
    
        X = n_x by m array with samples in columns
        Y = m by 1 array of true binary class values
        w = array of model parameters
        b = model bias array'''
    
    n_x = X.shape[0]    # number of features
    m = X.shape[1]    # number of training samples
    
    # gradient descent loop
    for i in range(iterations):
        gradient = propogate(X, Y, w1, w2, b1, b2, layer_size=layer_size)
        dw1 = gradient['dw1']
        dw2 = gradient['dw2']
        db1 = gradient['db1']
        db2 = gradient['db2']
        
        w1 = w1 - learning_rate*dw1.T
        w2 = w2 - learning_rate*dw2.T
        b1 = b1 - learning_rate*db1.T
        b2 = b2 - learning_rate*db2
    
    parameters = dict({'w1': w1, 'w2': w2, 'b1': b1, 'b2': b2})
    return parameters

In [160]:
def class_val(x):
    '''Find the most likely class given the input probability x.'''
    if x <= 0.5:
        return 0
    else:
        return 1

# use numpy vectorization to apply class_val to an array
vclass_val = np.vectorize(class_val)

In [161]:
def predict(X, w1, w2, b1, b2):
    '''Predict class using logistic regression model.
    
        X = array with n_x rows, samples in columns'''
    
    # hidden layer
    A1 = sigmoid(np.dot(X.T, w1)+b1)
    #print(w2)
    
    # output layer
    out = sigmoid(np.dot(A1, w2)+b2)
    
    return vclass_val(out)

In [162]:
def LogModel(X_train, Y_train, X_test, Y_test, layer_size=2, iterations=2000, learning_rate=0.01):
    '''Build logistic regression model using training data and test on provided
    testing data.
    
        X_train = n_x by m array with feature vectors in columns
        Y_train = m by 1 array of true binary class for each training sample
        X_test = n_x by any number array with feature vectors in columns
        Y_test = column vector of true binary class for each test sample
        iterations = positive integer, number of iterations for gradient descent
        learning_rate = positive float, learning rate used for gradient descent
        
    Prints percentage of correct predictions on testing data using the model fit by gradient descent.
    Returns model parameters.'''
    
    w1, w2, b1, b2 = initialize(X_train, layer_size=layer_size)
    #print(w1)
    
    parameters = fit(X_train, Y_train, w1, w2, b1, b2, layer_size=layer_size, iterations=iterations, learning_rate=learning_rate)
    w1 = parameters['w1']
    w2 = parameters['w2']
    b1 = parameters['b1']
    b2 = parameters['b2']
    
    predictions = predict(X_test, w1, w2, b1, b2)
    
    #print(predictions)
    
    model_accuracy = 100-np.average(np.abs(Y_test-predictions.T))*100
    
    print('Model accuracy: {:.4f}%'.format(model_accuracy))
    
    return parameters

## Training an OR neural network

We overfit a neural network on a small problem. There are only four possible inputs for an OR statement, so we create a hidden node for each one.

In [163]:
X = np.array([[1,1,0,0],[1,0,1,0]])
Y = np.array([[1,1,1,0]])

In [164]:
w1, w2, b1, b2 = initialize(X, layer_size=4)

In [165]:
predict(X, w1, w2, b1, b2)

array([[1],
       [1],
       [1],
       [1]])

In [166]:
propogate(X, Y, w1, w2, b1, b2)

{'dw1': array([[ 0.00488399,  0.00421978],
        [-0.02993307, -0.02312541],
        [ 0.02339517,  0.01780057],
        [-0.02207128, -0.02360303]]),
 'dw2': array([[-0.13448924, -0.01580992, -0.14273217,  0.01480164]]),
 'db1': array([[ 0.00127042],
        [-0.01063261],
        [ 0.00202261],
        [ 0.00616183]]),
 'db2': -0.5559064734264723}

In [167]:
fit(X, Y, w1, w2, b1, b2)

{'w1': array([[ 0.08480012,  0.75913017, -0.09740079, -0.59373514],
        [ 0.87118126, -0.79815698,  1.09493664, -0.77825849]]),
 'w2': array([[ 0.2016502 ],
        [ 0.65945088],
        [-0.36467286],
        [ 0.69482114]]),
 'b1': array([[ 1.11449646, -0.9474521 ,  1.01380003, -1.25132173]]),
 'b2': 0.9738918458890288}

In [168]:
parameters = fit(X, Y, w1, w2, b1, b2, learning_rate=0.1)

w1 = parameters['w1']
w2 = parameters['w2']
b1 = parameters['b1']
b2 = parameters['b2']

predict(X, w1, w2, b1, b2)

array([[1],
       [1],
       [1],
       [0]])

In [169]:
LogModel(X, Y, X, Y, layer_size=4, learning_rate=0.1, iterations=2000)

Model accuracy: 100.0000%


{'w1': array([[-3.36671083,  0.45851469, -1.83745462,  2.48526772],
        [-3.46525019,  0.28770223, -0.69109538,  2.55158644]]),
 'w2': array([[-6.08728712],
        [-0.01699202],
        [-1.76107814],
        [ 4.58171016]]),
 'b1': array([[ 1.56480151, -2.20350072, -0.22664747, -1.02659914]]),
 'b2': 1.4013824428380999}

Unlike the large dataset, the neural network performs well on this problem.

## Train an AND network with different numbers of hidden nodes

In [170]:
X = np.array([[1,1,0,0],[1,0,1,0]])
Y = np.array([[1,0,0,0]])

In [171]:
LogModel(X, Y, X, Y, layer_size=4, learning_rate=0.1, iterations=2000)

Model accuracy: 100.0000%


{'w1': array([[ 1.98194352, -0.12378465,  0.50326089, -2.6868536 ],
        [ 1.00824945, -1.9558207 ,  0.46871953, -2.70933873]]),
 'w2': array([[ 3.41530518],
        [-2.02150548],
        [ 0.80051604],
        [-6.22452884]]),
 'b1': array([[-1.47613161,  0.38762502,  0.0556036 ,  3.68536558]]),
 'b2': 0.06367765912859673}

In [172]:
LogModel(X, Y, X, Y, layer_size=3, learning_rate=0.1, iterations=2000)

Model accuracy: 100.0000%


{'w1': array([[ 4.23705331, -1.08432874, -1.54101761],
        [-0.27528058, -2.08697021, -3.15791841]]),
 'w2': array([[ 4.16664998],
        [-3.2545004 ],
        [-5.09427344]]),
 'b1': array([[-1.13777686,  1.68157315,  2.93043352]]),
 'b2': -0.4855775069815068}

In [173]:
LogModel(X, Y, X, Y, layer_size=2, learning_rate=0.1, iterations=2000)

Model accuracy: 100.0000%


{'w1': array([[ 3.08380884,  2.71167838],
        [ 3.47127936, -0.84037907]]),
 'w2': array([[7.71495827],
        [1.2629936 ]]),
 'b1': array([[-4.64059546,  0.01523817]]),
 'b2': -5.467249181643752}

In [174]:
LogModel(X, Y, X, Y, layer_size=1, learning_rate=0.1, iterations=2000)

Model accuracy: 100.0000%


{'w1': array([[-3.09408517],
        [-3.0967422 ]]),
 'w2': array([[-7.14181602]]),
 'b1': array([[4.2203554]]),
 'b2': 2.648310522930351}

The network with 1 hidden layer does well on this problem for any number of hidden nodes.

## Train a NN on a large dataset

Note: The data in this problem comes from an interview problem where the meaning of the variables and the output is unknown.

In [175]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [176]:
train_location = '/Users/connorodell/Documents/Data_Science/learning/exercise_03_train.csv'

train_df = pd.read_csv(train_location)

In [177]:
# Clean data. Details can be found in other notebook
train_df = train_df.dropna()

# Correct days so that all are spelled out
train_df['x35'] = train_df['x35'].map(lambda x: 'wednesday' if x=='wed' else 'thursday' if (x=='thur' or x=='thurday') else 'friday' if x=='fri' else x)

# Correct sept. to Sept and Dev to Dec in column x68
train_df['x68'] = train_df['x68'].map(lambda x: 'Jan' if x=='January' else 'Sept' if x=='sept.' else 'Dec' if x=='Dev' else x)

# Transform columns x34, x35, x68, and x93 to dummy variables
train_df = pd.get_dummies(train_df, columns=['x34', 'x35', 'x68', 'x93'])

# Transform columns x41 and x45 to floats
train_df['x41'] = train_df['x41'].map(lambda x: x.lstrip('$'))
train_df['x41'] = pd.to_numeric(train_df['x41'])

train_df['x45'] = train_df['x45'].map(lambda x: x.rstrip('%'))
train_df['x45'] = pd.to_numeric(train_df['x45'])

In [178]:
# split data into train/test sets
X_train, X_test, Y_train, Y_test = train_test_split(train_df.drop('y', axis=1), train_df['y'], test_size=0.2, random_state=42)

# scale data using the standard scaler in sklearn
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Get numpy columns for Y
Y_train = Y_train.values.reshape(len(Y_train),1)
Y_test = Y_test.values.reshape(len(Y_test),1)

In [179]:
# run new model
# note that samples are in rows of train_df, so the transposes are fed into the models
params = LogModel(X_train.T, Y_train.T, X_test.T, Y_test.T, layer_size=3, learning_rate=0.1, iterations=10)

  


Model accuracy: 79.6837%


Notes:

* The current neural network framework with the sigmoid activation functions does not perform well on the large train_df dataset. It may improve with more iterations, but the time required for each iteration with the current framework is large.
* It will be interesting to see how improved versions of a neural network with tanh or ReLU activation functions perform.
* The amount of time required to train the network is large, probably due to multiplication of large matrices.
* We get an overflow warning from the np.exp function when the number of nodes in the hidden layer is increased.

In [180]:
params = LogModel(X_train.T, Y_train.T, X_test.T, Y_test.T, layer_size=60, learning_rate=0.1, iterations=10)

  


Model accuracy: 79.6837%


Note: Increasing the number of nodes in the hidden layer doesn't seem to have any effect on accuracy but greatly increases the time required to train the network.