In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Sonar data file hosted on the UCI archive (archive.ics.uci.edu).
url="https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
# header=None: do not treat the first row as column names; columns are numbered 0, 1, 2, ...
df = pd.read_csv(url,header=None)

In [None]:
# Split the frame by position: the first 60 columns are the features,
# the column at position 60 holds the class label.
X = df.iloc[:, :60].values
y = df.iloc[:, 60].values
y

<h2>Converting the y values into numbers</h2>
<li>In our regression example, we used 0 for rocks and 1 for mines
<li>sklearn has a LabelEncoder that will replace text with numbered labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the text labels in y with integer codes (fit and transform in one step).
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y


<li>That's not optimal because:
<ol>
<li>0 < 1 but rocks are not less than mines or mines less than rocks
<li>The average of 0 and 1 is meaningless

<h2>One-hot encoding</h2>
<li>comes from digital electronics
<li>only binary values are allowed
<li>all values are 0 except for one 1 (the "one hot" part)

<h2>Example</h2>
Consider the following data fragment:
<table>
<tr><td>Name</td><td>Income</td><td>City</td></tr>
<tr><td>John</td><td>120000</td><td>New York</td></tr>
<tr><td>Jill</td><td>20000</td><td>Chicago</td></tr>
<tr><td>Jae</td><td>75000</td><td>New York</td></tr>
<tr><td>Jane</td><td>179000</td><td>Los Angeles</td></tr>
<tr><td>Jake</td><td>10000</td><td>Los Angeles</td></tr>
</table>
<li>We want to encode City. One option:
<table>
<tr><td>Name</td><td>Income</td><td>City</td></tr>
<tr><td>John</td><td>120000</td><td>1</td></tr>
<tr><td>Jill</td><td>20000</td><td>2</td></tr>
<tr><td>Jae</td><td>75000</td><td>1</td></tr>
<tr><td>Jane</td><td>179000</td><td>3</td></tr>
<tr><td>Jake</td><td>10000</td><td>3</td></tr>
</table>
<li>But New York is not less than Chicago (quite the contrary!)
<li>Nor is Chicago the average of New York and LA (that would be weird!)
<li>Better to replace City with the following "one hot encoding"
<table>
<tr><td>Name</td><td>Income</td><td>New York</td><td>Chicago</td><td>Los Angeles</td></tr>
<tr><td>John</td><td>120000</td><td>1</td><td>0</td><td>0</td></tr>
<tr><td>Jill</td><td>20000</td><td>0</td><td>1</td><td>0</td></tr>
<tr><td>Jae</td><td>75000</td><td>1</td><td>0</td><td>0</td></tr>
<tr><td>Jane</td><td>179000</td><td>0</td><td>0</td><td>1</td></tr>
<tr><td>Jake</td><td>10000</td><td>0</td><td>0</td><td>1</td></tr>
</table>

<h2>One hot encoder</h2>
<li>general function to do one hot encoding

In [None]:
# City column of the example table above, in row order (John, Jill, Jae, Jane, Jake).
array = np.array(['New York','Chicago','New York','Los Angeles','Los Angeles'])

<h4>convert data into numerical labels</h4>

In [None]:

# Re-fit the existing encoder on the city names and get one integer code per entry.
coded_array = encoder.fit_transform(array)
coded_array

<h4>create a one hot coded array</h4>

In [None]:
# Row i of the identity matrix is the one-hot vector for label code i, so
# indexing the identity matrix with the codes builds the whole encoding at once.
n = coded_array.shape[0]
n_labels = np.unique(coded_array).size
one_hot = np.eye(n_labels)[coded_array]
one_hot

<h3>Function</h3>

In [None]:
def one_hot_encoder(array):
    """One-hot encode a 1-D collection of labels.

    Each distinct label gets its own column; columns follow the sorted order
    of the distinct labels (the same numbering LabelEncoder assigns). The
    function is self-contained: it neither reads nor re-fits any encoder
    object defined elsewhere in the notebook.

    Parameters
    ----------
    array : array-like
        Labels to encode (strings or numbers).

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_samples, n_distinct_labels) with exactly one
        1 per row and 0 everywhere else.
    """
    # return_inverse yields, for every entry, the index of its label within
    # the sorted distinct labels -- i.e. the integer code of that entry.
    labels, codes = np.unique(np.asarray(array).ravel(), return_inverse=True)
    n = len(codes)
    one_hot = np.zeros((n, len(labels)))
    # Set column `codes[i]` of row i to 1 for all rows at once.
    one_hot[np.arange(n), codes] = 1
    return one_hot
# Try the function on the city names from the example.
one_hot_encoder(array)

<h3>One hot encode the y (rocks/mines) column</h3>

In [None]:
# One-hot encode the class labels: Y gets one column per distinct label in y.
Y=one_hot_encoder(y)
Y

<h3>Create a training and a testing data set</h3>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)


In [None]:
ytrain

<h1>Building the neural net</h1>
<li>We need to decide:
<ol>
<li>Number of hidden layers
<li>Number of nodes in each hidden layer
<li>Number of nodes in the input layer
<li>Number of nodes in the output layer
<li>Number of training passes (epochs)
<li>Activation function to use
<li>The "learning rate"



<h3>Learning rate</h3>
A hyperparameter that controls how much the weights are adjusted at each update step
<li>Too low, the model will take a long time to converge (expensive GPU cost)
<li>Too high, the model may never converge
<li>Bit of guesswork goes into this (e.g., start low, slowly increase the rate, see how the loss changes (loss = prediction error), and adjust the rate accordingly)

<h3>hidden layers</h3>
<li>We'll start with one hidden layer
<li>With 60 nodes 

In [None]:
# Tuple with one entry per hidden layer: a single hidden layer of 60 nodes.
hidden_layers = (60,)

<h3>Input layer</h3>
<li>60 input nodes, corresponding to each sonar frequency

In [None]:
X


<h3>Output layer</h3>
<li>2 classes, one hot encoded

In [None]:
Y

<h3>learning rate</h3>
<li>start low

In [None]:
# Start with a small learning rate.
learning_rate = 0.001

<h3>Passes/epochs</h3>

In [None]:
# Number of training passes.
epochs = 500

<h3>sklearn has a Multi-layer Perceptron Classifier</h3>

In [None]:
from sklearn.neural_network import MLPClassifier

# Build the network from the settings chosen in the cells above instead of
# repeating their values as literals, so changing a setting changes the model.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,                        # L2 regularization strength
    hidden_layer_sizes=hidden_layers,  # tuple with one entry per hidden layer
    max_iter=epochs,
    learning_rate_init=learning_rate,  # only used by the 'sgd'/'adam' solvers; no effect with 'lbfgs'
    random_state=1,                    # fixed seed for repeatable weight initialization
)

In [None]:
# Train the classifier on the training features and their one-hot labels.
clf.fit(xtrain, ytrain)

<h3>If you must know!</h3>
<li><b>solver</b>: sgd (stochastic gradient descent), lbfgs (limited memory Broyden–Fletcher–Goldfarb–Shanno algorithm), adam (stochastic gradient based optimizer)
<li><b>activation</b>: logistic (sigmoid), tanh (hyperbolic tan function), relu (rectified linear unit function). relu returns max(0,x) and works better on two class dependent variables (we don't want both returned)
<li><b>alpha</b>: L2 regularization term. Regularization is used to prevent overfitting by not using the exact loss (difference between predicted and actual) when adjusting the weights (in a neural network model). L2 adds the sum of the square of the weights modified by a lambda parameter to each delta
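<li><b>batch size</b>: Number of cases to use in each weight update (a minibatch); one epoch is a full pass over all the batches. Not used by the lbfgs solver.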
<li><b>momentum</b>: A number between 0 and 1 that accelerates a gradient descent (e.g., sgd) algorithm if it is moving in the right (consistent) direction
<li><b>shuffle</b>: shuffle the samples in each iteration (the order in which they are presented will change)
<li><b>tol</b>: if the improvement is less than this, the algorithm stops


In [None]:
# Predicted labels for the held-out test features.
predictions = clf.predict(xtest)

In [None]:
# True one-hot labels of the test split, to compare against the predictions.
actuals = ytest

In [None]:
# Confusion-matrix counts, computed on whole arrays rather than row by row.
# The "positive" class is a sample whose first one-hot column is 0; only that
# column is inspected, for both the true labels and the predictions.
actual_pos = np.asarray(actuals)[:, 0] == 0
predicted_pos = np.asarray(predictions)[:, 0] == 0

tp = int(np.sum(actual_pos & predicted_pos))
fn = int(np.sum(actual_pos & ~predicted_pos))
tn = int(np.sum(~actual_pos & ~predicted_pos))
fp = int(np.sum(~actual_pos & predicted_pos))
print(tp,tn,fp,fn)

total = tp + tn + fp + fn
# Accuracy as a percentage; guard the division so zero samples give nan instead of raising.
accuracy = (tp + tn) * 100 / total if total else float("nan")
print("Accuracy: %1.2f"%accuracy)
    