<a href="https://colab.research.google.com/github/dougfletcher10/W207/blob/master/Week_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Neural Nets -- from scratch
Neural net with back propagation for XOR using one hidden layer
Taken from http://www.bogotobogo.com/python/python_Neural_Networks_Backpropagation_for_XOR_using_one_hidden_layer.php

In [None]:
import numpy as np

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def sigmoid_prime(x):
    """Derivative of the logistic function at the pre-activation value ``x``.

    Computes ``sigmoid(x) * (1 - sigmoid(x))``; ``x`` is the input to the
    sigmoid, not its output.
    """
    s = sigmoid(x)
    return s * (1.0 - s)

def tanh(x):
    """Hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    result = np.tanh(x)
    return result

def tanh_prime(x):
    """Return ``1 - x**2``.

    Because d/dz tanh(z) = 1 - tanh(z)**2, this is the tanh derivative only
    when ``x`` is already a tanh *output* (an activation), not the raw input.
    """
    squared = x**2
    return 1.0 - squared


class NeuralNetwork:
    """Fully connected feed-forward network trained by per-sample backpropagation.

    ``layers`` lists the unit count of every layer, input first and output
    last, e.g. ``[2, 2, 1]``. A constant 1 is prepended to each input sample
    as a bias term, and every hidden layer is given one extra unit, so for
    ``[2, 2, 1]`` the weight matrices are 3x3 and 3x1.

    ``activation_prime`` is always called on layer *outputs* (activations),
    so it must express the derivative in terms of the activation value.
    """

    def __init__(self, layers, activation='tanh'):
        """Choose the activation and draw initial weights uniformly from [-1, 1).

        Args:
            layers: sequence of layer sizes; needs at least an input and an
                output layer.
            activation: ``'tanh'`` (default) or ``'sigmoid'``.

        Raises:
            ValueError: on an unknown activation name or fewer than two layers.
        """
        if activation == 'sigmoid':
            self.activation = sigmoid
            # Backprop passes activations, so use a*(1-a) rather than the
            # pre-activation form sigmoid(x)*(1-sigmoid(x)).
            self.activation_prime = self._sigmoid_prime_from_output
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_prime = tanh_prime
        else:
            raise ValueError(
                "activation must be 'sigmoid' or 'tanh', got %r" % (activation,))

        if len(layers) < 2:
            raise ValueError("layers must contain at least an input and an output size")

        # Set weights
        self.weights = []
        # layers = [2,2,1]
        # range of weight values (-1,1)
        # input and hidden layers - random((2+1, 2+1)) : 3 x 3
        for i in range(1, len(layers) - 1):
            r = 2*np.random.random((layers[i-1] + 1, layers[i] + 1)) - 1
            self.weights.append(r)
        # output layer - random((2+1, 1)) : 3 x 1
        # Indexed from the end so it does not rely on the loop variable, which
        # is undefined when there is no hidden layer.
        r = 2*np.random.random((layers[-2] + 1, layers[-1])) - 1
        self.weights.append(r)

    @staticmethod
    def _sigmoid_prime_from_output(a):
        """Sigmoid derivative written in terms of the sigmoid output ``a``."""
        return a * (1.0 - a)

    def fit(self, X, y, learning_rate=0.2, epochs=100000):
        """Train in place with stochastic gradient steps, one random sample each.

        Args:
            X: 2-D array-like, one sample per row.
            y: array-like of targets, indexed in step with the rows of ``X``.
            learning_rate: multiplier applied to every weight update.
            epochs: number of single-sample updates to perform.

        Prints ``epochs: k`` every 10000 iterations.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Add column of ones to X
        # This is to add the bias unit to the input layer
        ones = np.ones((X.shape[0], 1))
        X = np.concatenate((ones, X), axis=1)

        for k in range(epochs):
            if k % 10000 == 0: print ('epochs:', k)

            sample = np.random.randint(X.shape[0])

            # Forward pass: a[0] is the input, a[-1] the network output.
            a = [X[sample]]
            for w in self.weights:
                a.append(self.activation(np.dot(a[-1], w)))

            # output layer
            error = y[sample] - a[-1]
            deltas = [error * self.activation_prime(a[-1])]

            # we need to begin at the second to last layer
            # (a layer before the output layer)
            for l in range(len(a) - 2, 0, -1):
                deltas.append(deltas[-1].dot(self.weights[l].T)*self.activation_prime(a[l]))

            # reverse
            # [level3(output)->level2(hidden)]  => [level2(hidden)->level3(output)]
            deltas.reverse()

            # backpropagation
            # 1. Multiply its output delta and input activation
            #    to get the gradient of the weight.
            # 2. Add a ratio (learning_rate) of it to the weight; the sign is
            #    already folded into error = target - output.
            for idx in range(len(self.weights)):
                layer = np.atleast_2d(a[idx])
                delta = np.atleast_2d(deltas[idx])
                self.weights[idx] += learning_rate * layer.T.dot(delta)

    def predict(self, x):
        """Run a forward pass and return the output-layer activations.

        Args:
            x: one sample (1-D, length = input size) or a 2-D batch with one
                sample per row.

        Returns:
            Array of shape ``(n_outputs,)`` for a single sample, or
            ``(n_samples, n_outputs)`` for a batch.
        """
        x = np.asarray(x)
        # Prepend the bias term along the feature axis; works for 1-D and 2-D input.
        bias = np.ones(x.shape[:-1] + (1,))
        a = np.concatenate((bias, x), axis=-1)
        for w in self.weights:
            a = self.activation(np.dot(a, w))
        return a



# Network with 2 inputs, one hidden layer of size 2, and 1 output (default tanh activation).
# NOTE(review): no random seed is set before this cell, so the initial weights and the
# sampled training rows differ from run to run.
nn = NeuralNetwork([2,2,1])

# The four possible XOR input pairs, one per row.
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])

# XOR targets, aligned with the rows of X.
y = np.array([0, 1, 1, 0])

# 100000 single-sample updates; progress is printed every 10000 iterations.
nn.fit(X, y, epochs=100000)

epochs: 0
epochs: 10000
epochs: 20000
epochs: 30000
epochs: 40000
epochs: 50000
epochs: 60000
epochs: 70000
epochs: 80000
epochs: 90000


In [None]:
X, y

(array([[0, 0],
        [0, 1],
        [1, 0],
        [1, 1]]), array([0, 1, 1, 0]))

In [None]:
# Print each XOR input row next to the trained network's output for it.
for e in X:
    print(e,nn.predict(e))

[0 0] [0.00010904]
[0 1] [0.99582452]
[1 0] [0.99713619]
[1 1] [0.00352613]


## Count Vectorizer example

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


In [None]:
# Read a small CSV of sentences labelled as sports / not sports.
# The file is fetched over the network from the gist URL below.
sports_url = "https://gist.githubusercontent.com/peterg889/198aa5adc4d2022a5042df351c531276/raw/97d9b442c6bab81521a3e51fa645235113ddaff6/sports.csv"
df = pd.read_csv(sports_url)
df

Unnamed: 0,Text,Category
0,A great game,Sports
1,The election was over,Not sports
2,Very clean match,Sports
3,A clean but forgettable game,Sports
4,It was a close election,Not sports


In [None]:
# Bag-of-words encoder with default settings.
vectorizer = CountVectorizer()
# Lower-cased training sentences as a plain Python list.
# NOTE(review): the get_params() output further down shows lowercase=True, so the explicit
# .str.lower() looks redundant (but harmless) — confirm before removing.
train_data = df.Text.str.lower().tolist()
# Learn the vocabulary and encode every sentence as a sparse row of token counts.
vtrain = vectorizer.fit_transform(train_data)


In [None]:
train_data

['a great game',
 'the election was over',
 'very clean match',
 'a clean but forgettable game',
 'it was a close election']

In [None]:
vtrain

<5x13 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [None]:
print ("vocab size", vtrain.shape)

vocab size (5, 13)


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on releases that lack the new one.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
feature_names

['but',
 'clean',
 'close',
 'election',
 'forgettable',
 'game',
 'great',
 'it',
 'match',
 'over',
 'the',
 'very',
 'was']

In [None]:
vectorizer.vocabulary_

{'but': 0,
 'clean': 1,
 'close': 2,
 'election': 3,
 'forgettable': 4,
 'game': 5,
 'great': 6,
 'it': 7,
 'match': 8,
 'over': 9,
 'the': 10,
 'very': 11,
 'was': 12}

In [None]:
vtrain.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1]])

In [None]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single count
# vector is wrapped in an outer list; recent scikit-learn releases reject a bare 1-D sequence.
vectorizer.inverse_transform([[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]])

[array(['game', 'great'], dtype='<U11')]

In [None]:
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [None]:
train_data

['a great game',
 'the election was over',
 'very clean match',
 'a clean but forgettable game',
 'it was a close election']

## Count vectorizer - with logistic regression/neural nets

In [None]:
# Numeric target column: 1.0 where Category is "Sports", 0.0 otherwise.
df['labels'] = (df["Category"] == "Sports").astype(float)
df

Unnamed: 0,Text,Category,labels
0,A great game,Sports,1.0
1,The election was over,Not sports,0.0
2,Very clean match,Sports,1.0
3,A clean but forgettable game,Sports,1.0
4,It was a close election,Not sports,0.0


In [None]:
train_labels = df['labels'].tolist()

### Logistic Regression

In [None]:
# Fit logistic regression on the bag-of-words counts.
lr_clf = LogisticRegression(C=1)
lr_clf.fit(vtrain, train_labels)

# Predict on the same rows the model was trained on (not a held-out set).
train_predict = lr_clf.predict(vtrain)
train_predict

array([1., 0., 1., 1., 0.])

In [None]:
lr_clf.predict_proba(vtrain)

array([[0.21456222, 0.78543778],
       [0.72164009, 0.27835991],
       [0.19120814, 0.80879186],
       [0.15093898, 0.84906102],
       [0.72164009, 0.27835991]])

In [None]:
# Encode an unseen sentence using the vocabulary learned from the training sentences.
test_sentence = "A very close baseball game"
vdev = vectorizer.transform([test_sentence.lower()])

# NOTE(review): this expression is not the last one in the cell, so its value is not displayed.
lr_clf.predict(vdev)

# Class probabilities for the sentence; this is the value the cell displays.
lr_clf.predict_proba(vdev)

array([[0.26974001, 0.73025999]])

In [None]:
vdev

<1x13 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

### Multi-layer perceptron

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell above.
from sklearn.neural_network import MLPClassifier
# MLP with hidden_layer_sizes=(5, 2), the L-BFGS solver, and a fixed random_state.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [None]:
clf.fit(vtrain, train_labels)  

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [None]:
clf.predict(vtrain)

array([1., 0., 1., 1., 0.])

In [None]:
clf.predict_proba(vdev)

array([[1.03261581e-04, 9.99896738e-01]])

In [None]:
test_sentence = "A very close baseball game"
clf.predict(vdev)

array([1.])

## Digit Classification with Neural Networks

Interest in neural networks, and in particular those with architectures that support deep learning, has surged in recent years.

In this notebook we will be revisiting the problem of digit classification on the MNIST data. In doing so, we will introduce a new Python library, Theano, for working with neural networks. Theano is a popular choice for neural networks as the same code can be run on either CPUs or GPUs. GPUs greatly speed up training and prediction, and are readily available. Amazon even offers GPU machines on EC2.

In part 1, we'll introduce Theano, and refresh ourselves on the MNIST dataset. In part 2, we'll create a multi-layer neural network with a simple architecture, and train it using backpropagation. Part 3 will introduce the convolutional architecture, which can be said to be doing 'deep learning' (also called feature learning or representation learning).

### Part 1 - basics
Let's start to look at Theano. If later you'd like to go deeper into Theano, you may want to read this paper: http://www.iro.umontreal.ca/~lisa/pointeurs/theano_scipy2010.pdf

Install Theano if you haven't already. Then let's load it, and set it to work with a CPU. For reference, here is the Theano documentation: http://www.deeplearning.net/software/theano/library/

In [None]:
%matplotlib inline

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import time

import theano 
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
print(theano.config.device) # We're using CPUs (for now)
print(theano.config.floatX) # Should be 64 bit for CPUs

np.random.seed(0)

cpu
float64


Load back up the MNIST data

In [None]:
# Repeating steps from Project 1 to prepare mnist dataset.
# as_frame=False forces NumPy arrays: newer scikit-learn releases otherwise return a pandas
# DataFrame/Series here, on which the positional indexing X[shuffle] below does not work.
X, Y = fetch_openml(name='mnist_784', return_X_y=True, cache=False, as_frame=False)

# Scale pixel values to [0, 1] and shuffle examples and labels with one shared permutation.
X = X / 255.0
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

# Use the first numExamples rows for training and the last numExamples rows for testing.
numExamples = 2000
numTotalExamples = X.shape[0]  # read from the data instead of hard-coding 70000
test_data, test_labels = X[numTotalExamples-numExamples:], Y[numTotalExamples-numExamples:]
train_data, train_labels = X[:numExamples], Y[:numExamples]
numFeatures = train_data.shape[1]
numTrainExamples = train_data.shape[0]
numTestExamples = test_data.shape[0]
print('Features = %d' %(numFeatures))
print('Train set = %d' %(numTrainExamples))
print('Test set = %d' %(numTestExamples))

Features = 784
Train set = 2000
Test set = 2000


Looking ahead to working with neural networks, let's prepare one additional variation of the label data. Let's make these labels, rather than each being an integer value from 0-9, be a set of 10 binary values, one for each class. This is sometimes called a 1-of-n encoding, and it makes working with Neural Networks easier, as there will be one output node for each class.

In [None]:
def binarizeY(data, num_classes=10):
    """One-hot (1-of-n) encode a sequence of class labels.

    Args:
        data: array-like of labels convertible to integers (ints or digit
            strings); it is flattened before encoding.
        num_classes: number of columns in the encoding. Defaults to 10.

    Returns:
        Float array of shape ``(number of labels, num_classes)`` with a 1 in
        the column given by each label and 0 elsewhere.

    Raises:
        IndexError: if a label is outside the valid column range.
    """
    labels = np.asarray(data).astype(np.int64).ravel()
    binarized_data = np.zeros((labels.size, num_classes))
    # Set column labels[j] of row j for every j in one vectorized assignment.
    binarized_data[np.arange(labels.size), labels] = 1
    return binarized_data
# One-hot encoded versions of the training and test labels.
train_labels_b = binarizeY(train_labels)
test_labels_b = binarizeY(test_labels)
# Width of the encoding, read off the length of one encoded row.
numClasses = train_labels_b[1].size
print('Classes = %d' %(numClasses))

Classes = 10


Let's start with a KNN model to establish a baseline accuracy.

In [None]:
# 1-nearest-neighbour baseline.
neighbors = 1
knn = KNeighborsClassifier(neighbors)
# we'll be waiting quite a while if we use 60K examples, so let's cut it down.  You may want to run the full 60K on your own later to see what the accuracy is.
# train_data/train_labels already hold the first numExamples rows. Using them instead of
# re-slicing X and Y keeps this cell re-runnable after X and Y are rebound to Theano
# symbolic variables later in the notebook.
mini_train_data, mini_train_labels = train_data, train_labels
start_time = time.time()
knn.fit(mini_train_data, mini_train_labels)
print('Train time = %.2f'% (time.time() - start_time))
start_time = time.time()
accuracy = knn.score(test_data, test_labels)
# Stop the clock before printing so only the scoring call is timed.
prediction_time = time.time() - start_time
print('Accuracy = %.4f' %(accuracy))
print('Prediction time = %.2f' %(prediction_time))

Train time = 0.15
Accuracy = 0.9065
Prediction time = 7.58




Alright, now that we have a simple baseline, let's start working in Theano. Before we jump to multi-layer neural networks though, let's train a logistic regression model to make certain we're using Theano correctly.

Recall there are four key components: (1) parameters, (2) model, (3) cost function, and (4) objective.

In [None]:
## (1) Parameters
# Initialize the weights to small, but non-zero, values.
# w is a Theano shared variable of shape (numFeatures, numClasses), filled with
# standard-normal draws scaled by 0.01.
w = theano.shared(np.asarray((np.random.randn(*(numFeatures, numClasses))*.01)))

Two notes relevant at this point:

First, logistic regression can be thought of as a neural network with no hidden layers. The output values are just the dot product of the inputs and the edge weights.

Second, we have 10 classes. We can either train separate one vs all classifiers using sigmoid activation, which would be a hassle, or we can use the softmax activation, which is essentially a multi-class version of sigmoid. We'll use Theano's built-in implementation of softmax.

In [None]:
## (2) Model
# Theano objects accessed with standard Python variables
# NOTE(review): X and Y held the MNIST arrays in earlier cells; from here on the names
# refer to symbolic matrices, so earlier cells that read X/Y cannot be re-run after this one.
X = T.matrix()
Y = T.matrix()

def model(X, w):
    """Build the symbolic expression softmax(X . w) from the inputs X and weights w."""
    return T.nnet.softmax(T.dot(X, w))
# Symbolic model output for the symbolic input X and the shared weights w.
y_hat = model(X, w)

We'll use cross-entropy as a cost function. Cross entropy only considers the error between the true class and the prediction, and not the errors for the false classes. This tends to cause the network to converge faster. We'll use Theano's built-in cross entropy function.

In [None]:
## (3) Cost function
# Mean over the batch of the categorical cross-entropy between predictions y_hat and targets Y.
cost = T.mean(T.nnet.categorical_crossentropy(y_hat, Y))

The objective is to minimize the cost, and to do that we'll use batch gradient descent.

We'll use Theano's built-in gradient function. 

In [None]:
## (4) Objective (and solver)

# Step size applied to the gradient in each update.
alpha = 0.01
# Symbolic gradient of the cost with respect to the weights.
gradient = T.grad(cost=cost, wrt=w) 
# One update rule: replace w with w - alpha * gradient.
update = [[w, w - gradient * alpha]] 
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True) # computes cost, then runs update
y_pred = T.argmax(y_hat, axis=1) # select largest probability as prediction
# Compiled function mapping an input matrix to the predicted class index per row.
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

def gradientDescent(epochs):
    """Run ``epochs`` full-batch training steps, printing test accuracy along the way.

    Uses the notebook-level ``train`` and ``predict`` functions and the
    ``train_data``, ``train_labels_b``, ``test_data`` and ``test_labels_b``
    arrays. Accuracy is printed after every epoch when ``epochs < 500``,
    otherwise after every 1000th epoch. The total time spent inside
    ``train`` is printed at the end.

    Args:
        epochs: number of calls to ``train`` on the full training set.
    """
    # Loop-invariant: the true class index of every test example.
    true_classes = np.argmax(test_labels_b, axis=1)
    report_every = 1 if epochs < 500 else 1000

    trainTime = 0.0
    for i in range(epochs):
        start_time = time.time()
        train(train_data, train_labels_b)
        trainTime += time.time() - start_time
        if i % report_every == 0:
            accuracy = np.mean(true_classes == predict(test_data))
            print('%d) accuracy = %.4f' %(i+1, accuracy))
    print('train time = %.2f' %(trainTime))

# 50 epochs is below the 500 threshold, so accuracy is printed after every epoch.
gradientDescent(50)

# Time a single prediction pass over the test set.
start_time = time.time()
predict(test_data)   
print('predict time = %.2f' %(time.time() - start_time))



1) accuracy = 0.0970
2) accuracy = 0.1245
3) accuracy = 0.1555
4) accuracy = 0.1830
5) accuracy = 0.2195
6) accuracy = 0.2490
7) accuracy = 0.2830
8) accuracy = 0.3075
9) accuracy = 0.3420
10) accuracy = 0.3645
11) accuracy = 0.3905
12) accuracy = 0.4105
13) accuracy = 0.4265
14) accuracy = 0.4470
15) accuracy = 0.4635
16) accuracy = 0.4765
17) accuracy = 0.4880
18) accuracy = 0.5010
19) accuracy = 0.5130
20) accuracy = 0.5235
21) accuracy = 0.5350
22) accuracy = 0.5430
23) accuracy = 0.5520
24) accuracy = 0.5575
25) accuracy = 0.5645
26) accuracy = 0.5695
27) accuracy = 0.5745
28) accuracy = 0.5855
29) accuracy = 0.5895
30) accuracy = 0.5985
31) accuracy = 0.6045
32) accuracy = 0.6090
33) accuracy = 0.6120
34) accuracy = 0.6155
35) accuracy = 0.6195
36) accuracy = 0.6240
37) accuracy = 0.6260
38) accuracy = 0.6285
39) accuracy = 0.6310
40) accuracy = 0.6325
41) accuracy = 0.6350
42) accuracy = 0.6400
43) accuracy = 0.6420
44) accuracy = 0.6465
45) accuracy = 0.6500
46) accuracy = 0.65

In [None]:
gradientDescent(50000)

1) accuracy = 0.6625
1001) accuracy = 0.8550
2001) accuracy = 0.8680
3001) accuracy = 0.8735
4001) accuracy = 0.8780
5001) accuracy = 0.8810
6001) accuracy = 0.8830
7001) accuracy = 0.8835
8001) accuracy = 0.8845
9001) accuracy = 0.8845
10001) accuracy = 0.8850
11001) accuracy = 0.8855
12001) accuracy = 0.8860
13001) accuracy = 0.8860
14001) accuracy = 0.8855
15001) accuracy = 0.8860
16001) accuracy = 0.8855
17001) accuracy = 0.8855
18001) accuracy = 0.8865
19001) accuracy = 0.8865
20001) accuracy = 0.8860
21001) accuracy = 0.8860
22001) accuracy = 0.8860
23001) accuracy = 0.8860
24001) accuracy = 0.8855
25001) accuracy = 0.8855
26001) accuracy = 0.8845
27001) accuracy = 0.8840
28001) accuracy = 0.8840
29001) accuracy = 0.8835
30001) accuracy = 0.8840
31001) accuracy = 0.8840
32001) accuracy = 0.8840
33001) accuracy = 0.8850
34001) accuracy = 0.8850
35001) accuracy = 0.8845
36001) accuracy = 0.8840
37001) accuracy = 0.8840
38001) accuracy = 0.8835
39001) accuracy = 0.8830
40001) accura