# Assignment 3 Neural Networks
#### Nick Russo

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm, datasets

%matplotlib inline

# Datasets
* Breast Cancer
* Blood Bank
* Iris

## Breast Cancer Data Parsing
* Contains characteristics of patients and tumor information
* Classification: Binary

In [12]:
def cancer_setup_data():
    """Load the breast-cancer CSV as a two-feature frame with a 0/1 label.

    Reads ``breastCancer.csv`` from the working directory, keeps only the
    ``concavity_mean``, ``concave_points_mean`` and ``diagnosis`` columns,
    drops rows that have a missing value in any of those three columns, and
    recodes ``diagnosis`` so that ``"M"`` becomes 1 and anything else 0.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    def encode(is_malignant):
        # 1 marks an "M" diagnosis, 0 marks every other value.
        if is_malignant:
            return 1
        return 0

    selected = ['concavity_mean', 'concave_points_mean', 'diagnosis']
    # Column selection happens before dropna, so gaps in unselected columns
    # do not discard a row.
    table = pd.read_csv('breastCancer.csv')[selected].dropna()
    malignant_mask = table.diagnosis == "M"
    table.diagnosis = malignant_mask.apply(encode)
    return table
# Build the breast-cancer frame once; the evaluation cell further down passes it to test().
cancer_frame = cancer_setup_data()



## Transfusion Blood Bank Data
* Independent variables: information about prospective blood donors.
* Dependent variable: Whether the person donated blood.
* Classification: Binary

In [13]:
def blood_bank_setup_data():
    """Load the transfusion CSV as a two-feature frame with a 0/1 label.

    Reads ``transfusion.csv`` from the working directory, drops every row
    that has a missing value in any column, keeps the ``Recency``,
    ``Frequency`` and ``Result`` columns, and recodes ``Result`` so that a
    value equal to 1 stays 1 and anything else becomes 0.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    def encode(value):
        # Only an exact 1 counts as the positive class.
        if value == 1:
            return 1
        return 0

    wanted = ["Recency", "Frequency", "Result"]
    # dropna runs on the full table first, so a gap in any column (even one
    # that is discarded afterwards) removes the row.
    donors = pd.read_csv('transfusion.csv').dropna()[wanted]
    donors.Result = donors.Result.apply(encode)
    return donors
# Build the blood-bank frame once; the evaluation cell further down passes it to test().
bb_frame = blood_bank_setup_data()

## Iris
* Independent variables: Sepal length, Sepal width
* Dependent variables: species
* Classification: binary, only using two species

In [14]:
# Load scikit-learn's bundled iris data and build a frame holding the first
# two feature columns plus the numeric class label in a 'target' column.
iris = datasets.load_iris()
iris_frame = pd.DataFrame(
    data=np.c_[iris.data[:, :2], iris['target']],
    columns=iris.feature_names[:2] + ['target'],
)
# Keep only classes 0 and 1 so the task is binary.  The explicit copy makes
# the filtered frame independent of the full one, so the label assignment
# below is not a write through a slice (pandas SettingWithCopyWarning).
iris_frame = iris_frame[iris_frame.target <= 1].copy()
# Recode the label as integer 0/1.
iris_frame.target = iris_frame.target.apply(lambda val: 1 if val == 1 else 0)

# Neural Network
* Neural Network and Neuron objects that keep the per-layer and per-neuron state.

In [19]:
class Neuron(object):
    """One unit of a layer: its incoming weights plus the state of the last pass."""

    def __init__(self, weights):
        # Activation produced by the most recent forward pass.
        self.output = None
        # Error term produced by the most recent backward pass.
        self.delta = None
        # Incoming weights; the last entry is the bias weight.
        self.weights = weights


class NeuralNetwork(object):
    """Fully connected network with one sigmoid hidden layer and a sigmoid output layer.

    Rows given to ``train``/``predict`` hold ``num_inputs`` feature values,
    optionally followed by further values (``train`` expects the integer
    class label in the last position).  Only the first ``num_inputs`` values
    are ever fed into the network, so the label cannot influence a
    prediction.
    """

    def __init__(self, num_inputs, num_hidden, num_outputs):
        """Create randomly initialised hidden and output layers.

        Args:
            num_inputs: Number of feature values per row.
            num_hidden: Number of neurons in the hidden layer.
            num_outputs: Number of output neurons (one per class).
        """
        self.outputs = None
        self.hidden_index = 0
        self.output_index = 1
        self.sum_error = 0
        self.num_inputs = num_inputs
        self.num_hidden = num_hidden
        self.num_outputs = num_outputs
        self.hidden_layer_section = self.create_randomized_layer_section(num_inputs, num_hidden)
        # Output neurons read the hidden activations, so their fan-in is
        # num_hidden (not num_inputs).
        self.output_layer_section = self.create_randomized_layer_section(num_hidden, num_outputs)
        self.network = [self.hidden_layer_section, self.output_layer_section]

    def create_randomized_layer_section(self, num_inputs, num_layers):
        """Return ``num_layers`` neurons, each with ``num_inputs + 1`` random weights.

        The extra weight is the bias.  Weights are drawn from
        ``np.random.rand``, i.e. uniformly from [0, 1).
        """
        return [Neuron(np.random.rand(num_inputs + 1)) for _ in range(num_layers)]

    def _features(self, row):
        """Return the first ``num_inputs`` values of ``row`` as a float array.

        Raises:
            ValueError: If ``row`` holds fewer than ``num_inputs`` values.
        """
        features = np.asarray(row, dtype=float)[:self.num_inputs]
        if features.shape[0] != self.num_inputs:
            raise ValueError(
                "row has %d values, expected at least %d"
                % (features.shape[0], self.num_inputs)
            )
        return features

    def calc_a_j(self, neuron, x_i):
        """Return the weighted input sum (pre-activation) of ``neuron``.

        ``x_i`` normally has one entry per non-bias weight and a constant 1
        is appended as the bias input.  A vector that already has as many
        entries as the neuron has weights is used unchanged.
        """
        x_i = np.asarray(x_i, dtype=float)
        if x_i.shape[0] == len(neuron.weights) - 1:
            x_i = np.append(x_i, 1.0)
        return np.dot(neuron.weights, x_i)

    def calc_z_j(self, a_j):
        """Return the logistic sigmoid of ``a_j``."""
        return 1.0 / (1.0 + np.exp(-a_j))

    def calc_y_k(self, s):
        """Return the sigmoid derivative expressed through the sigmoid output ``s``."""
        return s * (1.0 - s)

    def calc_output_layer_delta(self, expected, output):
        """Return the error term of an output neuron for a squared-error loss."""
        return (expected - output) * self.calc_y_k(output)

    def calc_hidden_layer_delta(self, cur_neuron, cur_neuron_index, next_layer_neuron):
        """Return the share of a hidden neuron's error term caused by one downstream neuron.

        ``backward`` sums this value over every neuron of the next layer.
        """
        return (next_layer_neuron.weights[cur_neuron_index] * next_layer_neuron.delta) * self.calc_y_k(cur_neuron.output)

    def activate_and_transfer(self, neuron, x_i):
        """Compute, store and return the activation of ``neuron`` for input ``x_i``."""
        a_j = self.calc_a_j(neuron, x_i)
        neuron.output = self.calc_z_j(a_j)
        return neuron.output

    def forward(self, x_i, expected=None):
        """Propagate one row through the network and store the result in ``self.outputs``.

        Args:
            x_i: Row whose first ``num_inputs`` values are the features.
            expected: Optional target vector; when given, the squared error
                of this row is added to ``self.sum_error``.
        """
        signal = self._features(x_i)
        for layer in self.network:
            # Each layer consumes the activations of the layer before it.
            signal = np.array([self.activate_and_transfer(neuron, signal) for neuron in layer])
        self.outputs = list(signal)
        if expected is not None:
            self.sum_error += np.sum((np.asarray(expected) - signal) ** 2)

    def backward(self, expected):
        """Compute ``delta`` for every neuron, from the output layer back to the hidden layer.

        Must be called after ``forward`` because it reads each neuron's
        stored ``output``.
        """
        for layer_index in reversed(range(len(self.network))):
            layer = self.network[layer_index]
            if layer_index == self.output_index:
                for neuron_index, neuron in enumerate(layer):
                    neuron.delta = self.calc_output_layer_delta(expected[neuron_index], neuron.output)
            else:
                downstream = self.network[layer_index + 1]
                for cur_neuron_index, cur_neuron in enumerate(layer):
                    # Accumulate the error arriving from all downstream
                    # neurons rather than keeping only the last one.
                    cur_neuron.delta = sum(
                        self.calc_hidden_layer_delta(cur_neuron, cur_neuron_index, next_layer_neuron)
                        for next_layer_neuron in downstream
                    )

    def update(self, row, lr):
        """Apply one gradient step to every weight using the stored deltas.

        Args:
            row: The row that was just passed through ``forward``/``backward``.
            lr: Learning rate.
        """
        for layer_index, layer in enumerate(self.network):
            if layer_index == self.hidden_index:
                # The hidden layer is driven by the raw features.
                vals = self._features(row)
            else:
                # Later layers are driven by the previous layer's activations.
                vals = np.array([neuron.output for neuron in self.network[layer_index - 1]])
            vals = np.append(vals, 1.0)  # constant input for the bias weight
            for neuron in layer:
                neuron.weights = neuron.weights + lr * neuron.delta * vals

    def get_expected(self, row):
        """Return the one-hot target vector for the class label stored in ``row[-1]``."""
        expected = np.zeros(self.num_outputs)
        # Rows taken from a float array carry the label as a float; numpy
        # only accepts integer indices.
        expected[int(row[-1])] = 1
        return expected

    def train(self, train, l_rate, n_epoch, n_outputs=None):
        """Train with per-row (online) gradient steps.

        Args:
            train: Iterable of rows (features followed by the class label).
            l_rate: Learning rate.
            n_epoch: Number of passes over ``train``.
            n_outputs: Unused; accepted for compatibility with existing
                callers.  The network uses ``self.num_outputs``.

        After the call ``self.sum_error`` holds the summed squared error of
        the last epoch.
        """
        for _ in range(n_epoch):
            self.sum_error = 0
            for row in train:
                expected = self.get_expected(row)
                self.forward(row, expected)
                self.backward(expected)
                self.update(row, l_rate)

    def predict(self, row):
        """Return the index of the output neuron with the largest activation for ``row``."""
        self.forward(row)
        return int(np.argmax(self.outputs))

# Results
* Running the NN on each data frame 10 times and printing the run with the best accuracy

In [23]:
def test(dataframe, lr, iterations, runs=10, train_fraction=0.8, num_hidden=3):
    """Train and evaluate a network several times and print the best run.

    Each run draws a fresh random train/test split of ``dataframe``, trains
    a new ``NeuralNetwork`` on the training rows and scores it on the test
    rows.  The run with the highest accuracy is printed (the earliest one
    wins ties).

    Args:
        dataframe: Frame whose last column is the integer class label (0 or
            1) and whose other columns are the features.
        lr: Learning rate passed to ``NeuralNetwork.train``.
        iterations: Number of training epochs per run.
        runs: Number of independent split/train/evaluate runs.
        train_fraction: Probability with which each row is put in the
            training split.
        num_hidden: Number of hidden neurons.

    Raises:
        ValueError: If no run produced both a non-empty training split and
            a non-empty test split.
    """
    n_outputs = 2
    best_rows = None
    best_correct = None
    best_index = None
    # Start below any reachable accuracy so that a run scoring 0 can still
    # be reported instead of leaving the "best" placeholders unset.
    best_accuracy = -1.0
    for run in range(runs):
        mask = np.random.rand(len(dataframe)) < train_fraction
        train_rows = dataframe[mask].values
        test_rows = dataframe[~mask].values
        if len(train_rows) == 0 or len(test_rows) == 0:
            # A degenerate split can neither be trained on nor scored.
            continue
        n_inputs = len(train_rows[0]) - 1
        network = NeuralNetwork(n_inputs, num_hidden, n_outputs)
        network.train(train_rows, lr, iterations, n_outputs)
        correct = [network.predict(row) == int(row[-1]) for row in test_rows]
        accuracy = sum(correct) / len(test_rows)
        if accuracy > best_accuracy:
            best_rows = test_rows
            best_correct = correct
            best_index = run
            best_accuracy = accuracy
    if best_index is None:
        raise ValueError("no run produced a non-empty train and test split")
    print("\n\nTest "+ str(best_index + 1))
    print("Correct: " + str(sum(best_correct)))
    print("Wrong: " + str(len(best_rows) - sum(best_correct)))
    print("Accuracy: " + str(best_accuracy))


## Iris 
* Species classification

In [24]:
# Evaluate on the iris frame: learning rate 0.001, 200 training epochs per run.
test(iris_frame, .001, 200)





Test 2
Correct: 20
Wrong: 0
Accuracy: 1.0


## Blood Bank
* Classification of whether a person will be a donor

In [25]:
# Evaluate on the blood-bank frame: learning rate 0.001, 10 training epochs per run.
test(bb_frame, .001, 10)



Test 5
Correct: 120
Wrong: 28
Accuracy: 0.8108108108108109


## Breast Cancer
* Malignant vs benign tumor classification

In [28]:
# Evaluate on the breast-cancer frame: learning rate 0.001, 100 training epochs per run.
test(cancer_frame, .001, 100)





Test 4
Correct: 74
Wrong: 45
Accuracy: 0.6218487394957983


## Conclusion
* It looks like the NN was able to get consistently high scores (many 100%) for iris species classification. For predicting whether a person would be a blood donor, the accuracies were around 75-80%, while for the breast cancer classification the NN was only 50-60% accurate. I believe the breast cancer classification wasn't as accurate because I was only using two variables. With further investigation, I would run PCA on it to find the most important variables.