In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

#load packages and print each version so the environment of a run can be reconstructed
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advanced mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time


#ignore warnings
#NOTE(review): the 'ignore' filter applies to every warning category raised through the
#warnings module for the rest of the session, so deprecation and numerical warnings
#(e.g. overflow in np.exp) are hidden as well
import warnings
warnings.filterwarnings('ignore')
print('-'*25) #visual separator between the version report and the file listing



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# List the input files with the standard library rather than shelling out to
# `ls`, which only works where a Unix-style `ls` executable is on the PATH.
import os
from subprocess import check_output  # no longer used here; kept in case other cells rely on the name

# One file name per line, sorted so the listing is stable between runs.
print("\n".join(sorted(os.listdir("../input"))))

# Any results you write to the current directory are saved as output.

Python version: 3.6.5 |Anaconda custom (64-bit)| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)]
pandas version: 0.23.0
matplotlib version: 2.2.2
NumPy version: 1.14.3
SciPy version: 1.1.0
IPython version: 6.4.0
scikit-learn version: 0.19.1
-------------------------
creditcard.csv
fundamentals.csv
gender_submission.csv
pima-indians-diabetes.data.csv
prices.csv
prices-split-adjusted.csv
test.csv
train.csv



#  Load Data Modelling Libraries

We will use the popular *scikit-learn* library to develop our machine learning algorithms. In *sklearn*, algorithms are called Estimators and are implemented in their own classes. For data visualization, we will use the *matplotlib* and *seaborn* libraries. Below are common classes to load.

In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
# from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [3]:
# Read the train / test tables from CSV.
# pandas.read_csv reference: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
df_train = pd.read_csv("train_new.csv")
df_test = pd.read_csv("test_new.csv")

# Alternative feature sets. Only REVISED_NUMERIC_COLUMNS is used below; the
# trailing numbers are the author's original annotations (meaning not stated in
# this notebook -- presumably a score obtained with that feature set).
NUMERIC_COLUMNS = [
    'Alone', 'Family Size', 'Sex', 'Pclass', 'Fare',
    'FareBand', 'Age', 'TitleCat', 'Embarked',
]  # 72
ORIGINAL_NUMERIC_COLUMNS = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Sex',
    'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Millitary',
    'Embarked',
]  # 83
REVISED_NUMERIC_COLUMNS = [
    'Family_Survival', 'Sex', 'Fare', 'Pclass', 'Age',
    'SibSp', 'Parch', 'IsAlone', 'Title', 'Embarked',
]  # 84

# Columns left out of the revised set:
# 'Sx_Cl_Survival', 'Sx_Em_Survival', "Sx_Si_Survival", "Sx_Pa_Survival"

# Feature table for training; missing values are replaced by the sentinel -1000.
data_to_train = df_train[REVISED_NUMERIC_COLUMNS].fillna(-1000)

# Plain NumPy arrays: X is the feature matrix, Y the labels as a single column.
X = data_to_train.values
Y = df_train['Survived'].values.reshape(-1, 1)





## Dividing data


In [4]:
# Hold out 25% of the rows for testing; the split is stratified on the labels
# and seeded so it is the same on every run.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=21,
    stratify=Y,
)
# Quick look: the first two training rows and the shape of the training labels.
print(X_train[:2, :], Y_train.shape)

[[0.5        0.         0.13913574 1.         0.475      1.
  0.         0.         0.792      1.        ]
 [0.5        1.         0.01522459 0.38492872 0.2625     0.
  0.         1.         0.15667311 0.60869565]] (668, 1)


# Defining Neural Network
Three hidden layers with default sizes 50, 15, 15

In [5]:

def layer_sizes(X, Y):
    """
    Return the widths of every layer of the network.

    Arguments:
    X -- input array; X.shape[1] is taken as the input width
    Y -- labels; not used by this function

    Return:
    (n_x, n_h, n2, n3, n_y) -- input width, the three fixed hidden widths
                               (50, 15, 15) and the fixed output width (1)
    """
    hidden_widths = (50, 15, 15)
    output_width = 1
    return (X.shape[1],) + hidden_widths + (output_width,)



def initialize_parameters(n_x, n_h, n2, n3, n_y):
    """
    Create the initial weights and biases of the four-layer network.

    Arguments:
    n_x -- width of the input layer
    n_h, n2, n3 -- widths of the three hidden layers
    n_y -- width of the output layer

    Return:
    parameters -- dict with "W1".."W4" (small Gaussian values, shape
                  (inputs, outputs)) and "b1".."b4" (zeros, shape (1, outputs))
    """
    # The global NumPy RNG is re-seeded on every call, so the initial weights
    # are identical from one call to the next for the same layer widths.
    np.random.seed(2)

    # (weight key, bias key, inputs, outputs, scale of the Gaussian draw)
    layer_specs = (
        ("W1", "b1", n_x, n_h, 0.005),
        ("W2", "b2", n_h, n2, 0.005),
        ("W3", "b3", n2, n3, 0.005),
        ("W4", "b4", n3, n_y, 0.0075),
    )

    parameters = {}
    for weight_key, bias_key, n_in, n_out, scale in layer_specs:
        parameters[weight_key] = np.random.randn(n_in, n_out)*scale
        parameters[bias_key] = np.zeros((1, n_out))

    return parameters


# Data Standardization
from sklearn import preprocessing
def forward_propagation(X, parameters):
    """
    Run one forward pass: three tanh layers followed by a sigmoid output layer.

    Arguments:
    X -- input data; it is multiplied on the right by W1, so its second
         dimension must match the first dimension of W1
    parameters -- dict holding the weights "W1".."W4" and biases "b1".."b4"

    Return:
    A4 -- sigmoid activation of the last layer
    cache -- dict with the pre-activations "Z1".."Z4" and activations "A1".."A4"
    """
    (W1, b1, W2, b2, W3, b3, W4, b4) = [
        parameters[key]
        for key in ("W1", "b1", "W2", "b2", "W3", "b3", "W4", "b4")
    ]

    cache = {}

    # hidden layer 1
    cache["Z1"] = np.dot(X, W1) + b1
    cache["A1"] = np.tanh(cache["Z1"])

    # hidden layer 2
    cache["Z2"] = np.dot(cache["A1"], W2) + b2
    cache["A2"] = np.tanh(cache["Z2"])

    # hidden layer 3
    cache["Z3"] = np.dot(cache["A2"], W3) + b3
    cache["A3"] = np.tanh(cache["Z3"])

    # output layer
    cache["Z4"] = np.dot(cache["A3"], W4) + b4
    cache["A4"] = sigmoid(cache["Z4"])

    return cache["A4"], cache

def sigmoid(x):
    """
    Logistic function 1 / (1 + e^(-x)), applied element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    The sigmoid of x, with the same shape as x.
    """
    return 1/(np.exp(-x) + 1)
def compute_cost(A4, Y, parameters):
    """
    Mean squared error between the network output and the labels.

    Arguments:
    A4 -- network output, as returned by the forward pass
    Y -- true labels; may have the same shape as A4 or any other shape with the
         same number of elements (e.g. a flat vector for a column output)
    parameters -- not used; kept so existing callers keep working

    Return:
    cost -- sum((A4 - Y)**2) / m, where m is the number of label entries
    """
    A4 = np.asarray(A4)
    Y = np.asarray(Y)

    # A flat label vector (m,) subtracted from a column output (m, 1) would be
    # broadcast to an (m, m) matrix and silently produce a wrong cost, so labels
    # with the same number of elements are aligned to the output's shape first.
    if Y.shape != A4.shape and Y.size == A4.size:
        Y = Y.reshape(A4.shape)

    m = Y.size # number of label entries
    cost = np.sum((A4-Y)**2)/m

    return cost

def StestpestDescent(parameters, cache, X, Y): 
    """
    Back-propagate through the four-layer network and return the gradients
    used for one steepest-descent step.

    Arguments:
    parameters -- dict of weights; only "W2", "W3" and "W4" are read here
    cache -- dict with the activations "A1".."A4" from the forward pass
    X -- input data of the forward pass; X.shape[0] is the number of examples
    Y -- true labels; same shape as A4, or any shape with the same number of
         elements (e.g. a flat vector for a column output)

    Return:
    grads -- dict with "dW1".."dW4" and "db1".."db4", each averaged over the
             examples

    Note: the output delta is A4 - Y with no sigmoid-derivative factor. That is
    the gradient of a cross-entropy loss for a sigmoid output, not of the
    squared-error value reported by compute_cost.
    """
    m = X.shape[0]
    W2 = parameters["W2"]
    W3 = parameters["W3"]
    W4 = parameters["W4"]
    A1 = cache["A1"]
    A2 = cache["A2"]
    A3 = cache["A3"]
    A4 = cache["A4"]

    # Align the labels with the output so a flat (m,) vector is not broadcast
    # against an (m, 1) output into an (m, m) matrix.
    Y = np.asarray(Y)
    if Y.shape != A4.shape and Y.size == A4.size:
        Y = Y.reshape(A4.shape)

    # Layer deltas; 1 - A**2 is the derivative of tanh expressed via its output.
    d4 = (A4-Y)
    d3 = np.dot(d4,W4.T)*(1- np.power(A3,2))
    d2 = np.dot(d3,W3.T)*(1- np.power(A2,2))
    d1 = np.dot(d2,W2.T)*(1- np.power(A1,2))

    # Gradients: (layer input)^T . delta for the weights, column sums of the
    # delta for the biases, both divided by the number of examples.
    dW4 = np.dot(A3.T,d4)/m
    db4 = np.sum(d4,axis=0,keepdims=True)/m
    dW3 = np.dot(A2.T,d3)/m
    db3 = np.sum(d3,axis=0,keepdims=True)/m
    dW2 = np.dot(A1.T,d2)/m
    db2 = np.sum(d2,axis=0,keepdims=True)/m
    dW1 = np.dot(X.T,d1)/m
    db1 = np.sum(d1,axis=0,keepdims=True)/m

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2,
             "dW3": dW3,
             "db3": db3,
             "dW4": dW4,
             "db4": db4}
    
    return grads

def update_parameters(parameters, grads, learning_rate = .75):
    """
    Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dict with "W1".."W4" and "b1".."b4"
    grads -- dict with the matching gradients "dW1".."dW4" and "db1".."db4"
    learning_rate -- step size multiplying each gradient

    Return:
    A new dict with the same eight keys, each holding
    value - learning_rate * gradient. The input dicts are not modified.
    """
    # (parameter key, gradient key), in the order the result is built
    key_pairs = (
        ("W1", "dW1"), ("b1", "db1"),
        ("W2", "dW2"), ("b2", "db2"),
        ("W3", "dW3"), ("b3", "db3"),
        ("W4", "dW4"), ("b4", "db4"),
    )

    updated = {}
    for param_key, grad_key in key_pairs:
        updated[param_key] = parameters[param_key] - learning_rate*grads[grad_key]

    return updated

In [6]:
def nn_model(X, Y, n_h, n2, n3, num_iterations = 10000, print_cost=False,
             learning_rate=.2, lr_decay=10**(-7), cost_threshold=.11):
    """
    Train the four-layer network with gradient descent and return its parameters.

    Arguments:
    X -- training inputs
    Y -- training labels
    n_h, n2, n3 -- widths of the three hidden layers
    num_iterations -- maximum number of gradient-descent iterations
    print_cost -- if True, also print the cost every 5000 iterations
    learning_rate -- step size at iteration 0
    lr_decay -- amount subtracted from the step size per iteration
    cost_threshold -- training stops as soon as the cost drops below this value

    Return:
    parameters -- dict with the trained "W1".."W4" and "b1".."b4"

    The defaults of the last three arguments reproduce the values that were
    previously hard-coded, so existing calls behave as before. The final cost
    is printed whenever at least one iteration ran, regardless of print_cost.
    """
    np.random.seed(3)
    sizes = layer_sizes(X, Y)
    n_x, n_y = sizes[0], sizes[-1]

    parameters = initialize_parameters(n_x, n_h, n2, n3, n_y)

    cost = None
    last_iteration = None  # stays None when num_iterations <= 0
    for i in range(0, num_iterations):
        last_iteration = i
        A4, cache = forward_propagation(X, parameters)
        cost = compute_cost(A4, Y, parameters)
        if cost < cost_threshold: break
        grads = StestpestDescent(parameters, cache, X, Y)
        # Linearly decaying step size, clamped at zero so that a very long run
        # cannot flip the sign of the update and start increasing the cost.
        step = max(learning_rate - lr_decay*i, 0.0)
        parameters = update_parameters(parameters, grads,learning_rate = step)
        if print_cost and i % 5000 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    # With zero iterations there is no cost to report (the unconditional print
    # used to fail on undefined names in that case).
    if last_iteration is not None:
        print ("Cost after iteration %i: %f" %(last_iteration, cost))
    
    return parameters

In [7]:
# Hidden-layer widths for this run, then train for up to 100000 iterations
# with periodic cost reporting.
(n_h, n2, n3) = (50, 20, 20)
parameters = nn_model(
    X_train,
    Y_train,
    n_h,
    n2,
    n3,
    num_iterations=100000,
    print_cost=True,
)

Cost after iteration 0: 0.250000
Cost after iteration 5000: 0.236366
Cost after iteration 10000: 0.236365
Cost after iteration 15000: 0.236364
Cost after iteration 19928: 0.109999


# Prediction

In [8]:
def predict(parameters, X):
    """
    Classify each row of X with the trained network.

    Arguments:
    parameters -- dict of trained weights and biases
    X -- input data, passed unchanged to forward_propagation

    Return:
    Boolean numpy array, True where the network output is at least 0.5.
    """
    output, _ = forward_propagation(X, parameters)
    return np.array(output >= 0.5)
# Accuracy = percentage of examples whose thresholded prediction equals the label.
# The mean of the element-wise match replaces the earlier float(np.dot(...)) form,
# which converted a one-element array to a Python float (deprecated in recent
# NumPy releases). Labels are expected to be 0/1, as in the 'Survived' column.
train_predictions = predict(parameters, X_train)
test_predictions = predict(parameters, X_test)
train_accuracy = float(np.mean(train_predictions[:,0] == Y_train[:,0])*100)
test_accuracy = float(np.mean(test_predictions[:,0] == Y_test[:,0])*100)
print ("Accuracy for {}, {}, {} hidden units: Train set  {:.2f}%, Test set {:.2f}%".format(n_h, n2, n3, train_accuracy, test_accuracy))

Accuracy for 50, 20, 20 hidden units: Train set  85.48%, Test set 85.20%


# Predictions with other neural network settings

In [9]:
# Train and evaluate the network for several hidden-layer configurations.
# nn_model builds its own initial parameters, so the separate
# initialize_parameters call (whose result was overwritten immediately) is gone.
hidden_layer_sizes =[ [50,30,20], [60,30,30]]
for (n_h, n2, n3) in hidden_layer_sizes:
    print ("Hidden units:", n_h, n2, n3)
    parameters = nn_model(X_train, Y_train, n_h, n2, n3, num_iterations = 50000)
    train_predictions = predict(parameters, X_train)
    test_predictions = predict(parameters, X_test)
    # Percentage of predictions that match the 0/1 labels; the mean of the
    # element-wise match avoids converting a one-element array with float().
    train_accuracy = float(np.mean(train_predictions[:,0] == Y_train[:,0])*100)
    test_accuracy = float(np.mean(test_predictions[:,0] == Y_test[:,0])*100)
    print ("  Accuracy for {}, {}, {} hidden units: Train set {:.2f}%, Test set {:.2f}%".format(n_h, n2, n3, train_accuracy, test_accuracy))

Hidden units: 50 30 20
Cost after iteration 16686: 0.109999
  Accuracy for 50, 30, 20 hidden units: Train set 85.48%, Test set 85.20%
Hidden units: 60 30 30
Cost after iteration 15037: 0.109998
  Accuracy for 60, 30, 30 hidden units: Train set 85.48%, Test set 85.20%
