DAY 02 - Feb 26, 2017

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

import os

In [2]:
%matplotlib inline

# Implementing a neural network
**Goal:** To implement a simple 2-layer neural network from scratch with NumPy and use it to predict survival on the Titanic dataset.

In [4]:
# Build the paths to the Kaggle Titanic CSVs from a single data directory
input_dir = "./data/"
train_file, test_file = (
    os.path.join(input_dir, csv_name) for csv_name in ("train.csv", "test.csv")
)

In [5]:
# Load the training and test splits into DataFrames with pandas' default CSV parsing
train, test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# Data
The data I'll be using is the [Titanic dataset from the Kaggle competition](https://www.kaggle.com/c/titanic). The goal of this competition is to predict whether or not a passenger survived the sinking.

In [6]:
# A glimpse of the data: display the first rows of the training frame
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Number of rows & columns, as a (rows, columns) tuple
train.shape

(891, 12)

## Data wrangling

In [8]:
# Non-null count per column; any column below the row count has missing values
train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [9]:
# Get the mode (for filling na)
# DataFrame.mode() can return several rows when a column has tied modes;
# .iloc[0] keeps only the first row, giving one value per column as a Series.
modes = train.mode().iloc[0]
modes

PassengerId        NaN
Survived             0
Pclass               3
Name               NaN
Sex               male
Age                 24
SibSp                0
Parch                0
Ticket            1601
Fare              8.05
Cabin          B96 B98
Embarked             S
Name: 0, dtype: object

In [10]:
# Replace NA with the most frequent training value of each column.
# The filled column is assigned back rather than calling
# `train.Age.fillna(..., inplace=True)`: that form mutates a Series obtained by
# attribute access (chained assignment) and, under pandas Copy-on-Write, leaves
# `train` itself unchanged.
train["Age"] = train["Age"].fillna(modes["Age"])
train["Embarked"] = train["Embarked"].fillna(modes["Embarked"])

In [11]:
# Map categorical variable to integer
# Assign the result back instead of using replace(inplace=True) on an
# attribute-accessed column, which is a chained in-place call that does not
# update `train` under pandas Copy-on-Write.
train["Sex"] = train["Sex"].replace({"male": 0, "female": 1})
train["Embarked"] = train["Embarked"].replace({"C": 0, "Q": 1, "S": 2})

In [12]:
# Remove the passenger ID and the columns not used as model inputs, leaving
# the label (Survived) plus the numeric features.
# Rebinding `train` with errors="ignore" (instead of an in-place drop) makes
# re-running this cell a no-op rather than a KeyError on already-removed columns.
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, errors="ignore")
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [13]:
# Check that there are no NAs: every column should report the full row count
train.count()

Survived    891
Pclass      891
Sex         891
Age         891
SibSp       891
Parch       891
Fare        891
dtype: int64

## A simple 2-layer neural network
- Code reference: KAI XUAN WEI's [A simple 2-layer neural network model](https://www.kaggle.com/vandermode/digit-recognizer/a-simple-2-layer-neural-network-model)
- Andrew Trask's [A Neural Network in 11 lines of Python](http://iamtrask.github.io/2015/07/12/basic-python-network/) might also be interesting

In [14]:
# Extract data from dataframe as a 2-D NumPy array (one row per passenger).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same array and works on old and new pandas alike.
data = train.values
data

array([[  0.    ,   3.    ,   0.    , ...,   1.    ,   0.    ,   7.25  ],
       [  1.    ,   1.    ,   1.    , ...,   1.    ,   0.    ,  71.2833],
       [  1.    ,   3.    ,   1.    , ...,   0.    ,   0.    ,   7.925 ],
       ..., 
       [  0.    ,   3.    ,   1.    , ...,   1.    ,   2.    ,  23.45  ],
       [  1.    ,   1.    ,   0.    , ...,   0.    ,   0.    ,  30.    ],
       [  0.    ,   3.    ,   0.    , ...,   0.    ,   0.    ,   7.75  ]])

In [15]:
# Separate the label column (column 0, Survived) from the feature columns.
# np.array(...) copies, so X and y do not alias `data`.
y = np.array(data[:, 0], dtype=int)
X = np.array(data[:, 1:], dtype=np.float64)

In [16]:
# Use the first 90% of the rows for training and the remainder for validation
total_rows, feature_count = X.shape
train_num = int(total_rows * .9)
val_num = total_rows - train_num

for label, value in (("features: ", feature_count), ("train: ", train_num), ("valid: ", val_num)):
    print(label, value)

features:  6
train:  801
valid:  90


In [17]:
# Positional split: leading rows train the model, trailing rows validate it
train_rows, val_rows = slice(None, train_num), slice(train_num, None)
X_train, X_val = X[train_rows], X[val_rows]
y_train, y_val = y[train_rows], y[val_rows]

In [18]:
# Data Preprocessing
# Center every feature on its training-set mean. The same training mean is
# applied to the validation rows, and the subtraction happens in place.
mean_pixel = np.mean(X_train, axis=0)
for split in (X_train, X_val):
    split -= mean_pixel

Note: In the following cell, the `6` assigned to `D` (the input dimension) is the number of input features.

In [19]:
# A simple 2-layer fully-connected neural network model
# Note we only use numpy

# Initialize our nn
def initialize_global_weights(D=6, H=500, C=2, seed=None):
    """(Re)initialize the module-level network parameters W1, b1, W2 and b2.

    Weights are drawn uniformly from [0, 0.001); biases start at zero.

    Parameters
    ----------
    D : int, default 6
        Number of input features (6 for the feature matrix built in this notebook).
    H : int, default 500
        Width of the hidden layer.
    C : int, default 2
        Number of output classes.
    seed : int or None, default None
        When given, weights are drawn from a private ``RandomState(seed)`` so the
        initialization is reproducible. When None, NumPy's global random state is
        used, which matches the previous behavior.

    Returns
    -------
    None
        The parameters are stored in the globals W1 (D, H), b1 (H,), W2 (H, C)
        and b2 (C,).
    """
    global W1, b1, W2, b2
    # np.random (module) and RandomState both expose rand(), so either can serve as rng.
    rng = np.random if seed is None else np.random.RandomState(seed)
    W1 = 0.001 * rng.rand(D, H)
    b1 = np.zeros(H)
    W2 = 0.001 * rng.rand(H, C)
    b2 = np.zeros(C)

initialize_global_weights()

In [20]:
# Single entry point for inference and for one gradient-descent step
def train_or_evaluate(X, y=None, loss_fn=None, lr=1e-3, reg=0.0):
    """Run the network forward and, if labels are given, take one training step.

    The model is two stacked affine layers, ``X.dot(W1) + b1`` followed by
    ``.dot(W2) + b2``, with no activation in between. It reads and updates the
    module-level parameters W1, b1, W2 and b2.

    Parameters
    ----------
    X : array
        Input rows; its second dimension must match the first dimension of W1.
    y : array or None
        Labels passed through to ``loss_fn``. When None, no training happens
        and only the scores are returned.
    loss_fn : callable or None
        Called as ``loss_fn(scores, y)`` and expected to return
        ``(loss, dscores)``. It is only used when ``y`` is given.
    lr : float
        Step size of the gradient-descent update.
    reg : float
        Coefficient of the ``reg * W`` term added to the gradients of W1 and W2.
        This term is not added to the reported loss.

    Returns
    -------
    The score matrix when ``y`` is None. Otherwise the loss value returned by
    ``loss_fn``, after the parameters have been updated in place.
    """
    global W1, W2, b1, b2

    # Forward pass through both affine layers
    hidden = np.dot(X, W1) + b1
    scores = np.dot(hidden, W2) + b2
    if y is None:
        return scores

    loss, grad_scores = loss_fn(scores, y)
    print('loss: %f' % loss)

    # Backward pass: every gradient is computed before any parameter changes
    grad_hidden = np.dot(grad_scores, W2.T)
    grad_W2 = np.dot(hidden.T, grad_scores) + reg * W2
    grad_b2 = grad_scores.sum(axis=0)
    grad_W1 = np.dot(X.T, grad_hidden) + reg * W1
    grad_b1 = grad_hidden.sum(axis=0)

    # Gradient-descent update, applied in place to the global arrays
    for param, grad in ((W1, grad_W1), (W2, grad_W2), (b1, grad_b1), (b2, grad_b2)):
        param -= lr * grad
    return loss

In [21]:
# Softmax cross-entropy loss and its gradient with respect to the scores
def softmax(scores, y):
    """Compute the mean softmax cross-entropy loss and a gradient on the scores.

    Parameters
    ----------
    scores : array, shape (N, C)
        Unnormalized class scores, one row per sample. The input is not modified.
    y : integer array, shape (N,)
        Index of the correct class for each row.

    Returns
    -------
    loss : float
        Negative log-probability of the correct class, averaged over the N rows.
    dscores : array, shape (N, C)
        Softmax probabilities with 1 subtracted at the correct class. This is
        the gradient of the summed per-row losses; it is not divided by N,
        unlike ``loss``.
    """
    n_rows = scores.shape[0]
    row_idx = np.arange(n_rows)

    # Subtracting each row's maximum keeps exp() from overflowing
    shifted = scores - scores.max(axis=1)[:, None]
    exp_scores = np.exp(shifted)
    probs = exp_scores / exp_scores.sum(axis=1)[:, None]

    loss = np.sum(-np.log(probs[row_idx, y])) / n_rows

    dscores = np.array(probs)
    dscores[row_idx, y] -= 1

    return loss, dscores

In [22]:
# Baseline: training accuracy with the freshly initialized (untrained) weights
scores = train_or_evaluate(X_train)
initial_pred = scores.argmax(axis=1)
print(np.mean(initial_pred == y_train))

0.646691635456


In [23]:
# Training our 2-layer model
# Restart from fresh weights, then run full-batch gradient-descent steps.
# Training stops early if the loss becomes infinite.
num_iters = 2000
initialize_global_weights()
for step in range(num_iters):
    loss = train_or_evaluate(X_train, y=y_train, loss_fn=softmax, lr=1e-7, reg=1e-5)
    diverged = np.isinf(loss)
    if diverged:
        break

loss: 0.693130
loss: 0.693127
loss: 0.693123
loss: 0.693120
loss: 0.693116
loss: 0.693113
loss: 0.693110
loss: 0.693106
loss: 0.693103
loss: 0.693100
loss: 0.693096
loss: 0.693093
loss: 0.693089
loss: 0.693086
loss: 0.693083
loss: 0.693079
loss: 0.693076
loss: 0.693073
loss: 0.693069
loss: 0.693066
loss: 0.693062
loss: 0.693059
loss: 0.693056
loss: 0.693052
loss: 0.693049
loss: 0.693046
loss: 0.693042
loss: 0.693039
loss: 0.693035
loss: 0.693032
loss: 0.693029
loss: 0.693025
loss: 0.693022
loss: 0.693019
loss: 0.693015
loss: 0.693012
loss: 0.693008
loss: 0.693005
loss: 0.693002
loss: 0.692998
loss: 0.692995
loss: 0.692992
loss: 0.692988
loss: 0.692985
loss: 0.692982
loss: 0.692978
loss: 0.692975
loss: 0.692971
loss: 0.692968
loss: 0.692965
loss: 0.692961
loss: 0.692958
loss: 0.692955
loss: 0.692951
loss: 0.692948
loss: 0.692944
loss: 0.692941
loss: 0.692938
loss: 0.692934
loss: 0.692931
loss: 0.692928
loss: 0.692924
loss: 0.692921
loss: 0.692918
loss: 0.692914
loss: 0.692911
loss: 0.69

In [24]:
# Score both splits with the trained weights and report the fraction of rows
# whose highest-scoring class matches the label (train first, validation second)
train_scores, val_scores = train_or_evaluate(X_train), train_or_evaluate(X_val)
train_acc = np.mean(train_scores.argmax(axis=1) == y_train)
val_acc = np.mean(val_scores.argmax(axis=1) == y_val)
print(train_acc, val_acc)

0.659176029963 0.677777777778


## Make our prediction

In [25]:
# Drop the columns that are not model inputs; PassengerId is kept because the
# submission file needs it.
# Rebinding `test` with errors="ignore" (instead of an in-place drop) makes
# re-running this cell a no-op rather than a KeyError on already-removed columns.
test = test.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, errors="ignore")
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.5,0,0,7.8292
1,893,3,female,47.0,1,0,7.0
2,894,2,male,62.0,0,0,9.6875
3,895,3,male,27.0,0,0,8.6625
4,896,3,female,22.0,1,1,12.2875


In [26]:
# Map categorical variable to integer
# Results are assigned back to the frame: calling replace/fillna with
# inplace=True on `test.Sex` / `test.Age` is a chained in-place call that does
# not update `test` under pandas Copy-on-Write.
test["Sex"] = test["Sex"].replace({"male": 0, "female": 1})

# Replace NA using the modes computed on the training set
test["Age"] = test["Age"].fillna(modes["Age"])
test["Fare"] = test["Fare"].fillna(modes["Fare"])
test.count()

PassengerId    418
Pclass         418
Sex            418
Age            418
SibSp          418
Parch          418
Fare           418
dtype: int64

In [27]:
# Build the test feature matrix: every column after PassengerId (column 0).
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the portable form.
testdata = test.values
# Apply the same centering as X_train / X_val. The network was fit on
# mean-centered inputs, so uncentered test features would be scored on a
# shifted scale.
X_test = testdata[:, 1:].astype(np.float64) - mean_pixel
X_test.shape

(418, 6)

In [28]:
## Predictions
# The predicted class of each test row is the index of its highest score
test_scores = train_or_evaluate(X_test)
test_yhat = test_scores.argmax(axis=1)

In [29]:
# Write the Kaggle submission: one row per test passenger with the predicted label
submission = pd.DataFrame({"PassengerId": test.PassengerId, "Survived": test_yhat})
submission.to_csv("survival_submission-nn.csv", index=False)