In [319]:
from sklearn.datasets import fetch_mldata
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

In [320]:
from pandas import DataFrame, read_csv
import pandas as pd 
import matplotlib #only needed to determine Matplotlib version number
import numpy as np
import matplotlib.pyplot as plt
# Enable inline plotting
%matplotlib inline
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Upper bound on the number of DataFrame rows pandas renders before truncating.
DISPLAY_MAX_ROWS = 20
pd.options.display.max_rows = DISPLAY_MAX_ROWS

In [321]:
# Read the training and test tables from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [322]:
# Columns kept out of the feature matrix: the row identifier, the target and
# the fields this notebook does not feed to the model.
excluded_columns = ['PassengerId', 'Survived', 'Name', 'Cabin', 'Age', 'Embarked', 'Ticket']

# Index.difference ignores names that are absent (the test table is selected
# with the same list) and returns the remaining column names sorted.
X_train = train[train.columns.difference(excluded_columns)]  # independent variables
y_train = train['Survived']  # dependent variable
X_test = test[test.columns.difference(excluded_columns)]  # independent variables

In [323]:
def one_hot_dataframe(data, cols, replace=False, vec=None):
    """One-hot encode the columns ``cols`` of ``data`` with a ``DictVectorizer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode. It is never modified in place.
    cols : list of str
        Names of the columns to encode.
    replace : bool, default False
        When exactly ``True``, the returned frame has ``cols`` dropped and the
        encoded columns joined on instead. Otherwise ``data`` is returned
        unchanged.
    vec : vectorizer, optional
        An already fitted vectorizer to reuse. When given, it is only asked to
        ``transform`` the rows, so a second frame (e.g. a test set) gets
        exactly the same encoded columns as the frame it was fitted on.
        When omitted, a fresh ``DictVectorizer`` is created and fitted on
        ``data``, which is the original behaviour.

    Returns
    -------
    tuple
        ``(data, vecData, vec)``: the (possibly replaced) frame, a frame
        holding only the encoded columns (indexed like ``data``), and the
        vectorizer that produced them.
    """
    # One {column: value} dict per row, the input format the vectorizer expects.
    records = data[cols].to_dict(orient='records')

    if vec is None:
        vec = DictVectorizer()
        encoded = vec.fit_transform(records)
    else:
        encoded = vec.transform(records)

    # Newer scikit-learn releases expose get_feature_names_out() and have
    # dropped get_feature_names(); older releases only have the latter.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    vecData = pd.DataFrame(encoded.toarray(), index=data.index,
                           columns=list(feature_names))

    if replace is True:
        data = data.drop(cols, axis=1).join(vecData)
    return (data, vecData, vec)

In [324]:
# Placeholder written into every cell that is still missing after encoding.
MISSING_FILL_VALUE = 100

X_train2, _, _ = one_hot_dataframe(X_train, ['Sex'], replace=True)
X_train2 = X_train2.fillna(MISSING_FILL_VALUE)

X_test2, _, _ = one_hot_dataframe(X_test, ['Sex'], replace=True)
X_test2 = X_test2.fillna(MISSING_FILL_VALUE)

# The two frames are encoded by separate calls, so make the test frame carry
# exactly the training columns in the same order: a category missing from the
# test data becomes an all-zero column, one unseen in training is dropped.
X_test2 = X_test2.reindex(columns=X_train2.columns, fill_value=0)

In [325]:
# Show the encoded training features as the cell output.
X_train2

Unnamed: 0,Fare,Parch,Pclass,SibSp,Sex=female,Sex=male
0,7.2500,0,3,1,0.0,1.0
1,71.2833,0,1,1,1.0,0.0
2,7.9250,0,3,0,1.0,0.0
3,53.1000,0,1,1,1.0,0.0
4,8.0500,0,3,0,0.0,1.0
5,8.4583,0,3,0,0.0,1.0
6,51.8625,0,1,0,0.0,1.0
7,21.0750,1,3,3,0.0,1.0
8,11.1333,2,3,0,1.0,0.0
9,30.0708,0,2,1,1.0,0.0


In [326]:
# NOTE(review): `enc` is not referenced by any other cell visible in this
# notebook; confirm whether this encoder is still needed.
# NOTE(review): `n_values` appears to be deprecated/removed in newer
# scikit-learn releases; verify against the installed version.
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])

In [327]:
# mnist = fetch_mldata("MNIST original")

# # rescale the data, use the traditional train/test split
# X, y = mnist.data / 255., mnist.target
# X_train, X_test = X[:60000], X[60000:]
# y_train, y_test = y[:60000], y[60000:]

In [328]:
# Hyperparameters for the classifier: two hidden layers of 100 units each,
# trained with the 'sgd' solver; random_state=1 fixes the seed.
# NOTE(review): the features are passed in unscaled. scikit-learn recommends
# standardising inputs for MLPs; consider make_pipeline(StandardScaler(), mlp).
mlp_params = dict(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    alpha=1e-4,
    solver='sgd',
    verbose=10,
    tol=1e-4,
    random_state=1,
    learning_rate_init=.001,
)
mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train2, y_train)

# Persist the fitted model so the prediction cell can reload it from disk.
joblib.dump(mlp, 'titanic_model_mlp.pkl')

# Only a training-set score is reported; no labelled hold-out set is scored here.
train_score = mlp.score(X_train2, y_train)
print("Training set score: %f" % train_score)


Iteration 1, loss = 0.90836263
Iteration 2, loss = 0.71412880
Iteration 3, loss = 0.68181195
Iteration 4, loss = 0.66140087
Iteration 5, loss = 0.64620167
Iteration 6, loss = 0.63594034
Iteration 7, loss = 0.62861209
Iteration 8, loss = 0.62575156
Iteration 9, loss = 0.61412370
Iteration 10, loss = 0.61472286
Iteration 11, loss = 0.60839036
Iteration 12, loss = 0.60289633
Iteration 13, loss = 0.60000128
Iteration 14, loss = 0.59392623
Iteration 15, loss = 0.59769843
Iteration 16, loss = 0.58997695
Iteration 17, loss = 0.58900056
Iteration 18, loss = 0.58778125
Iteration 19, loss = 0.58751646
Iteration 20, loss = 0.58818631
Iteration 21, loss = 0.58078263
Iteration 22, loss = 0.58285699
Iteration 23, loss = 0.58554547
Iteration 24, loss = 0.57897812
Iteration 25, loss = 0.58544434
Iteration 26, loss = 0.58147415
Iteration 27, loss = 0.57384638
Iteration 28, loss = 0.57149067
Iteration 29, loss = 0.56866536
Iteration 30, loss = 0.56516069
Iteration 31, loss = 0.56492226
Iteration 32, los

In [333]:
# Reload the persisted model and predict on the encoded test features.
# NOTE(review): joblib.load unpickles the file; only load files you trust.
mlp = joblib.load('titanic_model_mlp.pkl')
p = mlp.predict(X_test2)

# Pair each test PassengerId with its predicted label.
predictions = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': p})

In [335]:
# Show the prediction frame (PassengerId and Survived columns) as the cell output.
predictions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [341]:
# Write the predictions as comma-separated UTF-8 text, without the index column.
# NOTE(review): the predictions/ directory presumably has to exist already.
predictions.to_csv(
    'predictions/result.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)