In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
import matplotlib
import random

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [4]:
df['Sex'].replace('male',0,inplace=True)
df['Sex'].replace('female',1,inplace=True)
df = df.drop(["Ticket","Name","PassengerId"],axis=1)
df['Embarked'].replace('S',int(0),inplace=True)
df['Embarked'].replace('C',int(1),inplace=True)
df['Embarked'].replace('Q',int(1),inplace=True)

df.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
886,0,2,0,27.0,0,0,13.0,,0.0
887,1,1,1,19.0,0,0,30.0,B42,0.0
888,0,3,1,,1,2,23.45,,0.0
889,1,1,0,26.0,0,0,30.0,C148,1.0
890,0,3,0,32.0,0,0,7.75,,1.0


In [5]:
# Add cabin into the feature set

df["Cabin"].fillna(0,inplace=True)
df['Cabin'] = df['Cabin'].astype(str)
df['Cabin'] = df.Cabin.str.replace(r'([A].*$)',"1")
df['Cabin'] = df.Cabin.str.replace(r'([B].*$)',"2")
df['Cabin'] = df.Cabin.str.replace(r'([C].*$)',"3")
df['Cabin'] = df.Cabin.str.replace(r'([D].*$)',"4")
df['Cabin'] = df.Cabin.str.replace(r'([E].*$)',"5")
df['Cabin'] = df.Cabin.str.replace(r'([F].*$)',"6")
df['Cabin'] = df.Cabin.str.replace(r'([G].*$)',"7")
df['Cabin'] = df.Cabin.str.replace(r'([T].*$)',"8")
df['Cabin'] = df['Cabin'].astype(int)


In [6]:
# after cabin is filled, we need to fill age
mean_age = df["Age"].mean()
std_age = df["Age"].std()
df["Age"] = df["Age"].fillna(random.randint(int(std_age),int(mean_age)))
df.count()

Survived    891
Pclass      891
Sex         891
Age         891
SibSp       891
Parch       891
Fare        891
Cabin       891
Embarked    889
dtype: int64

In [7]:
# Embarked is still missing. need to fill
mean_em = df["Embarked"].mean()
std_em = df["Embarked"].std()
df["Embarked"] = df["Embarked"].fillna(random.randint(int(std_em),int(mean_em)))
df.count()

Survived    891
Pclass      891
Sex         891
Age         891
SibSp       891
Parch       891
Fare        891
Cabin       891
Embarked    891
dtype: int64

In [8]:
x = df.drop(["Survived"],axis=1)
y = df["Survived"]

# The below is used to split a df into test and train sets.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.25, random_state=27)

def train_model(x_train,y_train,layer_shape,optimizer):
    # layer shape is in a python list [20,20,20]
    #solver options => solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default ‘adam’
    model = MLPClassifier(hidden_layer_sizes=(layer_shape), max_iter=6000, alpha=0.000000001,
                         solver=optimizer, verbose=False,random_state=21,tol=0.000000001,learning_rate="adaptive")
    model.fit(x_train, y_train)
    return model

In [27]:
type(y_train)

pandas.core.series.Series

In [9]:

#optimizer options => solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default ‘adam’ for MLPClassifier and regressor
optimizer = 'adam'
layer_shape = [30,30]
model = train_model(x_train,y_train,layer_shape,optimizer)
y_pred = model.predict(x_test)
## accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)
accuracy_score(y_test,y_pred)

0.7982062780269058

In [10]:
## Test data
# to use on the prediction, not testing accuracy
test_df_full= pd.read_csv('test.csv')
passenger_ids = test_df_full["PassengerId"]
test_df = test_df_full.drop(["PassengerId","Name","Ticket"],axis=1)
## check for NaN values with test_df["Embarked"].isnull().values.any()
## Age and Fare are missing some values.
## We will use a random number between standard variation and mean to fill the NaN values
test_df['Sex'].replace('male',0,inplace=True)
test_df['Sex'].replace('female',1,inplace=True)
test_df['Embarked'].replace('S',0,inplace=True)
test_df['Embarked'].replace('C',1,inplace=True)
test_df['Embarked'].replace('Q',1,inplace=True)
test_df["Cabin"].fillna(0,inplace=True)
test_df['Cabin'] = test_df['Cabin'].astype(str)
test_df['Cabin'] = test_df.Cabin.str.replace(r'([A].*$)',"1")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([B].*$)',"2")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([C].*$)',"3")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([D].*$)',"4")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([E].*$)',"5")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([F].*$)',"6")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([G].*$)',"7")
test_df['Cabin'] = test_df.Cabin.str.replace(r'([T].*$)',"8")
test_df['Cabin'] = test_df['Cabin'].astype(int)
test_df["Age"] = test_df["Age"].fillna(random.randint(int(std_age),int(mean_age)))
fare_std = int(df['Fare'].std())
fare_mean = int(df['Fare'].mean())
test_df['Fare'] = test_df["Fare"].fillna(random.randint(fare_mean,fare_std))
test_df.count()

Pclass      418
Sex         418
Age         418
SibSp       418
Parch       418
Fare        418
Cabin       418
Embarked    418
dtype: int64

In [11]:
test_x_input = test_df
#optimizer options => solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default ‘adam’ for MLPClassifier and regressor
optimizer = 'adam'
layer_shape = [30,30]
test_y_pred = model.predict(test_x_input)
## accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

In [12]:
test_df["Survived"] = test_y_pred
test_df["PassengerId"] = passenger_ids
test_df = test_df.drop(["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Cabin"],axis=1)


In [13]:
test_df.count()

Survived       418
PassengerId    418
dtype: int64

In [14]:
test_df.to_csv('titanic_sklearn__with_cabin_more data.csv', index=False)