# Initial processing
Load Libraries

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn import tree, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.exceptions import DataConversionWarning
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
# Suppress warnings
# Installs two session-wide "ignore" filters: one for scikit-learn's
# DataConversionWarning and one for every DeprecationWarning.
# NOTE(review): the DeprecationWarning filter is not limited to a module, so it
# also hides deprecation notices from any library used below — confirm this is intended.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

Load data

In [None]:
# Load the training set and the competition (test) set
train = pd.read_csv("../input/train.csv")
compe = pd.read_csv("../input/test.csv")

# Stack both sets (training rows first) so every feature is engineered once over
# all passengers.
# - DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
# - ignore_index=True gives every row a unique label. Without it each file keeps
#   its own 0..n-1 labels, so labels overlap and label-based operations such as
#   DataFrame.drop(label) match two rows instead of one.
data = pd.concat([train, compe], sort=False, ignore_index=True)

# Fill and convert data
Map string values to integers ("Sex", "Embarked", "Cabin", "Title")

In [None]:
# Sex -> 0 (male) / 1 (female)
data["Sex"] = data["Sex"].map({"male": 0, "female": 1})

# Embarked -> fill missing values with the most frequent value, then encode S/C/Q as 0/1/2
data["Embarked"] = data["Embarked"].fillna(data["Embarked"].mode()[0])
data["Embarked"] = data["Embarked"].map({"S": 0, "C": 1, "Q": 2})

# Cabin -> integer code of the leading letter; a missing cabin becomes 0.
# Because the replacement values are integers, a matching cell is replaced as a whole.
cabin_mapping = {
    r'^A(.*)': 1, r'^B(.*)': 2, r'^C(.*)': 3, r'^D(.*)': 4,
    r'^E(.*)': 5, r'^F(.*)': 6, r'^G(.*)': 7, r'^T(.*)': 8,
}
data["Cabin"] = data["Cabin"].fillna(0)
data["Cabin"] = data["Cabin"].replace(cabin_mapping, regex=True).astype(int)

# Title -> integer group (1-4) derived from the honorific found in Name.
# The patterns are raw strings: "\." inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, and a SyntaxWarning on recent Python versions).
# The pattern text seen by the regex engine is unchanged.
title_mapping = {
    # group 1
    r'(.+)Mr\.(.+)': 1, r'(.+)Master\.(.+)': 1,
    # group 2
    r'(.+)Dr\.(.+)': 2, r'(.+)Don\.(.+)': 2, r'(.+)Major\.(.+)': 2,
    r'(.+)Sir\.(.+)': 2, r'(.+)Col\.(.+)': 2, r'(.+)Jonkheer\.(.+)': 2,
    r'(.+)Capt\.(.+)': 2, r'(.+)Countess\.(.+)': 2, r'(.+)Dona\.(.+)': 2,
    # group 3
    r'(.+)Rev\.(.+)': 3,
    # group 4
    r'(.+)Ms\.(.+)': 4, r'(.*)Miss\.(.+)': 4, r'(.+)Mrs\.(.+)': 4,
    r'(.+)Mme\.(.+)': 4, r'(.+)Lady\.(.+)': 4, r'(.+)Mlle\.(.+)': 4,
}
# astype(int) fails loudly if any Name matched none of the patterns above
data["Title"] = data["Name"].replace(title_mapping, regex=True).astype(int)

Estimate missing ages from the median age of each title group

In [None]:
# Fill each missing Age with the median Age of the passengers sharing its Title.
# The per-title medians are computed once (the loop re-ran the groupby for every
# title id) and looked up per row, so no title ids are hard-coded and a title
# group that is absent from the data no longer raises KeyError.
median_age_by_title = data.groupby('Title')['Age'].median()
missing_age = data['Age'].isnull()
# .values makes the assignment positional, so it does not depend on index labels
data.loc[missing_age, 'Age'] = data.loc[missing_age, 'Title'].map(median_age_by_title).values

Estimate missing fares from the median fare of each passenger class

In [None]:
# Fill each missing Fare with the median Fare of the passengers in the same Pclass.
# The per-class medians are computed once (the loop re-ran the groupby for every
# class id) and looked up per row, so no class ids are hard-coded and a class
# that is absent from the data no longer raises KeyError.
median_fare_by_pclass = data.groupby('Pclass')['Fare'].median()
missing_fare = data['Fare'].isnull()
# .values makes the assignment positional, so it does not depend on index labels
data.loc[missing_fare, 'Fare'] = data.loc[missing_fare, 'Pclass'].map(median_fare_by_pclass).values

Add "FamilySize" and "IsFamily"

In [None]:
# FamilySize is the sum of the Parch and SibSp counts
relatives_aboard = data["Parch"] + data["SibSp"]
data['FamilySize'] = relatives_aboard
# IsFamily is the same count capped at 2: values 0 and 1 are kept, anything above 1 becomes 2
data['IsFamily'] = relatives_aboard.clip(upper=2)

Add "FamilySurvival"

In [None]:
def _relatives_outcome(group, passenger_id):
    """Summarise the known survival of everyone in ``group`` except one passenger.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one candidate family/ticket group; needs the 'PassengerId' and
        'Survived' columns.
    passenger_id
        PassengerId of the row to leave out.

    Returns
    -------
    int or None
        1 if any other member has Survived == 1, otherwise 0 if any other member
        has Survived == 0, otherwise None (no other member has a known outcome).
    """
    # Exclude the passenger by id rather than by index label: labels may be
    # duplicated between the stacked train and test rows, in which case
    # DataFrame.drop(label) would silently remove more than one row.
    others = group.loc[group['PassengerId'] != passenger_id, 'Survived']
    if others.max() == 1.0:
        return 1
    if others.min() == 0.0:
        return 0
    return None


# Surname is the part of Name before the first comma
data['LastName'] = data['Name'].str.split(',').str[0]
# Value kept when nothing is known about a passenger's group
DEFAULT_SURVIVAL_VALUE = 0.5
data['FamilySurvival'] = DEFAULT_SURVIVAL_VALUE

# Pass 1: passengers sharing both surname and fare are treated as one family
for _, grp_df in data.groupby(['LastName', 'Fare']):
    if len(grp_df) == 1:
        continue
    for passenger_id in grp_df['PassengerId']:
        outcome = _relatives_outcome(grp_df, passenger_id)
        if outcome is not None:
            data.loc[data['PassengerId'] == passenger_id, 'FamilySurvival'] = outcome

# Pass 2: passengers sharing a ticket; only rows not already set to 1 are revisited
for _, grp_df in data.groupby('Ticket'):
    if len(grp_df) == 1:
        continue
    for passenger_id, current in zip(grp_df['PassengerId'], grp_df['FamilySurvival']):
        if current == 0 or current == DEFAULT_SURVIVAL_VALUE:
            outcome = _relatives_outcome(grp_df, passenger_id)
            if outcome is not None:
                data.loc[data['PassengerId'] == passenger_id, 'FamilySurvival'] = outcome

Make AgeBin and FareBin

In [None]:
# Replace Age and Fare by the integer code of their quantile bin. Ten quantiles are
# requested; duplicates="drop" merges bins whose edges coincide, so fewer may result.
for source_col, binned_col in (("Age", "AgeBin"), ("Fare", "FareBin")):
    data[binned_col] = pd.qcut(data[source_col], 10, duplicates="drop", labels=False)

Drop unnecessary data and standardize

In [None]:
# `data` was built with the training rows first, so the first len(train) rows are
# the labelled ones (replaces the hard-coded 891).
n_train = len(train)

# Keep the target aside before it is removed from the feature table
train_target = data[:n_train]["Survived"].values
data.drop(['Survived', 'Name', 'PassengerId', 'Age', 'Fare', 'Ticket', 'LastName'], axis = 1, inplace = True)

# Standardize every remaining column to zero mean / unit variance.
# fit_transform returns a new array and leaves its input untouched; the result
# used to be discarded, so the features were never actually scaled. Store it back,
# keeping the column names and the index.
ss = StandardScaler()
data = pd.DataFrame(ss.fit_transform(data), columns=data.columns, index=data.index)

# Split back into the training part and the competition part
train  = data[:n_train]
compe  = data[n_train:]

Possible features  
["Sex", "FamilySurvival", "Title", "Pclass", "Embarked", "FamilySize", "SibSp", "IsFamily", "FareBin", "Cabin", "AgeBin", "Parch"]

In [None]:
# Preview the first rows of the processed feature table as a sanity check
data.head()

Feature importances

In [None]:
# Rank every candidate feature by its univariate ANOVA F-test against the target.
possible_features = ["Pclass", "Sex", "AgeBin", "FareBin", "FamilySize", "IsFamily", "FamilySurvival", "Parch", "SibSp", "Title", "Cabin", "Embarked"]
# k is passed by keyword: recent scikit-learn releases no longer accept it positionally.
selector = SelectKBest(f_classif, k=len(possible_features))
# Fit on the columns in exactly the order of `possible_features`. Fitting on the
# whole frame used the frame's own column order, so each score was printed next
# to the wrong feature name.
selector.fit(train[possible_features], train_target)
# Higher score = smaller p-value
scores = -np.log10(selector.pvalues_)
indices = np.argsort(scores)[::-1]

print('Feature importances:')
for idx in indices:
    print('%.2f %s' % (scores[idx], possible_features[idx]))

# Training

In [None]:
# Columns fed to the models
fparams = [
    "Sex", "FamilySurvival", "Title", "Pclass", "Embarked",
    "FamilySize", "SibSp", "IsFamily", "FareBin", "AgeBin",
]

# Feature matrices as plain NumPy arrays, one for fitting and one for the final prediction
train_features = train.loc[:, fparams].values
compe_features = compe.loc[:, fparams].values

# Number of cross-validation folds used by every grid search
CV_SPLIT_NUM = 6

RandomForestClassifier Grid Search

In [None]:
# Hyper-parameter grid for RandomForestClassifier (1008 candidates)
rfgs_parameters = {
    'n_estimators': [300],
    'max_depth': list(range(2, 14)),
    'max_features': list(range(2, 8)),
    'min_samples_split': list(range(4, 11)),
    'bootstrap': [True, False],
}
# random_state pins the forest's random sampling so repeated runs select the same
# parameters; n_jobs=-1 spreads the candidates x CV_SPLIT_NUM fits over all cores
# instead of running them one by one.
rfc_cv = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rfgs_parameters,
    cv=CV_SPLIT_NUM,
    n_jobs=-1,
)
rfc_cv.fit(train_features, train_target)
print("RFC GridSearch score:", rfc_cv.best_score_)
print("RFC GridSearch params: ")
print(rfc_cv.best_params_)

ExtraTreesClassifier Grid Search

In [None]:
# Hyper-parameter grid for ExtraTreesClassifier (1008 candidates)
etcgs_parameters = {
    'n_estimators': [300],
    'max_depth': list(range(2, 14)),
    'max_features': list(range(2, 8)),
    'min_samples_split': list(range(4, 11)),
    'bootstrap': [True, False],
}
# random_state pins the randomised tree construction so repeated runs select the
# same parameters; n_jobs=-1 spreads the candidates x CV_SPLIT_NUM fits over all
# cores instead of running them one by one.
etc_cv = GridSearchCV(
    ExtraTreesClassifier(random_state=42),
    etcgs_parameters,
    cv=CV_SPLIT_NUM,
    n_jobs=-1,
)
etc_cv.fit(train_features, train_target)
print("ETC GridSearch score:", etc_cv.best_score_)
print("ETC GridSearch params: ")
print(etc_cv.best_params_)

GradientBoostingClassifier Grid Search

In [None]:
# Hyper-parameter grid for GradientBoostingClassifier (2400 candidates)
gbcgs_parameters = {
    # NOTE(review): newer scikit-learn releases renamed "deviance" to "log_loss"
    # and later removed the old name; switch the value when upgrading.
    'loss': ["deviance", "exponential"],
    'n_estimators': [300],
    'learning_rate': [0.0025, 0.005, 0.0075, 0.01, 0.05, 0.1],
    'max_depth': list(range(1, 9)),
    'max_features': list(range(1, 6)),
    'min_samples_leaf': list(range(2, 7)),
}
# random_state pins the per-split feature sub-sampling (max_features is below the
# number of features) so repeated runs select the same parameters; n_jobs=-1
# spreads the candidates x CV_SPLIT_NUM fits over all cores.
gbc_cv = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gbcgs_parameters,
    cv=CV_SPLIT_NUM,
    n_jobs=-1,
)
gbc_cv.fit(train_features, train_target)
print("GBC GridSearch score:", gbc_cv.best_score_)
print("GBC GridSearch params: ")
print(gbc_cv.best_params_)

SVM Grid Search

In [None]:
# C and gamma are searched over the same 23 values (529 candidates in total)
svc_scale_grid = [
    0.001, 0.0025, 0.005, 0.0075,
    0.01, 0.025, 0.05, 0.075,
    0.1, 0.25, 0.5, 0.75,
    1, 2.5, 5, 7.5,
    10, 25, 50, 75,
    100, 500, 1000,
]
# Hyper-parameter grid for SVC
svcgs_parameters = {
    'kernel': ['rbf'],
    'C': svc_scale_grid,
    'gamma': svc_scale_grid,
    # probability=True so the best estimator exposes predict_proba
    'probability': [True],
}
# random_state pins the internal data shuffling used for the probability estimates;
# n_jobs=-1 spreads the candidates x CV_SPLIT_NUM fits over all cores.
svc_cv = GridSearchCV(
    svm.SVC(random_state=42),
    svcgs_parameters,
    cv=CV_SPLIT_NUM,
    n_jobs=-1,
)
svc_cv.fit(train_features, train_target)
print("SVC GridSearch score:", svc_cv.best_score_)
print("SVC GridSearch params: ")
print(svc_cv.best_params_)

VotingClassifier RF+ETC+GBC+SVM

In [None]:
# Soft-voting ensemble built from the best estimator of each grid search
tuned_members = [
    ('rfc', rfc_cv.best_estimator_),
    ('etc', etc_cv.best_estimator_),
    ('gbc', gbc_cv.best_estimator_),
    ('svm', svc_cv.best_estimator_),
]
# fit() returns the fitted ensemble itself, so construction and fitting can be chained
vc = VotingClassifier(estimators=tuned_members, voting='soft', n_jobs=4).fit(train_features, train_target)

# Output

In [None]:
# Predict the competition rows with the ensemble
survived = vc.predict(compe_features)
# Start the submission from the PassengerId column of the original test file
pred = pd.read_csv("../input/test.csv")[['PassengerId']].copy()
# Store the predictions as integers
pred['Survived'] = survived.astype(int)
pred.to_csv("../working/submission.csv", index=False)