In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import tree
from collections import defaultdict
import math
import re

# Load data

In [3]:
source_train_df = pd.read_csv("data/train.csv")
source_test_df = pd.read_csv("data/test.csv")

# Derived value extraction

In [4]:
# Set some data as categorical if it indeed is
source_train_df["Pclass"] = source_train_df["Pclass"].astype('category')
source_test_df["Pclass"] = source_test_df["Pclass"].astype('category')

In [5]:
def add_name_derived_features(df):
    df["FamilyName"] = df["Name"].map(lambda name: name.split(",")[0])
    df["NameHasQuotes"] = df["Name"].map(lambda name: 1 if name.find("\"") != -1 else 0)
    df["NameHasParentheses"] = df["Name"].map(lambda name: 1 if name.find("(") != -1 else 0)

def add_title(df):
    
    def _get_title(name):
        title_and_name = name.split(",")[1]
        return title_and_name.split(".")[0].strip()
    
    df["Title"] = df["Name"].map(lambda name : _get_title(name) )
    
    title_to_title_family = {
         "Capt":       "officer",
         "Col":        "officer",
         "Major":      "officer",
         "Dr":         "officer",
         "Rev":        "officer",
         "Jonkheer":   "snob",
         "Don":        "snob",
         "Sir" :       "snob",
         "the Countess":"snob",
         "Dona":       "snob",
         "Lady" :      "snob",
         "Mme":        "married",
         "Ms":         "married",
         "Mrs" :       "married",
         "Miss" :      "single",
         "Mlle":       "single",
         "Mr" :        "man",
         "Master" :    "boy"
    }
    
    df["TitleGroup"] = df["Title"].map(lambda title: title_to_title_family[title])
    # Fill age nan for each group
    median_age_by_pclass_and_title = defaultdict(dict)
    for pclass in [1, 2, 3]:
        for title in title_to_title_family.keys():
            median_age_by_pclass_and_title[pclass][title] = df[(df.Pclass == pclass) & (df.Title == title)]["Age"].mean()
    
    def set_age(tuple):
        if tuple["Age"] is np.nan:
            tuple["Age"] = median_age_by_pclass_and_title[tuple["Pclass"]][tuple["Title"]]
        return tuple
    
    df["Age"] = [median_age_by_title[row["Title"]] if row.Age is np.nan else row["Age"] 
                        for _, row in df.iterrows()]
    
    df["isChild"] = [1. if row.Age < 15 else 0. for _, row in df.iterrows()]
    df["isOld"] = [1. if row.Age >= 65 else 0. for _, row in df.iterrows()]


# Add cabin deck
def add_deck(df):
    
    df['Cabin'] = df['Cabin'].map(lambda x : "Unknown" if x == "" or x is np.nan else x)
    df['Deck_Unknown'] = df['Cabin'].map(lambda x : 1 if x == "Unknown" else 0)
    
    deck_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
    for deck in deck_list:
        df['Deck_{}'.format(deck)] = df['Cabin'].map(lambda x : 1 if deck in x else 0)


def add_room_number(df):
    
    def _room_number(cabin):
        result = re.compile(r"([0-9]+)").search(cabin)
        if result:
            return result.group()
        return 0
    
    df['Room'] = df['Cabin'].map(lambda cell: _room_number(cell) if cell != "Unknown" else np.nan).astype(float) 
    
    def _has_room(room):
        if room is np.nan or math.isnan(room):
            return 0
        return 1
    
    df['HasRoom'] = df['Room'].map(_has_room)
    room_mean = df['Room'].mean()
    df['Room'] = df['Room'].fillna(room_mean)
    df['Room'] = df['Room'].astype(int)
    df['RoomInFront'] = df['Room'].map(lambda cell: 1 if cell <= room_mean else 0)
    df['RoomInBack'] = df['Room'].map(lambda cell: 1 if cell > room_mean else 0)
            

def add_family_size(df):
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['Singleton'] = df['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    df['SmallFamily'] = df['FamilySize'].map(lambda s: 1 if 2<=s<=4 else 0)
    df['LargeFamily'] = df['FamilySize'].map(lambda s: 1 if 5<=s else 0)


def add_fare_per_person(df):
    df['FarePerPerson'] = df['Fare']/(df['FamilySize']+1)

    
def add_ticket_derived_features(df):
    ticket_count = df["Ticket"].value_counts()
    df["TicketCount"] = df["Ticket"].map(lambda ticket : ticket_count[ticket])
    df["TicketPrefix"] = df["Ticket"].map(lambda ticket : ticket.split(" ")[0] if " " in ticket else "")
    df["TicketNumber"] = df["Ticket"].map(lambda ticket : ticket.split(" ")[1] if " " in ticket else ticket)
    df["TicketNumberSize"] = df["TicketNumber"].map(lambda ticket : len(ticket))
    
    def _get_first_digit(ticket_number):
        if re.match(r"\d+", ticket_number):
            return "{}".format(ticket_number)[0]
        return np.nan
    
    df["TicketFirstDigit"] = df["TicketNumber"].map(_get_first_digit).astype(float)


def add_new_features(df):
    add_name_derived_features(df)
    add_title(df)
    add_deck(df)
    add_family_size(df)
    add_fare_per_person(df)
    add_room_number(df)
    add_ticket_derived_features(df)

In [6]:
add_new_features(source_train_df)
add_new_features(source_test_df)

ticket_prefix_categories = list(source_train_df["TicketPrefix"].unique())

In [7]:
print(source_train_df.columns)

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked',
       u'FamilyName', u'NameHasQuotes', u'NameHasParentheses', u'Title',
       u'TitleGroup', u'isChild', u'isOld', u'Deck_Unknown', u'Deck_A',
       u'Deck_B', u'Deck_C', u'Deck_D', u'Deck_E', u'Deck_F', u'Deck_T',
       u'Deck_G', u'FamilySize', u'Singleton', u'SmallFamily', u'LargeFamily',
       u'FarePerPerson', u'Room', u'HasRoom', u'RoomInFront', u'RoomInBack',
       u'TicketCount', u'TicketPrefix', u'TicketNumber', u'TicketFirstDigit'],
      dtype='object')


In [8]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
source_train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilyName,NameHasQuotes,NameHasParentheses,Title,TitleGroup,isChild,isOld,Deck_Unknown,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_T,Deck_G,FamilySize,Singleton,SmallFamily,LargeFamily,FarePerPerson,Room,HasRoom,RoomInFront,RoomInBack,TicketCount,TicketPrefix,TicketNumber,TicketFirstDigit
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,Braund,0,0,Mr,man,0.0,0.0,1,0,0,0,0,0,0,0,0,1,1,0,0,3.625,49,0,1,0,1,A/5,21171,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,0,1,Mrs,married,0.0,0.0,0,0,0,1,0,0,0,0,0,1,1,0,0,35.64165,85,1,0,1,1,PC,17599,1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,Heikkinen,0,0,Miss,single,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,7.925,49,0,1,0,1,STON/O2.,3101282,3.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle,0,1,Mrs,married,0.0,0.0,0,0,0,1,0,0,0,0,0,1,1,0,0,26.55,123,1,0,1,2,,113803,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,Allen,0,0,Mr,man,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,8.05,49,0,1,0,1,,373450,3.0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Unknown,Q,Moran,0,0,Mr,man,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,8.4583,49,0,1,0,1,,330877,3.0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,McCarthy,0,0,Mr,man,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,51.8625,46,1,1,0,1,,17463,1.0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,Unknown,S,Palsson,0,0,Master,boy,1.0,0.0,1,0,0,0,0,0,0,0,0,4,0,1,0,4.215,49,0,1,0,4,,349909,3.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,Unknown,S,Johnson,0,1,Mrs,married,0.0,0.0,1,0,0,0,0,0,0,0,0,2,0,1,0,3.7111,49,0,1,0,3,,347742,3.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,Unknown,C,Nasser,0,1,Mrs,married,1.0,0.0,1,0,0,0,0,0,0,0,0,1,1,0,0,15.0354,49,0,1,0,2,,237736,2.0


In [9]:
# Fare
# It is known that fare == 0 is an error, so will be replacing it to nan
source_train_df["Fare"] = source_train_df["Fare"].map(lambda _fare: np.nan if _fare == 0.0 else _fare)
source_test_df["Fare"] = source_test_df["Fare"].map(lambda _fare: np.nan if _fare == 0.0 else _fare)

In [10]:
# Replace nan values
def replace_nan_values(df):
    return df.fillna(df.median())
    
train_no_nans_df = replace_nan_values(source_train_df)
test_no_nans_df = replace_nan_values(source_test_df)

In [11]:
# Drop columns we are not interested
columns_to_drop = ["Name", "Ticket", "Cabin"]

simplified_train_df = train_no_nans_df.drop(columns_to_drop, axis=1)
simplified_test_df = test_no_nans_df.drop(columns_to_drop, axis=1)

In [12]:
# Nominal attributes are changed to values
categorical_columns = ["Pclass", "Sex", "Embarked", "Title", "TitleGroup", "TicketPrefix"]

expanded_train_df = pd.get_dummies(simplified_train_df, columns=categorical_columns)
expanded_columns = expanded_train_df.columns.values.tolist()

expanded_test_df = pd.get_dummies(simplified_test_df, columns=categorical_columns)

In [13]:
# In case any column of test is not present in train, set it to zero
all_columns = set(expanded_train_df.columns).union(set(expanded_test_df.columns))
for column in all_columns:
    if column not in expanded_train_df.columns:
        expanded_train_df[column] = 0
    if column not in expanded_test_df.columns:
        expanded_test_df[column] = 0

In [14]:
print(expanded_train_df.columns)
print(expanded_test_df.columns)

Index([u'PassengerId', u'Survived', u'Age', u'SibSp', u'Parch', u'Fare',
       u'FamilyName', u'NameHasQuotes', u'NameHasParentheses', u'isChild',
       ...
       u'TicketPrefix_W/C', u'TicketPrefix_WE/P', u'TicketPrefix_A.',
       u'TicketPrefix_STON/OQ.', u'TicketPrefix_SC/A.3', u'TicketPrefix_AQ/3.',
       u'TicketPrefix_LP', u'TicketPrefix_SC/A4', u'TicketPrefix_AQ/4',
       u'Title_Dona'],
      dtype='object', length=114)
Index([u'PassengerId', u'Age', u'SibSp', u'Parch', u'Fare', u'FamilyName',
       u'NameHasQuotes', u'NameHasParentheses', u'isChild', u'isOld',
       ...
       u'TicketPrefix_A/4.', u'TicketPrefix_Fa', u'TicketPrefix_S.P.',
       u'Title_Major', u'Title_Capt', u'TicketPrefix_W/C',
       u'TicketPrefix_S.O.P.', u'TicketPrefix_S.W./PP', u'Title_the Countess',
       u'Title_Don'],
      dtype='object', length=114)


In [15]:
assert len(expanded_train_df.columns) == len(expanded_test_df.columns)

In [16]:
expanded_train_df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,FamilyName,NameHasQuotes,NameHasParentheses,isChild,isOld,Deck_Unknown,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_T,Deck_G,FamilySize,Singleton,SmallFamily,LargeFamily,FarePerPerson,Room,HasRoom,RoomInFront,RoomInBack,TicketCount,TicketNumber,TicketFirstDigit,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Capt,Title_Col,Title_Don,Title_Dr,Title_Jonkheer,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Title_the Countess,TitleGroup_boy,TitleGroup_man,TitleGroup_married,TitleGroup_officer,TitleGroup_single,TitleGroup_snob,TicketPrefix_,TicketPrefix_A./5.,TicketPrefix_A.5.,TicketPrefix_A/4,TicketPrefix_A/4.,TicketPrefix_A/5,TicketPrefix_A/5.,TicketPrefix_A/S,TicketPrefix_A4.,TicketPrefix_C,TicketPrefix_C.A.,TicketPrefix_C.A./SOTON,TicketPrefix_CA,TicketPrefix_CA.,TicketPrefix_F.C.,TicketPrefix_F.C.C.,TicketPrefix_Fa,TicketPrefix_P/PP,TicketPrefix_PC,TicketPrefix_PP,TicketPrefix_S.C./A.4.,TicketPrefix_S.C./PARIS,TicketPrefix_S.O./P.P.,TicketPrefix_S.O.C.,TicketPrefix_S.O.P.,TicketPrefix_S.P.,TicketPrefix_S.W./PP,TicketPrefix_SC,TicketPrefix_SC/AH,TicketPrefix_SC/PARIS,TicketPrefix_SC/Paris,TicketPrefix_SCO/W,TicketPrefix_SO/C,TicketPrefix_SOTON/O.Q.,TicketPrefix_SOTON/O2,TicketPrefix_SOTON/OQ,TicketPrefix_STON/O,TicketPrefix_STON/O2.,TicketPrefix_SW/PP,TicketPrefix_W./C.,TicketPrefix_W.E.P.,TicketPrefix_W/C,TicketPrefix_WE/P,TicketPrefix_A.,TicketPrefix_STON/OQ.,TicketPrefix_SC/A.3,TicketPrefix_AQ/3.,TicketPrefix_LP,TicketPrefix_SC/A4,TicketPrefix_AQ/4,Title_Dona
0,1,0,22.0,1,0,7.25,Braund,0,0,0.0,0.0,1,0,0,0,0,0,0,0,0,1,1,0,0,3.625,49,0,1,0,1,21171,2.0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,38.0,1,0,71.2833,Cumings,0,1,0.0,0.0,0,0,0,1,0,0,0,0,0,1,1,0,0,35.64165,85,1,0,1,1,17599,1.0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1,26.0,0,0,7.925,Heikkinen,0,0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,7.925,49,0,1,0,1,3101282,3.0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1,35.0,1,0,53.1,Futrelle,0,1,0.0,0.0,0,0,0,1,0,0,0,0,0,1,1,0,0,26.55,123,1,0,1,2,113803,1.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,35.0,0,0,8.05,Allen,0,0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,8.05,49,0,1,0,1,373450,3.0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Prepare raw data for algorithms

In [17]:
# Features that will have a role in the classification
selected_features = [
    "PassengerId",
    "Age",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Sex_female", "Sex_male",
    #"NameHasQuotes", "NameHasParentheses",
    "Fare",
    "FarePerPerson",
    "SibSp",
    "Parch",
    "TicketCount",
    #"TicketFirstDigit",
    "FamilySize",
    "Singleton",
    "SmallFamily",
    "LargeFamily",
    "HasRoom",
    "Room",
    "RoomInBack",
    "RoomInFront",
    "isChild",
    "isOld",
    "Deck_A", "Deck_B", "Deck_C", "Deck_D", "Deck_E", "Deck_F", "Deck_T", "Deck_G"
] +\
["TitleGroup_{}".format(family_group) for family_group in ["officer", "snob", "married", "single", "man", "boy"]] +\
["TicketPrefix_{}".format(ticket_prefix) for ticket_prefix in ticket_prefix_categories]

# X and Y are the input and output of the classifier algorithm
train_y = train_no_nans_df.Survived.astype(int).values

# test_x and train_x must have the same number of columns and
# test_x has no "Survived" column so we must drop it from train_x
train_x = expanded_train_df[selected_features].values

test_x = expanded_test_df[selected_features].values

# Centroid

In [18]:
# Basic decision tree Pipeline
classifier = Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('classifier', NearestCentroid())
    ])

parameters = {
    'classifier__metric': ["manhattan", "euclidean"], 
    'classifier__shrink_threshold': [None, .05, .1, .2, .5, .55, .6, .7, .74, .75, .77, .8, .85, .9, 1],
}

nearest_centroid_clf = GridSearchCV(classifier, parameters, cv=5)

nearest_centroid_clf.fit(train_x, train_y)

print ("Best parameters found: ")
print (nearest_centroid_clf.best_params_)

NFOLDS = 5
scores = cross_val_score(nearest_centroid_clf.best_estimator_, train_x, train_y, cv=NFOLDS)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

nearest_centroid_prediction = nearest_centroid_clf.predict(test_x)

# Add the prediction to the test dataset
test_classified_with_nearest_centroid = source_test_df.assign(Survived=list(nearest_centroid_prediction))

# Save to upload to Kaggle
test_classified_with_nearest_centroid.to_csv("results/test_classified_with_nearest_centroid.csv", columns=["PassengerId", "Survived"], index=False)

Best parameters found: 
{'classifier__shrink_threshold': 0.75, 'classifier__metric': 'manhattan'}
Expected performance: 79.79% (+/-1.75).


# Basic decision tree

In [19]:
# Basic decision tree Pipeline
classifier = Pipeline([
        ('classifier', tree.DecisionTreeClassifier())
    ])

max_depths = [10, 20, 30, 40, 50, 70, 100, 150, 200, 300, 400, 1000]
parameters = {
    'classifier__max_depth': max_depths, 
    'classifier__criterion': ["gini", "entropy"],
    'classifier__splitter': ["best", "random"],
    'classifier__min_samples_split':[2, 3, 4, 5, 7, 10, 15, 20, 25],
    'classifier__random_state': [1],
    'classifier__max_features': [2, 5, 10, 20, 40, "auto", "sqrt", "log2", None]
}

tree_clf = GridSearchCV(classifier, parameters, cv=5)

tree_clf.fit(train_x, train_y)

print ("Best parameters found: ")
print (tree_clf.best_params_)

NFOLDS = 5
scores = cross_val_score(tree_clf.best_estimator_, train_x, train_y, cv=NFOLDS)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

tree_prediction = tree_clf.predict(test_x)

# Add the prediction to the test dataset
test_classified_with_decision_tree = source_test_df.assign(Survived=list(tree_prediction))

# Save to upload to Kaggle
test_classified_with_decision_tree.to_csv("results/test_classified_with_decision_tree.csv", columns=["PassengerId", "Survived"], index=False)

Best parameters found: 
{'classifier__max_features': 20, 'classifier__min_samples_split': 25, 'classifier__splitter': 'best', 'classifier__max_depth': 10, 'classifier__random_state': 1, 'classifier__criterion': 'gini'}
Expected performance: 83.18% (+/-2.71).


# Predict test values with KNN

In [20]:
# KNN Pipeline
classifier = Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('classifier', KNeighborsClassifier())
    ])

k_values = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50]
parameters = {
    'classifier__n_neighbors': k_values, 
    'classifier__weights': ["uniform", "distance"],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

knn_clf = GridSearchCV(classifier, parameters, cv=5)

knn_clf.fit(train_x, train_y)

print ("Best parameters found: ")
print (knn_clf.best_params_)

NFOLDS = 5
scores = cross_val_score(knn_clf.best_estimator_, train_x, train_y, cv=NFOLDS)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

# Make the prediction over the test set
knn_prediction = knn_clf.predict(test_x)

# Add the prediction to the test dataset
test_classified_with_knn = source_test_df.assign(Survived=list(knn_prediction))

# Save to upload to Kaggle
test_classified_with_knn.to_csv("results/test_classified_with_knn.csv", columns=["PassengerId", "Survived"], index=False)

Best parameters found: 
{'classifier__algorithm': 'auto', 'classifier__n_neighbors': 5, 'classifier__weights': 'distance'}
Expected performance: 82.61% (+/-2.16).


# Predict test values with RandomForest

In [None]:
# Random forest Pipeline

classifier = Pipeline([
        ('classifier', RandomForestClassifier())
    ])


parameter_grid = {
                 'classifier__max_depth' : [4, 6, 12, 20, None],
                 'classifier__criterion': ['gini', 'entropy'],
                 'classifier__n_estimators': [2, 10, 50, 100],
                 'classifier__max_features': ['sqrt', 'auto', 'log2', None],
                 'classifier__min_samples_split': [2, 3, 10],
                 'classifier__min_samples_leaf': [1, 3, 10],
                 'classifier__bootstrap': [True, False],
                 'classifier__n_jobs': [-1]
                 }

#cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(classifier,
                           scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=5)

grid_search.fit(train_x, train_y)

random_forest_clf = grid_search
parameters = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

NFOLDS = 5
scores = cross_val_score(grid_search.best_estimator_, train_x, train_y, cv=NFOLDS)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

# Make the prediction over the test set
random_forest_prediction = random_forest_clf.predict(test_x)

# Add the prediction to the test dataset
test_classified_with_rf = source_test_df.assign(Survived=list(random_forest_prediction))

# Save to upload to Kaggle
test_classified_with_rf.to_csv("results/test_classified_with_rf.csv", columns=["PassengerId", "Survived"], index=False)

# Predict with SVM

In [None]:
# Random forest Pipeline
from sklearn.pipeline import Pipeline
from sklearn import svm


tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
]

scores = ['precision', 'recall']

for score in scores:

    clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(train_x, train_y)

    print "Best parameters for SVM"
    print clf.best_params_
    
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print('Best parameters: {}'.format(clf.best_params_))

    NFOLDS = 5
    scores = cross_val_score(clf.best_estimator_, train_x, train_y, cv=NFOLDS)
    print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

    # Make the prediction over the test set
    svm_prediction = clf.predict(test_x)

    # Add the prediction to the test dataset
    test_classified_with_svm = source_test_df.assign(Survived=list(svm_prediction))

    # Save to upload to Kaggle
    test_classified_with_svm.to_csv("results/test_classified_with_svm_{}.csv".format(score), columns=["PassengerId", "Survived"], index=False)
