# Predicting who got iced in the Titanic
This is a machine learning project that is meant to solidify an understanding of important algorithms used  
It was modeled after https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

## Packages and Data

In [323]:
# Important standard packages
import pandas as pd
import numpy as np
import random
import time
import warnings
# Preprocessing Tools
from sklearn.model_selection import train_test_split, KFold, cross_val_score, ShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.ensemble import VotingClassifier



# Algorithms / learning models to test
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [324]:
# Loading the datasets
train_orig = pd.read_csv("~/Desktop/titanic/train.csv")
test_orig = pd.read_csv("~/Desktop/titanic/test.csv")
best_submit = pd.read_csv("/home/ghost/Desktop/git/kaggle/titanic/best_submit.csv")

In [293]:
# I like to keep a copy of the original data so I make changes on train
train = train_orig.copy()
test = test_orig.copy()

# Quick Setup
Below this contains the scratchwork for making this setup code  
Setup <a name='bookmark2' />

In [295]:
train_orig = pd.read_csv("~/Desktop/titanic/train.csv")
test = pd.read_csv("~/Desktop/titanic/test.csv")
train = train_orig.copy()
# Creating the training data for the y
train_y = train['Survived']
train_y
train.drop(['Survived'], axis = 1, inplace=True)

# Create the Cabin information needed for one hot encoding

test_cabin_vals = set(test.Cabin.unique().tolist())
train_cabin_vals = set(train_orig.Cabin.unique().tolist())
cabin_in_both = list(train_cabin_vals.intersection(test_cabin_vals))[1:] # the extra [1:] is to exclude nan value

# Create the Title information needed for one hot encoding
test_title_vals = set(test.Name.str.split(",",expand=True)[1].str.split(".", expand=True)[0])
train_title_vals = set(train_orig.Name.str.split(",",expand=True)[1].str.split(".", expand=True)[0])
title_in_both = list(test_title_vals.intersection(train_title_vals))

# Create the Surname information needed for one hot encoding
test_surname_vals = set(test.Name.str.split(",",expand=True)[0])
train_surname_vals = set(train_orig.Name.str.split(",", expand=True)[0])
surname_in_both = list(train_surname_vals.intersection(test_surname_vals))

def preprocess(data):
    # Extract data on the titles of the individuals on the ship
    # One hot encode title
    data['title'] = data.Name.str.split(",",expand=True)[1].str.split(".", expand=True)[0]
    title_encode = pd.get_dummies(data['title'])
    data = pd.concat([data, title_encode[title_in_both]], axis=1, sort=False)
    
    # Extracting surnames of the individuals on the ship
    # One hot encode surname
    data['surname'] = data.Name.str.split(",", expand=True)[0]
    surname_encode = pd.get_dummies(data['surname'])
    data = pd.concat([data, surname_encode[surname_in_both]], axis = 1, sort=False)
    
    # One hot encode all the cabin values that are in both train and test data 
    # Note: removing the cabin encoding actually makes the test results more accurate; results in overfitting
#     cabin_encode = pd.get_dummies(data['Cabin'])
#     data = pd.concat([data,cabin_encode[cabin_in_both]], axis = 1, sort=False)
    
    # Replace null values for some columns
    data.fillna(value={'Age':data.Age.median(), 'Embarked':data.Embarked.mode()[0], 'Fare':data.Fare.median()}, inplace=True)
    # Engineering some features
    data['fam_size'] = data['SibSp'] + data['Parch']
    data['isalone'] = 0
    data['isalone'].loc[data['fam_size'] < 1] = 1
#     data['farebins'] = pd.cut(data['Fare'], 8)
#     data['agebins'] = pd.cut(data['Age'], 12)
    data['farebins'] = pd.qcut(data['Fare'], 4)
    data['agebins'] = pd.qcut(data['Age'], 4)
    # One hot encoding some of the columns
    dummy = pd.concat([pd.get_dummies(data[s]) for s in ['Sex', 'Embarked', 'farebins', 'agebins']], axis=1, sort=False)
    data = pd.concat([data, dummy], axis=1, sort=False)
    # Isolating columns with categorical variables and dropping them
    cols = data.columns
    # Continue with removal of the unecessary columns
    num_cols = data._get_numeric_data().columns
    cat_cols = list(set(cols)-set(num_cols))
    data.drop(cat_cols,axis=1,inplace=True)
    # The line below generates a warning but should be ok
    #data.columns = data.columns.astype(str).str.replace("\]", "_")
    # Renaming the columns that were once bins so that train and submission set have uniform column names
    # Should find a better way to do this
    rand_letters= ['a','b','c','d','e','f','g','h','i','j','k','l'] # This is for qcut
#     rand_letters= ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t'] # This is for cut
    index = 0
    for i in data.columns:
        if '(' in str(i):
            data.rename({i:rand_letters[index]}, axis = 1, inplace=True)
            index+=1
    return data
# Applying preprocessing to the submission dataset
train = preprocess(train)
submission = preprocess(test)
print(train.shape)
print(submission.shape)

(891, 173)
(418, 173)


In [296]:
# Train test split, .75
x_train, x_test, y_train, y_test = train_test_split(train, train_y, test_size = .3)

### Jump to the algorithms
Go to <a href=#bookmark> algorithms</a>

## Cleaning Missing Data and Feature Engineering

In [164]:
# Extracting titles from names
print(train_orig.Name.str.split(",",expand=True)[1].str.split(".", expand=True)[0].unique())
test.Name.str.split(",",expand=True)[1].str.split(".", expand=True)[0].unique()

[' Mr' ' Mrs' ' Miss' ' Master' ' Don' ' Rev' ' Dr' ' Mme' ' Ms' ' Major'
 ' Lady' ' Sir' ' Mlle' ' Col' ' Capt' ' the Countess' ' Jonkheer']


array([' Mr', ' Mrs', ' Miss', ' Master', ' Ms', ' Col', ' Rev', ' Dr',
       ' Dona'], dtype=object)

In [258]:
# trying to create surname values
temp = train_orig.Name.str.split(",",expand=True)[0]
temp2 = test.Name.str.split(",", expand=True)[0]
print(set(temp).intersection(set(temp2)))

{'Goodwin', 'Burns', 'Nilsson', 'Dodge', 'Kink-Heilmann', 'Dean', 'Wright', 'Connolly', 'Harder', 'Spedden', 'Cumings', 'Peter', 'Cook', 'Palsson', 'Fortune', 'Betros', 'Douglas', 'Ilmakangas', 'Cor', 'White', 'Svensson', 'Widener', 'Lennon', 'Davidson', 'Wiklund', 'Danbom', 'Warren', 'Duran y More', 'Kimball', 'Moore', 'Spencer', 'Chaffee', 'Lefebre', 'Compton', 'Frolicher-Stehli', 'Johnston', 'McNamee', 'Goldsmith', 'Sandstrom', 'Hocking', 'Watt', 'Carrau', 'Laroche', 'Hirvonen', 'Louch', 'Smith', 'Cribb', 'Rice', 'Fleming', 'Risien', 'Davies', 'Chapman', 'Lines', 'Giles', 'Robins', 'Andrew', 'Gale', 'Moubarek', 'Elias', 'Hansen', 'Kink', 'Clarke', 'Bradley', 'Kenyon', 'Caldwell', 'Abbott', 'Weisz', 'Andersson', 'Angle', 'Lindell', 'Astor', 'Asplund', 'Davison', 'Becker', 'Karlsson', 'Murphy', 'Phillips', 'Sage', 'Johansson', 'Minahan', 'Faunthorpe', 'Samaan', 'Olsson', 'Williams', 'Carr', 'Vander Planke', 'Jonsson', 'Keane', 'Aks', 'Allison', 'Kiernan', 'Karun', 'Olsen', 'Cavendish'

In [111]:
# One hot encode for all the Cabin values that are in both test and train
temp1 = train_orig.copy()
temp2 = test.copy()

test_cabin_vals = set(temp2.Cabin.unique().tolist())
# print(test_cabin_vals)
train_cabin_vals = set(temp1.Cabin.unique().tolist())
in_both = list(train_cabin_vals.intersection(test_cabin_vals))[1:] # the extra [1:] is to exclude nan value
print(in_both)
train_cabin_encode = pd.get_dummies(temp1['Cabin'])
print(train_cabin_encode[in_both])

['D37', 'B71', 'D28', 'C106', 'B58 B60', 'D', 'D30', 'B41', 'C7', 'D15', 'F33', 'G6', 'E46', 'F G63', 'D21', 'E31', 'C23 C25 C27', 'C85', 'D10 D12', 'B69', 'B57 B59 B63 B66', 'C46', 'B78', 'D19', 'A34', 'E50', 'C86', 'C62 C64', 'F4', 'C32', 'F2', 'C78', 'C101', 'B51 B53 B55', 'C54', 'C22 C26', 'E34']
     D37  B71  D28  C106  B58 B60  D  D30  B41  C7  D15  ...  C62 C64  F4  \
0      0    0    0     0        0  0    0    0   0    0  ...        0   0   
1      0    0    0     0        0  0    0    0   0    0  ...        0   0   
2      0    0    0     0        0  0    0    0   0    0  ...        0   0   
3      0    0    0     0        0  0    0    0   0    0  ...        0   0   
4      0    0    0     0        0  0    0    0   0    0  ...        0   0   
..   ...  ...  ...   ...      ... ..  ...  ...  ..  ...  ...      ...  ..   
886    0    0    0     0        0  0    0    0   0    0  ...        0   0   
887    0    0    0     0        0  0    0    0   0    0  ...        0   0   
888  

In [176]:
# Creating the training data for the y
train_y = train['Survived']
train_y
train.drop(['Survived'], axis = 1, inplace=True)

KeyError: 'Survived'

In [198]:
def preprocess(data):
    # Replace null values for some columns
    data.fillna(value={'Age':data.Age.median(), 'Embarked':data.Embarked.mode()[0], 'Fare':data.Fare.median()}, inplace=True)
    # Engineering some features
    data['fam_size'] = data['SibSp'] + data['Parch']
    data['isalone'] = 0
    data['isalone'].loc[data['fam_size'] < 1] = 1
    data['farebins'] = pd.qcut(data['Fare'], 4)
    data['agebins'] = pd.qcut(data['Age'], 4)
    # One hot encoding some of the columns
    dummy = pd.concat([pd.get_dummies(data[s]) for s in ['Sex', 'Embarked', 'farebins', 'agebins']], axis=1, sort=False)
    data = pd.concat([data, dummy], axis=1, sort=False)
    # Isolating columns with categorical variables and dropping them
    cols = data.columns
    num_cols = data._get_numeric_data().columns
    cat_cols = list(set(cols)-set(num_cols))
    data.drop(cat_cols,axis=1,inplace=True)
    # The line below generates a warning but should be ok
    #data.columns = data.columns.astype(str).str.replace("\]", "_")
    # Renaming the columns that were once bins so that train and submission set have uniform column names
    rand_letters= ['a','b','c','d','e','f','g','h']
    index = 0
    for i in data.columns:
        if '(' in str(i):
            data.rename({i:rand_letters[index]}, axis = 1, inplace=True)
            index+=1
    return data


In [199]:
# Applying preprocessing to the submission dataset
train = preprocess(train)
submission = preprocess(test)
print(train.shape)
print(submission.shape)
# Train test split, .75
x_train, x_test, y_train, y_test = train_test_split(train, train_y, test_size = .3)

AttributeError: 'DataFrame' object has no attribute 'Embarked'

In [76]:
#Using Median age for now, this can be replaced with something else
#Replace Embarked with mode value
train.fillna(value={'Age':train.Age.median(), 'Embarked':train.Embarked.mode()[0]}, inplace=True)
train.columns[train.isna().any()].tolist() # <-- shows all null columns
#train.columns[train.isnull().any()] # <-- shows all nulls columns

['Cabin']

In [77]:
# Feature Engineering
# The goal is to engineer four features: familysize, isalone, farebins, agebins
train['fam_size'] = train['SibSp'] + train['Parch']
train['isalone'] = 0
train['isalone'].loc[train['fam_size'] < 1] = 1
train['farebins'] = pd.qcut(train['Fare'], 4)
train['agebins'] = pd.qcut(train['Age'], 4)

In [78]:
# Checking the categorical columns
cols = train.columns

num_cols = train._get_numeric_data().columns
cat_cols = list(set(cols)-set(num_cols))

In [79]:
#One hot encoding of sex, ticket, embarked
# One hot encoding might be easier just using pandas instead of sklearn 
dummy = pd.concat([pd.get_dummies(train[s]) for s in ['Sex','Ticket', 'Embarked', 'farebins', 'agebins']], axis=1, sort=False)

In [92]:
rand_letters= ['a','b','c','d','e','f','g','h']
index = 0
for i in dummy.columns:
    if '(' in str(i):
        dummy.rename({i:rand_letters[index]}, axis = 1, inplace=True)
        print(str(i))
        print(rand_letters[index])
        index+=1

(-0.001, 7.91]
a
(7.91, 14.454]
b
(14.454, 31.0]
c
(31.0, 512.329]
d
(0.419, 22.0]
e
(22.0, 28.0]
f
(28.0, 35.0]
g
(35.0, 80.0]
h


In [93]:
dummy

Unnamed: 0,female,male,110152,110413,110465,110564,110813,111240,111320,111361,...,Q,S,a,b,c,d,e,f,g,h
0,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
887,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
888,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
889,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [16]:
# One hot encoding 
# train_temp=train.copy()
# dummy = pd.get_dummies(train['Ticket'])
# Thought about filling in Cabin with random values of most visited cabin 
#random.choice(random.choice(train_temp.Cabin.mode()).split())
train = pd.concat([train, dummy], axis=1, sort=False)
train.drop(cat_cols,axis=1,inplace=True)

In [17]:
# Double check to make sure all null columns are gone
train.columns[train.isnull().any()]

Index([], dtype='object')

In [18]:
# Preview of information
train.info()
train.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 703 entries, PassengerId to (35.0, 80.0]
dtypes: float64(2), int64(7), uint8(694)
memory usage: 666.6 KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fam_size,isalone,female,...,Q,S,"(-0.001, 7.91]","(7.91, 14.454]","(14.454, 31.0]","(31.0, 512.329]","(0.419, 22.0]","(22.0, 28.0]","(28.0, 35.0]","(35.0, 80.0]"
358,359,1,3,28.0,0,0,7.8792,0,1,1,...,1,0,1,0,0,0,0,1,0,0
587,588,1,1,60.0,1,1,79.2,2,0,0,...,0,0,0,0,0,1,0,0,0,1
222,223,0,3,51.0,0,0,8.05,0,1,0,...,0,1,0,1,0,0,0,0,0,1
184,185,1,3,4.0,0,2,22.025,2,0,1,...,0,1,0,0,1,0,1,0,0,0
76,77,0,3,28.0,0,0,7.8958,0,1,0,...,0,1,1,0,0,0,0,1,0,0
274,275,1,3,28.0,0,0,7.75,0,1,1,...,1,0,1,0,0,0,0,1,0,0
140,141,0,3,28.0,0,2,15.2458,2,0,1,...,0,0,0,0,1,0,0,1,0,0
544,545,0,1,50.0,1,0,106.425,1,0,0,...,0,0,0,0,0,1,0,0,0,1
66,67,1,2,29.0,0,0,10.5,0,1,1,...,0,1,0,1,0,0,0,0,1,0
135,136,0,2,23.0,0,0,15.0458,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [19]:
# String replacement for xgboost
train.columns = train.columns.astype(str).str.replace("\]", "_")

In [14]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fam_size,isalone,farebins,agebins
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,"(-0.001, 7.896]","(27.0, 35.75]"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1,0,"(-0.001, 7.896]","(35.75, 76.0]"
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,"(7.896, 14.454]","(35.75, 76.0]"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,1,"(7.896, 14.454]","(23.0, 27.0]"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2,0,"(7.896, 14.454]","(0.169, 23.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,27.0,0,0,A.5. 3236,8.0500,,S,0,1,"(7.896, 14.454]","(23.0, 27.0]"
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0,1,"(31.472, 512.329]","(35.75, 76.0]"
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,1,"(-0.001, 7.896]","(35.75, 76.0]"
416,1308,3,"Ware, Mr. Frederick",male,27.0,0,0,359309,8.0500,,S,0,1,"(7.896, 14.454]","(23.0, 27.0]"


## Data Analysis Using Various ML Algorithms

First, the most basic algorithm is KNN so we'll try it on knn with some tuning

In [8]:
# Train test split, .75
x_train, x_test, y_train, y_test = train_test_split(train, train_y, test_size = .3)

In [12]:
# KNN
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(x_train, y_train)
train_predict = knn.predict(x_test)
accuracy_score(y_test, train_predict)

0.664179104477612

In [10]:
# Using kfold cv to tune hyperparameters
# Testig for just one case
#scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
#scores.mean()
k_range = range(2, 50)
k_scores = []
for i in k_range:
    knn = KNeighborsClassifier(n_neighbors=i)
    ks = cross_val_score(knn, x_train, y_train, cv = 5, scoring='accuracy')
    k_scores.append(ks.mean())
print(k_scores)

[0.6324038914490527, 0.6162879672299028, 0.6228161802355351, 0.6082617511520737, 0.6307268817204301, 0.6370629800307219, 0.6452174091141833, 0.625926472094214, 0.6451145929339478, 0.6435399897593446, 0.6403655913978494, 0.6371653865847413, 0.6323653865847414, 0.6339655913978495, 0.6404045058883769, 0.6387660010240654, 0.6372296979006656, 0.6404426011264721, 0.6388167946748592, 0.6307778801843319, 0.6388167946748592, 0.6468174091141833, 0.6436428059395801, 0.648468817204301, 0.6468430107526881, 0.6500688172043011, 0.6549331285202252, 0.6597462365591398, 0.6549202252944187, 0.658120430107527, 0.6613079365079365, 0.6613079365079365, 0.656494828469022, 0.6565204301075268, 0.6533458269329238, 0.6517329237071172, 0.648583922171019, 0.648558320532514, 0.6469198156682028, 0.6516817204301075, 0.638842396313364, 0.6500815156169995, 0.6404296979006656, 0.6420426011264722, 0.6388167946748592, 0.6452428059395802, 0.6436811059907834, 0.6484686123911929]


In [11]:
# Print optimal k and the index 
print(k_scores.index(max(k_scores)))
print(max(k_scores))


30
0.6613079365079365


In [25]:
# List of machine learning algorithms to go through
MLA = [    
    XGBClassifier(),
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis()
]


In [26]:
# Using shufflesplit to split the data
ssplit = ShuffleSplit(train_size=.75, test_size=.25)
ssplit.split(x_train)

<generator object BaseShuffleSplit.split at 0x7f6a52a92050>

In [27]:
# Creating a df to compare various attributes of an algorithm
compare_col = ['Name', 'Parameters', 'Train Accuracy', 'Test Accuracy', 'Time']
compare = pd.DataFrame(columns = compare_col)
row = 0
for alg in MLA:
    print('-----Starting %s -----' %str(alg))
    time0 = time.time()
    score = cross_val_score(alg, x_train, y_train, cv = 10, scoring = 'accuracy')
    compare.loc[row, 'Name'] = alg.__class__.__name__
    compare.loc[row, 'Parameters'] = str(alg.get_params())
    compare.loc[row, 'Train Accuracy'] = score.mean()
    model = alg.fit(x_train, y_train)
    train_predict = model.predict(x_test)
#     y_sub = model.predict(submission)
    compare.loc[row, 'Test Accuracy'] = accuracy_score(y_test, train_predict)
    compare.loc[row, 'Time'] = time.time() - time0
    row+=1


-----Starting XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1) -----
-----Starting AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None) -----
-----Starting BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False) -----
-----Starting ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_dep



-----Starting RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False) -----
-----Starting GaussianProcessClassifier(copy_X_train=True, kernel=None, max_iter_predict=100,
                          multi_class='one_vs_rest', n_jobs=None,
                          n_restarts_optimizer=0, optimizer='fmin_l_bfgs_b',
                          random_state=None, warm_start=False) -----




-----Starting LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0) -----












-----Starting PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False) -----
-----Starting RidgeClassifierCV(alphas=array([ 0.1,  1. , 10. ]), class_weight=None, cv=None,
                  fit_intercept=True, normalize=False, scoring=None,
                  store_cv_values=False) -----
-----Starting SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, 



-----Starting NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=None,
      shrinking=True, tol=0.001, verbose=False) -----




-----Starting LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) -----




-----Starting DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') -----
-----Starting ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random') -----
-----Starting LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001) -----
-----St



In [28]:
# Show top performers
compare.sort_values(by=['Test Accuracy', 'Train Accuracy'], ascending = False, inplace=True)
# Finding the 8 best performing algorithms and performing hyperparameter tuning with them
# Will then use these 8 algorithms to build an ensemble learner 
best_algs = []
for i in compare.iloc[:8].Name:
    best_algs.append(i)
best_algs

['XGBClassifier',
 'GradientBoostingClassifier',
 'RandomForestClassifier',
 'AdaBoostClassifier',
 'RidgeClassifierCV',
 'BaggingClassifier',
 'LinearDiscriminantAnalysis',
 'LogisticRegressionCV']

In [29]:
# Create a voting block based on the best algorithms
voting_block = [
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgb', XGBClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('abc', ensemble.AdaBoostClassifier()),
    ('dtc', tree.DecisionTreeClassifier()),
    ('lr', linear_model.LogisticRegression()),
    ('knn', KNeighborsClassifier(n_neighbors = 23))
]

In [30]:
# Soft and Hard ensemble learner
vote_hard = ensemble.VotingClassifier(estimators = voting_block, voting='hard')
vote_soft = ensemble.VotingClassifier(estimators = voting_block, voting='soft')

In [31]:
# Hard Accuracy Score
vote_hard_cv = cross_val_score(vote_hard, x_train, y_train, cv = 3)
vote_hard.fit(x_train, y_train)
y_pred = vote_hard.predict(x_test)
accuracy_score(y_pred, y_test)



0.8432835820895522

In [32]:
# Soft Accuracy Score
vote_soft_cv = (vote_soft, x_train, y_train)
vote_soft.fit(x_train, y_train)
y_pred = vote_soft.predict(x_test)
accuracy_score(y_pred, y_test)



0.8475336322869955

Algorithms <a name='bookmark' />

Go to <a href=#bookmark2> setup</a>

In [297]:
# Smaller voting block mixed with grid search below, excluding knn because we already did that above, nneighbrs = 23
# The best parameters for XGBClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100}
smaller_vb = [
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgb', XGBClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('abc', ensemble.AdaBoostClassifier())
]

In [334]:
# Hyperparameter Tuning with GridSearch
grid_n_estimator = [150,200, 250, 300,350,400]
learning_rate = [0.001, 0.01,0.02,0.05, 0.07]
sample_ratio = [.2, .225,.25,.3]
feature_ratio = [.35, .4, .45, .5]
max_depth = [2, 3, 4, 5,6]
min_leaf = [2,3,4,5,6]


grid_param = [
    [
        # Regular Gradient Boost
        {
            'max_depth': [1,2,3,5,6,7],
            'n_estimators': grid_n_estimator,
            'learning_rate': [0.01, 0.015, 0.02, 0.025]
        }
    ],
#     [
#         # XGBoost
#         {
#             'learning_rate': learning_rate,
#             'n_estimators': [50, 73,75,100, 125, 150],
#             'max_depth': [1,2,3,4]
#         }
#     ],
    [
        # Bagging Classifier
        {
            'n_estimators': [225, 250, 275, 300],
            'max_samples': sample_ratio,
            'max_features': feature_ratio
        }
    ],
    [
        # Random Forest
        {
            'n_estimators': [75,100, 125, 150],
            'criterion': ['entropy'],
            'max_depth': [4, 5,6,7],
            'min_samples_leaf': [4,5,6,7]
        }
    ],
    [
        #AdaBoost
        {
            'n_estimators': grid_n_estimator,
            'learning_rate': learning_rate
        }
    ]
]


In [335]:
def shrink_tune(algs, params, iterations): # takes in as input the algorithms and initial parameters
    
    for alg, param in zip(algs, params):
        print(alg)
#         print(param)
        for i in range(iterations):
            print('current i: ', i)
            best_search = GridSearchCV(estimator = alg[1], param_grid=param, cv = 10, scoring='roc_auc')
            best_search.fit(x_train, y_train)
            best_param = best_search.best_params_
            if (i == iterations - 1):
                break
            else:
                # change the parameters to make them the median
                for parameter in best_param:
                    # if length of list is 0, continue
                    if len(param[0][parameter]) == 1:
                        continue
                    # if type of parameter is a string, keep it and continue
                    if isinstance(best_param[parameter], str):
                        param[0][parameter] = [best_param[parameter]]
                        continue
                    diff = param[0][parameter][1] - param[0][parameter][0]
                    print(diff)
                    # stopping condition; no more updates to make
                    if (diff == 1):
                        new = [best_param[parameter]]
                    else:
                        print(l)
                        print(h)
                        l = best_param[parameter] - diff/2
                        h = best_param[parameter] + diff/2
                        if isinstance(best_param[parameter], int):
                            l = int(l)
                            h = int(h)
                        # Quick band aid; should make this better 
                        if l < 0:
                            l = 0
                        new = [l, best_param[parameter], h]
                    param[0][parameter] = new
        
        print('The best parameters for {} is {}'.format(alg[1].__class__.__name__, best_param))


In [336]:
# Tune Once
def tune_once (algs, params):
    for alg, param in zip(algs, params):
        print(alg[1].__class__.__name__)
    #     print(param)
        best_search = GridSearchCV(estimator = alg[1], param_grid=param, cv = 10, scoring = 'roc_auc')
        best_search.fit(x_train, y_train)
        best_param = best_search.best_params_
        print('The best parameters for {} is {}'.format(alg[1].__class__.__name__, best_param))
    

In [337]:
# optimizing hyperparameters by shrinking space, the alg in smaller_vb are mostly trees
# shrink_tune(smaller_vb, grid_param, 6)
tune_once (smaller_vb, grid_param)

GradientBoostingClassifier




The best parameters for GradientBoostingClassifier is {'learning_rate': 0.025, 'max_depth': 3, 'n_estimators': 350}
BaggingClassifier




The best parameters for BaggingClassifier is {'max_features': 0.5, 'max_samples': 0.3, 'n_estimators': 250}
RandomForestClassifier




The best parameters for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 7, 'n_estimators': 150}
AdaBoostClassifier




The best parameters for AdaBoostClassifier is {'learning_rate': 0.07, 'n_estimators': 250}


In [356]:
### optimized smaller voting block
smaller_vb = [
    ('gbc', ensemble.GradientBoostingClassifier(learning_rate=0.01625, max_depth = 2, n_estimators = 212)),
#     ('xgb', XGBClassifier(learning_rate=0.0725, max_depth = 2, n_estimators=55)),
    ('bc', ensemble.BaggingClassifier(max_features = .5125, max_samples=.28125, n_estimators=318)),
    ('rfc', ensemble.RandomForestClassifier(criterion='entropy', max_depth=7, min_samples_leaf = 4,n_estimators=143)),
    ('abc', ensemble.AdaBoostClassifier(learning_rate = 0.05, n_estimators = 150)),
#     ('knn', KNeighborsClassifier(n_neighbors=30)),
#     ('lrcv', linear_model.LogisticRegressionCV(solver = 'newton-cg', max_iter = 10)),
#     ('lda', discriminant_analysis.LinearDiscriminantAnalysis())
]
vote_soft = ensemble.VotingClassifier(estimators = smaller_vb, voting='soft')
# vote_soft_cv = (vote_soft, x_train, y_train)
vote_soft.fit(x_train, y_train)
y_pred = vote_soft.predict(x_test)
print(accuracy_score(y_pred, y_test))


vote_soft = ensemble.VotingClassifier(estimators = smaller_vb, voting='soft')
vote_soft.fit(train, train_y)
submission_pred = vote_soft.predict(submission)

0.8283582089552238


In [357]:
submission_val = pd.DataFrame(columns = ['PassengerId', 'Survived'])
submission_val['PassengerId'] = submission['PassengerId']
submission_val['Survived'] = pd.Series(submission_pred)
print(submission_val.groupby('Survived').count())
submission_val.to_csv("./submit.csv", index=False)
submission_val

          PassengerId
Survived             
0                 265
1                 153


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [325]:
best_submit.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId
Survived,Unnamed: 1_level_1
0,259
1,159


In [None]:
### 6/11/2020
### Getting rid of xgb increased the accuracy score
### Logic was that shallow trees would not be as good for the dataset?
smaller_vb = [
    ('gbc', ensemble.GradientBoostingClassifier(learning_rate=0.01625, max_depth = 2, n_estimators = 212)),
#     ('xgb', XGBClassifier(learning_rate=0.0725, max_depth = 2, n_estimators=55)),
    ('bc', ensemble.BaggingClassifier(max_features = .5125, max_samples=.28125, n_estimators=318)),
    ('rfc', ensemble.RandomForestClassifier(criterion='entropy', max_depth=7, min_samples_leaf = 4,n_estimators=143)),
    ('abc', ensemble.AdaBoostClassifier(learning_rate = 0.05, n_estimators = 150)),
#     ('knn', KNeighborsClassifier(n_neighbors=30)),
#     ('lrcv', linear_model.LogisticRegressionCV(solver = 'newton-cg', max_iter = 10)),
#     ('lda', discriminant_analysis.LinearDiscriminantAnalysis())
]
vote_soft = ensemble.VotingClassifier(estimators = smaller_vb, voting='soft')
# vote_soft_cv = (vote_soft, x_train, y_train)
vote_soft.fit(x_train, y_train)
y_pred = vote_soft.predict(x_test)
print(accuracy_score(y_pred, y_test))


vote_soft = ensemble.VotingClassifier(estimators = smaller_vb, voting='soft')
vote_soft.fit(train, train_y)
submission_pred = vote_soft.predict(submission)

In [282]:
### optimized smaller voting block
smaller_vb = [
    ('gbc', ensemble.GradientBoostingClassifier(learning_rate=0.01625, max_depth = 2, n_estimators = 212)),
    ('xgb', XGBClassifier(learning_rate=0.0725, max_depth = 2, n_estimators=55)),
    ('bc', ensemble.BaggingClassifier(max_features = .5125, max_samples=.28125, n_estimators=318)),
    ('rfc', ensemble.RandomForestClassifier(criterion='entropy', max_depth=7, min_samples_leaf = 4,n_estimators=143)),
    ('abc', ensemble.AdaBoostClassifier(learning_rate = 0.05, n_estimators = 150)),
    ('knn', KNeighborsClassifier(n_neighbors=30)),
#     ('lrcv', linear_model.LogisticRegressionCV(solver = 'newton-cg', max_iter = 10)),
#     ('lda', discriminant_analysis.LinearDiscriminantAnalysis())
]
vote_soft = ensemble.VotingClassifier(estimators = smaller_vb, voting='soft')
vote_soft_cv = (vote_soft, x_train, y_train)
vote_soft.fit(x_train, y_train)
y_pred = vote_soft.predict(x_test)
print(accuracy_score(y_pred, y_test))
submission_pred = vote_soft.predict(submission)

0.8432835820895522


In [None]:
#Endnote 1:
# Old algorithm
temp = []
for alg, param in zip(smaller_vb, grid_param):
    print(alg[1].__class__.__name__)
#     print(param)
    best_search = GridSearchCV(estimator = alg[1], param_grid=param, cv = 10, scoring = 'roc_auc')
    best_search.fit(x_train, y_train)
    best_param = best_search.best_params_
    print('The best parameters for {} is {}'.format(alg[1].__class__.__name__, best_param))
    temp.append(alg[1].set_params(**best_param))
'''
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.05, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
[{'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.05, 0.1, 0.15, 0.2]}]
The best parameters for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)
[{'learning_rate': [0.05, 0.1, 0.15, 0.2], 'n_estimators': [100, 200, 300, 400], 'max_depth': [2, 3, 4, 5, 6]}]
The best parameters for XGBClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100}
BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)
[{'n_estimators': [100, 200, 300, 400], 'max_samples': [0.2, 0.25, 3, 0.35], 'max_features': [0.2, 0.25, 0.3]}]
The best parameters for BaggingClassifier is {'max_features': 0.3, 'max_samples': 0.35, 'n_estimators': 200}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
[{'n_estimators': [100, 200, 300, 400], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 6], 'min_samples_leaf': [2, 3, 4, 5, 6]}]
The best parameters for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 5, 'n_estimators': 100}
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
[{'n_estimators': [100, 200, 300, 400], 'learning_rate': [0.05, 0.1, 0.15, 0.2]}]
The best parameters for AdaBoostClassifier is {'learning_rate': 0.05, 'n_estimators': 200}
'''