# Predicting who got iced in the Titanic
This is a machine learning project that is meant to solidify an understanding of important algorithms used

## Packages and Data

In [351]:
# Important standard packages
import pandas as pd
import numpy as np
import random
import time
# Preprocessing Tools
from sklearn.model_selection import train_test_split, KFold, cross_val_score, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process



# Algorithms / learning models to test
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [460]:
# Loading the datasets
train_orig = pd.read_csv("/Users/Kvothe/Desktop/ml_projects/titanic/titanic/train.csv")
test = pd.read_csv("/Users/Kvothe/Desktop/ml_projects/titanic/titanic/test.csv")

In [461]:
# I like to keep a copy of the original data so I make changes on train
train = train_orig.copy()

## Cleaning Missing Data and Feature Engineering

In [462]:
# Creating the training data for the y
train_y = train['Survived']
train_y
train.drop(['Survived'], axis = 1, inplace=True)

In [463]:
# Looking at which columns have null values
train.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [464]:
def preprocess(data):
    # Replace null values for some columns
    data.fillna(value={'Age':data.Age.median(), 'Embarked':data.Embarked.mode()[0], 'Fare':data.Fare.median()}, inplace=True)
    # Engineering some features
    data['fam_size'] = data['SibSp'] + data['Parch']
    data['isalone'] = 0
    data['isalone'].loc[data['fam_size'] < 1] = 1
    data['farebins'] = pd.qcut(data['Fare'], 4)
    data['agebins'] = pd.qcut(data['Age'], 4)
    # One hot encoding some of the columns
    dummy = pd.concat([pd.get_dummies(data[s]) for s in ['Sex', 'Embarked', 'farebins', 'agebins']], axis=1, sort=False)
    data = pd.concat([data, dummy], axis=1, sort=False)
    # Isolating columns with categorical variables and dropping them
    cols = data.columns
    num_cols = data._get_numeric_data().columns
    cat_cols = list(set(cols)-set(num_cols))
    data.drop(cat_cols,axis=1,inplace=True)
    data.columns = data.columns.astype(str).str.replace("\]", "_")
    return data

In [465]:
# Applying preprocessing to the submission dataset
train = preprocess(train)
submission = preprocess(test)
print(train.shape)
print(submission.shape)

In [411]:
#Using Median age for now, this can be replaced with something else
#Replace Embarked with mode value
train.fillna(value={'Age':train.Age.median(), 'Embarked':train.Embarked.mode()[0]}, inplace=True)
train.columns[train.isna().any()].tolist() # <-- shows all null columns
#train.columns[train.isnull().any()] # <-- shows all nulls columns

['Cabin']

In [412]:
# Feature Engineering
# The goal is to engineer four features: familysize, isalone, farebins, agebins
train['fam_size'] = train['SibSp'] + train['Parch']
train['isalone'] = 0
train['isalone'].loc[train['fam_size'] < 1] = 1
train['farebins'] = pd.qcut(train['Fare'], 4)
train['agebins'] = pd.qcut(train['Age'], 4)

In [413]:
# Checking the categorical columns
cols = train.columns

num_cols = train._get_numeric_data().columns
cat_cols = list(set(cols)-set(num_cols))

In [414]:
#One hot encoding of sex, ticket, embarked
# One hot encoding might be easier just using pandas instead of sklearn 
dummy = pd.concat([pd.get_dummies(train[s]) for s in ['Sex','Ticket', 'Embarked', 'farebins', 'agebins']], axis=1, sort=False)

In [415]:
# One hot encoding 
# train_temp=train.copy()
# dummy = pd.get_dummies(train['Ticket'])
# Thought about filling in Cabin with random values of most visited cabin 
#random.choice(random.choice(train_temp.Cabin.mode()).split())
train = pd.concat([train, dummy], axis=1, sort=False)
train.drop(cat_cols,axis=1,inplace=True)

In [416]:
# Double check to make sure all null columns are gone
train.columns[train.isnull().any()]

Index([], dtype='object')

In [417]:
# Preview of information
train.info()
train.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 703 entries, PassengerId to (35.0, 80.0]
dtypes: float64(2), int64(7), uint8(694)
memory usage: 666.6 KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fam_size,isalone,female,...,Q,S,"(-0.001, 7.91]","(7.91, 14.454]","(14.454, 31.0]","(31.0, 512.329]","(0.419, 22.0]","(22.0, 28.0]","(28.0, 35.0]","(35.0, 80.0]"
546,547,1,2,19.0,1,0,26.0,1,0,1,...,0,1,0,0,1,0,1,0,0,0
636,637,0,3,32.0,0,0,7.925,0,1,0,...,0,1,0,1,0,0,0,0,1,0
812,813,0,2,35.0,0,0,10.5,0,1,0,...,0,1,0,1,0,0,0,0,1,0
590,591,0,3,35.0,0,0,7.125,0,1,0,...,0,1,1,0,0,0,0,0,1,0
289,290,1,3,22.0,0,0,7.75,0,1,1,...,1,0,1,0,0,0,1,0,0,0
386,387,0,3,1.0,5,2,46.9,7,0,0,...,0,1,0,0,0,1,1,0,0,0
294,295,0,3,24.0,0,0,7.8958,0,1,0,...,0,1,1,0,0,0,0,1,0,0
101,102,0,3,28.0,0,0,7.8958,0,1,0,...,0,1,1,0,0,0,0,1,0,0
315,316,1,3,26.0,0,0,7.8542,0,1,1,...,0,1,1,0,0,0,0,1,0,0
170,171,0,1,61.0,0,0,33.5,0,1,0,...,0,1,0,0,0,1,0,0,0,1


In [418]:
# String replacement for xgboost
train.columns = train.columns.astype(str).str.replace("\]", "_")

## Data Analysis Using Various ML Algorithms

First, the most basic algorithm is KNN so we'll try it on knn with some tuning

In [467]:
# Train test split, .75
x_train, x_test, y_train, y_test = train_test_split(train, train_y, test_size = .25)

In [290]:
# KNN
knn = KNeighborsClassifier(n_neighbors=45)
knn.fit(x_train, y_train)
train_predict = knn.predict(x_test)
accuracy_score(y_test, train_predict)

0.6322869955156951

In [None]:
# Using kfold cv to tune hyperparameters
# Testig for just one case
#scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
#scores.mean()
k_range = range(2, 50)
k_scores = []
for i in k_range:
    knn = KNeighborsClassifier(n_neighbors=i)
    ks = cross_val_score(knn, x_train, y_train, cv = 5, scoring='accuracy')
    k_scores.append(ks.mean())
print(k_scores)

In [292]:
# Print optimal k and the index 
print(k_scores.index(max(k_scores)))
print(max(k_scores))


29
0.676658063068118


In [468]:
# List of machine learning algorithms to go through
MLA = [    
    XGBClassifier(),
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis()
]


In [469]:
# Using shufflesplit to split the data
ssplit = ShuffleSplit(train_size=.75, test_size=.25)
ssplit.split(x_train)

<generator object BaseShuffleSplit.split at 0x128eb85d0>

In [470]:
# Creating a df to compare various attributes of an algorithm
compare_col = ['Name', 'Parameters', 'Train Accuracy', 'Test Accuracy', 'Time']
compare = pd.DataFrame(columns = compare_col)
row = 0
for alg in MLA:
    print('-----Starting %s -----' %str(alg))
    time0 = time.time()
    score = cross_val_score(alg, x_train, y_train, cv = 10, scoring = 'accuracy')
    compare.loc[row, 'Name'] = alg.__class__.__name__
    compare.loc[row, 'Parameters'] = str(alg.get_params())
    compare.loc[row, 'Train Accuracy'] = score.mean()
    model = alg.fit(x_train, y_train)
    train_predict = model.predict(x_test)
#     y_sub = model.predict(submission)
    compare.loc[row, 'Test Accuracy'] = accuracy_score(y_test, train_predict)
    compare.loc[row, 'Time'] = time.time() - time0
    row+=1


-----Starting XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) -----
-----Starting AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None) -----
-----Starting BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False) -----
-----Starting ExtraTreesClassifier(bootstrap=False, class_weight=None,

In [457]:
# Show top performers
compare.sort_values(by=['Test Accuracy', 'Train Accuracy'], ascending = False)

Unnamed: 0,Name,Parameters,Train Accuracy,Test Accuracy,Time
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.793387,0.847534,0.414732
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.790469,0.820628,0.218361
3,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.817315,0.811659,1.17721
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': 'warn',...",0.800714,0.807175,6.23229
12,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.77369,0.793722,0.067343
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.790491,0.784753,1.11004
8,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...",0.802208,0.775785,0.094867
19,LinearDiscriminantAnalysis,"{'n_components': None, 'priors': None, 'shrink...",0.799245,0.775785,0.0996759
18,ExtraTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.71686,0.775785,0.0536001
11,BernoulliNB,"{'alpha': 1.0, 'binarize': 0.0, 'class_prior':...",0.766274,0.7713,0.055371


In [370]:
# Plotting the Confucian Matrix

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [399]:
submission.shape

(418, 384)

In [400]:
train.shape

(891, 703)

0      False
1      False
2      False
3      False
4      False
       ...  
413    False
414    False
415    False
416    False
417    False
Name: Fare, Length: 418, dtype: bool