In [1]:
# Refactoring: make the notebook linear by ruthlessly removing comments and dead code from the previous notebook

In [2]:
cd ..

/code


In [3]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

from src.preprocessing import train_model, add_derived_title, categorize_column, add_is_alone_column, impute_nans



In [4]:
# Load both CSV files and stack them into one frame so the preprocessing
# cells below are applied to all rows in a single pass.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two files collide (each file starts again at 0), which
# makes label-based selection and index-aligned assignment ambiguous.
# sort=True sorts the columns (the non-concatenation axis).
df = pd.concat([train_df, test_df], sort=True, ignore_index=True)

df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [5]:
# Run the preprocessing helpers from src.preprocessing as one pipeline over the
# combined frame. Each helper receives the frame produced by the previous step:
#   1. impute_nans          - called with Embarked as a categorical column and
#                             Fare / Age as continuous columns
#   2. add_derived_title    - presumably adds the `Title` column encoded in the
#                             next cell (confirm in src.preprocessing)
#   3. add_is_alone_column
df = (
    df
    .pipe(impute_nans, categorical_columns=['Embarked'], continuous_columns=['Fare', 'Age'])
    .pipe(add_derived_title)
    .pipe(add_is_alone_column)
)

In [6]:
# encode columns

# Lookup tables that turn the string categories into small integer codes.
TITLE_CODES = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
SEX_CODES = {'female': 1, 'male': 0}

# .map() produces NaN for any title missing from the table, which forces a
# float column; those rows get code 0, and the column is then cast to int so
# Title has the same integer dtype as the other mapped columns.
df['Title'] = df['Title'].map(TITLE_CODES).fillna(0).astype(int)

# Bin the continuous columns with the project helper (5 age bins, 4 fare bins).
df['AgeGroup'] = categorize_column(df['Age'], num_bins=5)
df['FareBand'] = categorize_column(df['Fare'], num_bins=4)

# astype(int) raises if a value is missing or not in the table, so these two
# lines rely on the columns being complete at this point (Embarked is passed
# to impute_nans in the previous cell).
df['Embarked'] = df['Embarked'].map(EMBARKED_CODES).astype(int)
df['Sex'] = df['Sex'].map(SEX_CODES).astype(int)

In [7]:
# Discard the columns that are not needed for training the model.
unused_columns = ['Parch', 'SibSp', 'Name', 'PassengerId', 'Ticket', 'Cabin']
df = df.drop(columns=unused_columns)

In [8]:
# Split the combined frame back into its two parts: rows that have a
# `Survived` value are used for training, rows where it is NaN form the test
# set.
# The mask is computed once with .notna() and inverted with `~`; unary minus on
# a boolean mask is not the supported way to negate it (NumPy rejects it for
# boolean arrays).
has_label = df['Survived'].notna()

train_df = df[has_label]
test_df = df[~has_label].drop('Survived', axis=1)

# Features / target for fitting, and the unlabelled features for prediction.
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()

In [9]:
# Train every candidate classifier on the same training data and keep the
# accuracy returned by train_model (its first return value is discarded).
#
# Several of these estimators are stochastic (Perceptron, SGDClassifier,
# DecisionTreeClassifier, RandomForestClassifier). scikit-learn estimators
# created without an explicit random_state draw from NumPy's global RNG, so it
# is seeded here to make the scores reproducible on every run of this cell.
#
# NOTE(review): the near-perfect tree scores in the cell output suggest the
# accuracy is measured on the training data itself - confirm in
# src.preprocessing.train_model before reading these as generalisation scores.
SEED = 42
np.random.seed(SEED)

_, acc_svc           = train_model(SVC, X_train, Y_train, gamma='auto')
_, acc_knn           = train_model(KNeighborsClassifier, X_train, Y_train, n_neighbors=3)
_, acc_gaussian      = train_model(GaussianNB, X_train, Y_train)
_, acc_perceptron    = train_model(Perceptron, X_train, Y_train)
_, acc_sgd           = train_model(SGDClassifier, X_train, Y_train)
_, acc_decision_tree = train_model(DecisionTreeClassifier, X_train, Y_train)
_, acc_random_forest = train_model(RandomForestClassifier, X_train, Y_train, n_estimators=100)

accuracy (SVC): 89.79
accuracy (KNeighborsClassifier): 84.96
accuracy (GaussianNB): 76.54
accuracy (Perceptron): 72.95
accuracy (SGDClassifier): 70.15
accuracy (DecisionTreeClassifier): 98.2
accuracy (RandomForestClassifier): 98.2


In [10]:
# Leaderboard of the trained models. Each label sits next to its score in one
# (label, score) pair, so the two cannot drift out of order the way two
# parallel lists can. Row order (and therefore the index) is the same as before.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Descent', acc_sgd),  # label typo fixed ("Decent")
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])

# Best score first; the bare expression is the cell's displayed output.
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Random Forest,98.2
6,Decision Tree,98.2
0,Support Vector Machines,89.79
1,KNN,84.96
3,Naive Bayes,76.54
4,Perceptron,72.95
5,Stochastic Gradient Decent,70.15


### Conclusion

This is the end of our refactoring exercise. For a real project, I would move this code into a plain Python file (see, e.g., `src/train.py`) and remove this notebook entirely, so that we eliminate the flat surface a notebook provides and prevent mess and complexity from accumulating.