In [31]:
#import library
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
#read train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## **Feature Engineering**

**Family Survival**

In [None]:
# this feature was inspired by S.Xu's (https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever)
# and I take this code from konstantinmasich's (https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83) beacause using simple code
#this feature is about family survival chance.

data = pd.concat((train, test))

# Extract last name
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])

# Fill in missing Fare value by overall Fare mean
data['Fare'].fillna(data['Fare'].mean(), inplace=True)

# Setting coin flip (e.g. random chance of surviving)
default_survival_chance = 0.5
data['Family_Survival'] = default_survival_chance

# Grouping data by last name and fare - looking for families
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    # If not equal to 1, a family is found 
    # Then work out survival chance depending on whether or not that family member survived
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin == 0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0

# Print the headline
print("Number of passengers with family survival information:", 
      data.loc[data['Family_Survival']!=0.5].shape[0])

**Title**

In [37]:
#extract Title every Passenger
data['Title']=data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

#grouping Title Passenger by 'Rare','Miss','Mr','Mrs'
data['Title'] = data['Title'].replace(['Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir',
                                                'Col', 'Capt', 'Countess', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace(['Ms',  'Mlle'], 'Miss')
data['Title'] = data['Title'].replace(['Mme'], 'Mrs')

#change this categorical feature to numeric
c={'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}
data['Title'] = data.Title.map(c)

**Sex**

In [38]:
Sex_conv = {'male': 1, 'female' : 0}
data['Sex_cat'] = data['Sex'].map(Sex_conv).astype(int)

**Deck/Cabin**

In [39]:
#Create deck feature by first letter on Cabin and fill missing cabin by 'M'
data['Deck'] = data.Cabin.apply(lambda x:x[0] if pd.notnull(x) else 'M')

#change deck feature to numeric value
d={'M': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}
data['Deck'] = data.Deck.map(d)

**Age**

In [40]:
#fill missing value in age by median of age
data.Age.fillna(data.Age.median(), inplace=True)

#binning age feature
data.loc[data['Age']<=22, 'Age'] = 0
data.loc[(data['Age']>22) & (data['Age']<=28), 'Age'] = 1
data.loc[(data['Age']>28) & (data['Age']<=35), 'Age'] = 2
data.loc[(data['Age']>35) & (data['Age']<=80), 'Age'] = 3

**Embarked**

In [41]:
data.Embarked.value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [42]:
#fill missing value in Embarked by most frequent value ('S')
data.Embarked.fillna('S', inplace=True)

#change feature to numerical feature
e={'C':0, 'S':1, 'Q':2}
data['Embarked'] = data.Embarked.map(e)

**Fare**

In [43]:
#binning Fare feature by 5 bins
data.loc[data['Fare'] <= 7.854, 'Fare'] = 0
data.loc[(data['Fare'] > 7.854) & (data['Fare'] <= 10.5), 'Fare'] = 1
data.loc[(data['Fare'] > 10.5) & (data['Fare'] <= 21.679), 'Fare'] = 2
data.loc[(data['Fare'] > 21.679) & (data['Fare'] <= 39.688), 'Fare'] = 3
data.loc[(data['Fare'] > 39.688) & (data['Fare'] <= 512.329), 'Fare'] = 4

**is_alone from SibSp and Parch**

In [44]:
#create feature that passanger is alone or not
data['is_alone'] = data['SibSp'] + data['Parch']
data['is_alone'] = data.is_alone.apply(lambda x: x/x if x!=0 else 0)

**Create train and test set**

In [45]:
#get train and test set
n_train = train.shape[0]
X_train = data[:n_train]
X_test = data[n_train:]

In [46]:
#choosing feature that apply to the model and slice target ('Survived')
y_train = X_train['Survived']
X_train = X_train[['Age','Embarked','Fare','Parch','Pclass','SibSp','Family_Survival','Sex_cat','Title','Deck']]
X_test = X_test[['Age','Embarked','Fare','Parch','Pclass','SibSp','Family_Survival','Sex_cat','Title','Deck']]

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train[:5]

## **Develop Model**

In [47]:
#import model
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

In [48]:
#define model
log_reg = LogisticRegression()
SGD = SGDClassifier()
percep = Perceptron()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
SVC = SVC()
lin_SVC = LinearSVC()
gauss = GaussianNB()

In [49]:
#run the model
classifiers = [('Decision Tree Classifier', tree),
               ('Random Forest Classifier', rf),
              ('KNN', knn),
              ('Logistic Regression', log_reg),
              ('Perceptron', percep),
              ('Stochastic Gradient', SGD),
              ('SVC', SVC),
              ('Linear SVC', lin_SVC),
              ('GaussianNB', gauss)]
# iterate over the defined list of tuples containing the classifiers
for clf_name, clf in classifiers:
    # fit clf to the data
    clf.fit(X_train, y_train)
    
    # predict the labels of the test
    y_pred = clf.predict(X_train)
    
    # evaluate the accuracy of clf on the train set
    print('{:s} : {:.9f}'.format(clf_name, accuracy_score(y_train, y_pred)))

Decision Tree Classifier : 0.929292929
Random Forest Classifier : 0.929292929
KNN : 0.854096521
Logistic Regression : 0.841750842
Perceptron : 0.659932660
Stochastic Gradient : 0.778900112
SVC : 0.776655443
Linear SVC : 0.841750842
GaussianNB : 0.744107744




<p>as you can see, top 3 model are Decision Tree, Random Forest and KNN.<p>

## **Hyperparameter Tuning**

**Random Forest**

In [50]:
#import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [51]:
#Random Forest Hyperparameter Tuning
rf=RandomForestClassifier(oob_score=True, random_state=1)

params_rf = {'criterion': ["gini","entropy"], 
             'min_samples_leaf': [3,4,5,6,7], 
             'n_estimators': [50,100,400,700,1000], 
             'min_samples_leaf':[2,4,10,12,16],
             'max_depth':[3,4,5,6,7],
             'max_features':['auto','sqrt','log2']}

grid_rf = GridSearchCV(estimator = rf,
                        param_grid = params_rf,
                        scoring= 'accuracy',
                        n_jobs=-1
                      )
grid_rf.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(oob_score=True, random_state=1),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [2, 4, 10, 12, 16],
                         'n_estimators': [50, 100, 400, 700, 1000]},
             scoring='accuracy')

In [53]:
#get best parameter of Random Forest
grid_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_leaf': 10,
 'n_estimators': 400}

In [54]:
#get best score
grid_rf.best_score_

0.8529533613709119

In [58]:
#make submission file of Random Forest with hyperparameter tuning
submission = pd.DataFrame({'PassengerId':test['PassengerId'], 
                           'Survived': grid_rf.best_estimator_.predict(X_test).astype(int)})
submission.to_csv('grid_rf.csv', index=False)

<p> We got 0.81339 of submission score on Kaggle for Random Forest using Hyperparameter tuning <p>

In [55]:
#Cross Validation
#print(cross_val_score(grid_rf.best_estimator_, X_train, y_train, cv=10, scoring='accuracy'))
#print(cross_val_score(grid_rf.best_estimator_, X_train, y_train, cv=10, scoring='roc_auc'))

**Decision Tree**

In [57]:
#Decision Tree hyperparameter tuning
params_tree = {'max_depth': range(1,30),
               'min_samples_leaf':range(1,11),
               'min_samples_split': range(1,7),
              'criterion': ['entropy', 'gini']
              }
grid_tree = GridSearchCV(estimator = tree,
                        param_grid = params_tree,
                        scoring= 'accuracy',
                        n_jobs=-1,
                        )
grid_tree.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 30),
                         'min_samples_leaf': range(1, 11),
                         'min_samples_split': range(1, 7)},
             scoring='accuracy')

In [59]:
# get best parameter and score
print(grid_tree.best_params_)
print(grid_tree.best_score_)
print(grid_tree.best_estimator_)

{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.8506998932898122
DecisionTreeClassifier(criterion='entropy', max_depth=3)


In [60]:
#make submission file
submission = pd.DataFrame({'PassengerId':test['PassengerId'], 
                           'Survived': grid_tree.best_estimator_.predict(X_test).astype(int)})
submission.to_csv('grid_tree.csv', index=False)

I've got 0.79665 score on Kaggle for Decision Tree using Hyperparameter tuning

In [56]:
#Cross Validation
#print(cross_val_score(grid_tree.best_estimator_, X_train, y_train, cv=10, scoring='accuracy'))
#print(cross_val_score(grid_tree.best_estimator_, X_train, y_train, cv=10, scoring='roc_auc'))

**KNneighbors**

In [61]:
#KNN hyperparameter tuning
param_grid_knn = {'leaf_size':range(1,31), 'n_neighbors': range(1,31),
                  'p':[1,2,5], 'weights':['uniform', 'distance'],
                 'algorithm': ['auto','ball_tree','kd_tree', 'brute']}
grid_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, scoring='accuracy',
                       n_jobs=-1)
grid_knn.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': range(1, 31), 'n_neighbors': range(1, 31),
                         'p': [1, 2, 5], 'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [62]:
#get best parameter and score
print(grid_knn.best_params_)
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

{'algorithm': 'ball_tree', 'leaf_size': 6, 'n_neighbors': 19, 'p': 1, 'weights': 'uniform'}
0.8372167472223966
KNeighborsClassifier(algorithm='ball_tree', leaf_size=6, n_neighbors=19, p=1)


In [65]:
#make submission file
submission = pd.DataFrame({'PassengerId':test['PassengerId'], 
                           'Survived': grid_knn.best_estimator_.predict(X_test).astype(int)})
submission.to_csv('grid_knn.csv', index=False)

I've got 0.76076 on Kaggle for KNN using hyperparameter tuning

In [None]:
#Cross Validation
#print(cross_val_score(grid_knn.best_estimator_, X_train, y_train, cv=10, scoring='accuracy'))
#print(cross_val_score(grid_knn.best_estimator_, X_train, y_train, cv=10, scoring='roc_auc'))

**Voting Classifier**

In [63]:
classifiers = [('random forest', grid_rf.best_estimator_),
              ('Decision Tree', grid_tree.best_estimator_),
              ('KNN', grid_knn.best_estimator_)]
vc=VotingClassifier(estimators=classifiers, voting='hard')
vc.fit(X_train, y_train)
y_pred_vc = vc.predict(X_train)
print('accuracy score:{}'.format(accuracy_score(y_train, y_pred_vc)))

accuracy score:0.8552188552188552


In [64]:
submission = pd.DataFrame({'PassengerId':test['PassengerId'], 
                           'Survived': vc.predict(X_test).astype(int)})
submission.to_csv('voting.csv', index=False)
#submission.head()

I've got 0.81100 on Kaggle for Voting Classifier