# 🚢 Titanic : Feature Engineering & KNN [6%]

- Feature Engineering
- Handling Missing Values
- Scaling
- Modeling
- Hyperparameter Tuning
- Prediction

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Let pandas render up to 500 rows and 100 columns before truncating output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/scikit-learn deprecation warnings that would otherwise
# point out soon-to-break calls.
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV,RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Load the Kaggle train/test splits and stack them (train rows first) so that
# feature engineering is applied to both in one pass.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
# The original row labels are deliberately kept (no ignore_index): later cells
# assign slices of `df` back onto `train` and `test`, which aligns on labels.
df = pd.concat([train, test])

In [3]:
# Preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Report (rows, columns) for the train split, then the test split
for split in (train, test):
    print(split.shape)

(891, 12)
(418, 11)


In [5]:
# Per-column count of missing values: train first, a blank line, then test
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()
print(train_missing, test_missing, sep='\n\n')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


There are <strong>missing values</strong> in both datasets: `Age`, `Cabin` and `Embarked` in the train set, and `Age`, `Fare` and `Cabin` in the test set.

## Feature Engineering

<strong>Sex</strong>

In [6]:
# Encode sex as one binary indicator column: 0 = male, 1 = female.
# pd.get_dummies(df['Sex']) returns a frame with one column per category, and
# assigning that multi-column frame to the single 'Sex' column is rejected by
# recent pandas releases. An explicit comparison produces the intended 0/1 values.
df['Sex'] = (df['Sex'] == 'female').astype(int)

**Family Size**

In [7]:
# Relatives aboard: parents/children (Parch) plus siblings/spouses (SibSp)
df['Family_Size'] = df['Parch'].add(df['SibSp'])

**Title**

In [8]:
# Extract the passenger's title: the run of letters that follows a space and is
# terminated by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# A raw string keeps `\.` a regex escape rather than an invalid Python string
# escape, which newer Python versions warn about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [9]:
# Collapse the extracted titles into four groups: Mr, Miss, Mrs and Other
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace(['Mme', "Countess", "Lady", "Dona"], 'Mrs')
df['Title'] = df['Title'].replace(['Capt', "Col", "Don", "Jonkheer", "Major", "Rev", "Sir", "Master"], "Other")

# "Dr" is not gender-specific, so it is resolved using the encoded Sex column
# (1 = female, 0 = male).
df.loc[((df.Title == "Dr") & (df.Sex == 1)), 'Title'] = "Mrs"
df.loc[((df.Title == "Dr") & (df.Sex == 0)), 'Title'] = "Mr"

# Fill each missing age with the median age of passengers sharing the same title.
# The per-title medians are computed once rather than re-running the groupby on
# every loop iteration; filling one title's gaps never affects another title's
# median, so the imputed values are unchanged. Iterating over the groupby result
# also skips rows whose title is missing instead of failing on the lookup.
median_age_by_title = df.groupby('Title')['Age'].median()
for title, median_age in median_age_by_title.items():
    df.loc[df.Age.isnull() & (df.Title == title), 'Age'] = median_age

In [10]:
# Integer codes for the four title groups
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Other": 3}

# Substitute the codes in the Title column only; other columns are untouched
df = df.replace(to_replace={'Title': title_mapping})

**Fare & Age Binning**

In [11]:
# Fallback imputation before binning: any age still missing gets the overall
# median age; missing fares get the median fare of third-class passengers
# (presumably because the passenger with the missing fare travelled third
# class; not verified in this notebook).
# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# modify `df` when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df.loc[df['Pclass'] == 3, 'Fare'].median())

In [12]:
# Quantile-based binning: fare into 5 equal-frequency bins and age into 4.
# labels=False returns the integer bin index instead of interval labels.
df = df.assign(
    Fare_Bin=pd.qcut(df['Fare'], q=5, labels=False),
    Age_Bin=pd.qcut(df['Age'], q=4, labels=False),
)

**Family Survival**

In [13]:
# This feature is inspired by https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever
#
# Family_Survival summarises what is known about the *other* members of a
# passenger's family, where a family is a group sharing last name and fare:
#   1   -> at least one other member is recorded as having survived
#   0   -> no other member is recorded as surviving, at least one as not surviving
#   0.5 -> nothing is known (passenger is alone or the others have no label)
df['Last_Name'] = df['Name'].str.split(',').str[0]
# Assign back rather than using the chained in-place fillna, which is deprecated
# and has no effect on `df` under pandas Copy-on-Write.
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

DEFAULT_SURVIVAL_VALUE = 0.5
df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for _, grp_df in df[['Survived', 'Last_Name', 'Fare', 'PassengerId']].groupby(['Last_Name', 'Fare']):
    if len(grp_df) == 1:
        continue
    # A family group is found.
    for passID in grp_df['PassengerId']:
        # Exclude the current passenger by PassengerId rather than by row label:
        # `df` stacks train and test with their original labels, so a label can
        # occur twice and dropping by label could remove a relative as well.
        # (Assumes PassengerId is unique per passenger.)
        others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
        smax = others.max()
        smin = others.min()
        is_passenger = df['PassengerId'] == passID
        if smax == 1.0:
            df.loc[is_passenger, 'Family_Survival'] = 1
        elif smin == 0.0:
            df.loc[is_passenger, 'Family_Survival'] = 0

print("Number of passengers with family survival information:",
      df.loc[df['Family_Survival'] != DEFAULT_SURVIVAL_VALUE].shape[0])

Number of passengers with family survival information: 420


In [14]:
# Second pass: treat passengers travelling on the same ticket as a group and
# fill in Family_Survival for those still at 0 or at the 0.5 "unknown" default.
# Passengers already marked 1 are left as they are.
for _, grp_df in df.groupby('Ticket'):
    if len(grp_df) == 1:
        continue
    for passID, family_survival in zip(grp_df['PassengerId'], grp_df['Family_Survival']):
        if family_survival not in (0, DEFAULT_SURVIVAL_VALUE):
            continue
        # Exclude the current passenger by PassengerId rather than by row label:
        # `df` stacks train and test with their original labels, so a label can
        # occur twice and dropping by label could remove a companion as well.
        # (Assumes PassengerId is unique per passenger.)
        others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
        smax = others.max()
        smin = others.min()
        is_passenger = df['PassengerId'] == passID
        if smax == 1.0:
            df.loc[is_passenger, 'Family_Survival'] = 1
        elif smin == 0.0:
            df.loc[is_passenger, 'Family_Survival'] = 0

print("Number of passenger with family/group survival information: "
      + str(df[df['Family_Survival'] != DEFAULT_SURVIVAL_VALUE].shape[0]))

# Copy the feature back onto the original train/test frames. The split point is
# derived from len(train) instead of a hard-coded row count, and the values are
# assigned positionally so the result does not depend on how `df` is labelled.
n_train = len(train)
train['Family_Survival'] = df['Family_Survival'].iloc[:n_train].values
test['Family_Survival'] = df['Family_Survival'].iloc[n_train:].values

Number of passenger with family/group survival information: 546


## Feature Selection

In [15]:
# Columns kept for modelling: the target ('Survived') plus the engineered inputs
features = [
    'Survived',
    'Title',
    'Pclass',
    'Sex',
    'Family_Size',
    'Family_Survival',
    'Fare_Bin',
    'Age_Bin',
]
df = df.loc[:, features]

**Splitting Data Back into Train and Test**

In [16]:
# The combined frame holds the train rows first, followed by the test rows
n_train = len(train)
train = df.iloc[:n_train]

# Inputs and integer target for fitting
x_train = train.drop(columns='Survived')
y_train = train['Survived'].astype(int)

# Remaining rows are the test inputs
x_test = df.iloc[n_train:].drop(columns='Survived')

## Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise the features for the distance-based KNN model.
# The scaler's mean and variance are learned from the training data only and
# then reused for the test data. Re-fitting on the test set would scale the two
# sets with different statistics, so the same raw value would map to different
# scaled values in train and test.
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

## Modeling

In [18]:
# Cross-validation scheme: 10 stratified folds, repeated 3 times, fixed seed
kfold = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=420)

# Base estimator with default settings; hyperparameters are tuned afterwards
model = KNeighborsClassifier()

## Hyperparameter Tuning

In [19]:
# Exhaustive grid search over the KNN hyperparameters, scored by accuracy with
# the repeated stratified folds. GridSearchCV comes from the notebook's import cell.
param_grid = dict(
    n_neighbors=np.arange(2, 20, 2),
    weights=["uniform", "distance"],
    leaf_size=np.arange(1, 50, 5),
)

gscv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfold,
)
gscv_result = gscv.fit(x_train, y_train)

In [20]:
# Best mean cross-validated accuracy and the parameter combination that achieved it
best_acc_score = gscv_result.best_score_
best_params = gscv_result.best_params_

for template, value in (("Accuracy: {:6f}", best_acc_score), ("Params: {}", best_params)):
    print(template.format(value))

Accuracy: 0.840995
Params: {'leaf_size': 16, 'n_neighbors': 8, 'weights': 'uniform'}


## Making Prediction

In [21]:
# Predict on the test inputs with the best estimator found by the grid search
# (it was refitted on the full training data when the search was fitted).
result = gscv.best_estimator_.predict(x_test)

In [22]:
# Build the submission: passenger ids from a fresh read of the test file, paired
# with the predictions in the same row order.
test_new = pd.read_csv('../input/titanic/test.csv')

submission = test_new[['PassengerId']].assign(Survived=result)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
