### Import Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Load the Titanic passenger table from the working directory and preview
# its first five rows.
df = pd.read_csv('titanic.csv')
df.head(5)

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.25,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.925,Third,,Southampton,yes,True
3,female,35.0,0,53.1,First,C,Southampton,yes,False
4,male,35.0,0,8.05,Third,,Southampton,no,True


In [3]:
# Number of missing values in each column.
df.isna().sum()

sex              0
age            177
parch            0
fare             0
class            0
embark_town      2
alive            0
alone            0
dtype: int64

### Preprocessing

Preprocessing Scheme:
* target (y) = alive
* SimpleImputer(most_frequent) + onehot: sex, embark_town, alone
* ordinal: class
* KNNImputer: age
* RobustScaler(KNN only): age, fare

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler

One hot encoder pipeline and ordinal map:

In [5]:
# Categorical branch: fill missing entries with the most frequent category,
# then one-hot encode, dropping the first level of every feature.
onehot_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Explicit integer codes for passenger class, in the list-of-dicts form that
# is handed to ce.OrdinalEncoder(mapping=...).
ord_map = [
    {
        'col': 'class',
        'mapping': {'First': 1, 'Second': 2, 'Third': 3},
    },
]

Transformer for logistic regression and decision tree:

In [6]:
# Preprocessing used with logistic regression and the decision tree (no scaling
# step). Columns that are not listed are forwarded unchanged.
# NOTE(review): KNNImputer only sees 'age' here, so it has no other feature to
# measure neighbour distance on and likely falls back to the column mean - confirm.
transformer = ColumnTransformer(
    transformers=[
        ('onehot', onehot_pipe, ['sex', 'embark_town', 'alone']),
        ('ordinal', ce.OrdinalEncoder(mapping=ord_map), ['class']),
        ('imputer', KNNImputer(), ['age']),
    ],
    remainder='passthrough',
)

Pipeline and transformer for KNN:

In [7]:
# Numeric branch for the KNN model: impute first, then apply RobustScaler.
imp_scale = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', RobustScaler()),
])

# Same categorical handling as `transformer`, but 'age' and 'fare' go through
# the impute-and-scale branch. Columns that are not listed are forwarded unchanged.
transformer2 = ColumnTransformer(
    transformers=[
        ('onehot', onehot_pipe, ['sex', 'embark_town', 'alone']),
        ('ordinal', ce.OrdinalEncoder(mapping=ord_map), ['class']),
        ('imp_scale', imp_scale, ['age', 'fare']),
    ],
    remainder='passthrough',
)

### Split Data

In [8]:
# Features: every column except the target. 'deck' is also removed when the
# loaded file contains it: it is not part of the preprocessing scheme, and with
# remainder='passthrough' its string/NaN values would reach the models
# untouched and break fitting on a fresh Restart & Run All.
x = df.drop(columns='alive').drop(columns='deck', errors='ignore')

# Binary target: 1 where alive == 'yes', 0 otherwise.
y = np.where(df['alive'] == 'yes', 1, 0)

# Stratified 80/20 hold-out split; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=2020
)

### Algorithm Chain

- TP : Predicted alive and actually alive
- TN : Predicted not alive and actually not alive
- FP : Predicted alive but actually not alive
- FN : Predicted not alive but actually alive

We want to minimize false positives (predicting that a passenger survived when they did not), so we evaluate the models with precision, TP / (TP + FP).

In [9]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Candidate models. The tree is seeded so that its cross-validation scores are
# reproducible from run to run.
logreg = LogisticRegression(solver='liblinear')
tree = DecisionTreeClassifier(random_state=2020)
knn = KNeighborsClassifier()

models = [logreg, tree, knn]
mean_score = []
std_score = []

# The fold definition does not depend on the model, so build it once.
skfold = StratifiedKFold(5)

for model in models:
    # KNN is distance based and gets the scaled preprocessing (transformer2).
    # Dispatch on the estimator type rather than on its repr string, which
    # changes with the scikit-learn version and display configuration.
    if isinstance(model, KNeighborsClassifier):
        preprocess = transformer2
    else:
        preprocess = transformer

    estimator = Pipeline([
        ('preprocess', preprocess),
        ('model', model)
    ])
    cv_score = cross_val_score(estimator, x_train, y_train, cv=skfold, scoring='precision')
    mean_score.append(cv_score.mean())
    std_score.append(cv_score.std())

# Mean and spread of the 5-fold precision for each model.
pd.DataFrame({
    'model': models,
    'mean': mean_score,
    'std': std_score
})

Unnamed: 0,model,mean,std
0,LogisticRegression(solver='liblinear'),0.756809,0.061015
1,DecisionTreeClassifier(),0.71364,0.049368
2,KNeighborsClassifier(),0.752057,0.060881


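From the result above, LogisticRegression gives the highest mean precision (0.757). Its standard deviation across folds (0.061) is about the same as KNN's and somewhat higher than the decision tree's (0.049), so its advantage is in the mean score rather than in stability. Thus, the LogisticRegression model is used.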

### Hyperparameter Tuning

In [10]:
# Baseline logistic-regression pipeline that the grid search below will tune.
logreg = LogisticRegression(solver='liblinear')

estimator = Pipeline(steps=[
    ('preprocess', transformer),
    ('model', logreg),
])

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# Candidate settings for the pipeline's 'model' step; the 'model__' prefix
# routes each parameter to that step.
hyperparam_space = {
    'model__C': [1000, 100, 10, 1, 0.1, 0.01, 0.001],
    'model__max_iter': [1000, 5000, 10000],
    'model__solver': ['liblinear', 'lbfgs'],
}

# Exhaustive search over the grid, scored by 5-fold stratified precision and
# run on all available cores.
skfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=hyperparam_space,
    scoring='precision',
    cv=skfold,
    n_jobs=-1,
)

In [13]:
# Run the grid search on the training split only.
grid_search.fit(X=x_train, y=y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('onehot',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(drop='first'))]),
                                                                         ['sex',
                                                                          'embark_town',
                                                                          'a

#### Before Tuning

In [14]:
from sklearn.metrics import precision_score

In [15]:
# Baseline: the untuned logistic-regression pipeline, fitted on the training
# split and scored on the held-out test split.
logreg = LogisticRegression(solver='liblinear')
estimator = Pipeline(steps=[('preprocess', transformer), ('model', logreg)])
y_pred = estimator.fit(x_train, y_train).predict(x_test)

print('Precision= ', precision_score(y_test, y_pred))

Precision=  0.7014925373134329


#### After Tuning

In [16]:
# Score the best pipeline found by the grid search on the same test split.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(x_test)

print('Precision= ', precision_score(y_test, y_pred))

Precision=  0.75


In [17]:
# Parameter combination selected by the grid search (scored on precision).
grid_search.best_params_

{'model__C': 0.1, 'model__max_iter': 1000, 'model__solver': 'lbfgs'}