In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split

import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, classification_report, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve, precision_recall_curve, average_precision_score, plot_precision_recall_curve

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r'C:\Users\dheof\Desktop\titanic.csv')
df

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,,Southampton,yes,True
3,female,35.0,0,53.1000,First,C,Southampton,yes,False
4,male,35.0,0,8.0500,Third,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,,Southampton,no,True
887,female,19.0,0,30.0000,First,B,Southampton,yes,True
888,female,,2,23.4500,Third,,Southampton,no,False
889,male,26.0,0,30.0000,First,C,Cherbourg,yes,True


In [4]:
df = df.drop(columns=['deck'])
df

Unnamed: 0,sex,age,parch,fare,class,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,Southampton,no,False
1,female,38.0,0,71.2833,First,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,Southampton,yes,True
3,female,35.0,0,53.1000,First,Southampton,yes,False
4,male,35.0,0,8.0500,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,Southampton,no,True
887,female,19.0,0,30.0000,First,Southampton,yes,True
888,female,,2,23.4500,Third,Southampton,no,False
889,male,26.0,0,30.0000,First,Cherbourg,yes,True


In [5]:
df.isnull().sum()

sex              0
age            177
parch            0
fare             0
class            0
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
simple_impute = SimpleImputer(strategy='median')
df[['age']] = simple_impute.fit_transform(df[['age']])
df

Unnamed: 0,sex,age,parch,fare,class,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,Southampton,no,False
1,female,38.0,0,71.2833,First,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,Southampton,yes,True
3,female,35.0,0,53.1000,First,Southampton,yes,False
4,male,35.0,0,8.0500,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,Southampton,no,True
887,female,19.0,0,30.0000,First,Southampton,yes,True
888,female,28.0,2,23.4500,Third,Southampton,no,False
889,male,26.0,0,30.0000,First,Cherbourg,yes,True


In [7]:
simple_impute = SimpleImputer(strategy='most_frequent')
df[['embark_town']] = simple_impute.fit_transform(df[['embark_town']])
df

Unnamed: 0,sex,age,parch,fare,class,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,Southampton,no,False
1,female,38.0,0,71.2833,First,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,Southampton,yes,True
3,female,35.0,0,53.1000,First,Southampton,yes,False
4,male,35.0,0,8.0500,Third,Southampton,no,True
...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,Southampton,no,True
887,female,19.0,0,30.0000,First,Southampton,yes,True
888,female,28.0,2,23.4500,Third,Southampton,no,False
889,male,26.0,0,30.0000,First,Cherbourg,yes,True


In [8]:
df.isnull().sum()

sex            0
age            0
parch          0
fare           0
class          0
embark_town    0
alive          0
alone          0
dtype: int64

In [65]:
df['class'].unique()

array(['Third', 'First', 'Second'], dtype=object)

In [122]:
x = df.drop(columns=['alive'])
y = np.where(df['alive'] == 'yes', 1, 0)

In [123]:
x_trainval, x_test, y_trainval, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0
)

In [124]:
ordinal_mapping = [
    {'col':'class',
    'mapping':{None:0,'First':1,'Second':2,'Third':3}}
]

ordinal_encoder = ce.OrdinalEncoder(cols = 'class',mapping = ordinal_mapping)
dford = ordinal_encoder.fit_transform(df['class'])

In [125]:
onehot = OneHotEncoder()
one_hot_var = ['sex', 'embark_town', 'alone']
ordinal_var = ['class']

transformer = ColumnTransformer([
    ('one hot', onehot, one_hot_var),
    ('ordinal', ordinal_encoder, ordinal_var)
    ])

In [126]:
x_train_prep = transformer.fit_transform(x_trainval)
x_test_prep = transformer.transform(x_test)

# Model: Benchmark

In [127]:
x_train_prep

array([[1., 0., 1., ..., 1., 0., 3.],
       [0., 1., 0., ..., 0., 1., 2.],
       [0., 1., 1., ..., 1., 0., 2.],
       ...,
       [0., 1., 0., ..., 0., 1., 3.],
       [1., 0., 0., ..., 1., 0., 3.],
       [0., 1., 0., ..., 1., 0., 2.]])

In [128]:
logreg = LogisticRegression(solver = 'liblinear')
knn = KNeighborsClassifier(n_neighbors= 3)
tree = DecisionTreeClassifier(max_depth = 3)

In [129]:
skfold = StratifiedKFold(n_splits = 5)

logreg_cv = cross_val_score(logreg, x_train_prep, y_trainval, cv = skfold, scoring = 'f1', error_score= 'raise')
knn_cv = cross_val_score(knn, x_train_prep, y_trainval, cv = skfold, scoring = 'f1', error_score= 'raise')
tree_cv = cross_val_score(tree, x_train_prep, y_trainval, cv = skfold, scoring = 'f1', error_score= 'raise')

In [130]:
print('Hasil Cross Validasi', logreg_cv)
print('Hasil Mean Cross Validasi', logreg_cv.mean())
print('Hasil STD Cross Validasi', logreg_cv.std())

Hasil Cross Validasi [0.64220183 0.75229358 0.67326733 0.69306931 0.75      ]
Hasil Mean Cross Validasi 0.7021664093014807
Hasil STD Cross Validasi 0.043161209138597854


In [131]:
print('Hasil Cross Validasi', knn_cv)
print('Hasil Mean Cross Validasi', knn_cv.mean())
print('Hasil STD Cross Validasi', knn_cv.std())

Hasil Cross Validasi [0.69565217 0.70491803 0.60431655 0.68       0.65263158]
Hasil Mean Cross Validasi 0.6675036664819775
Hasil STD Cross Validasi 0.03622740044236554


In [132]:
print('Hasil Cross Validasi', tree_cv)
print('Hasil Mean Cross Validasi', tree_cv.mean())
print('Hasil STD Cross Validasi', tree_cv.std())

Hasil Cross Validasi [0.64444444 0.79569892 0.65909091 0.66666667 0.65116279]
Hasil Mean Cross Validasi 0.6834127471261755
Hasil STD Cross Validasi 0.05663701440021099
