# Import Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, probplot, skew
from scipy.special import boxcox1p
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RepeatedStratifiedKFold, RepeatedKFold, train_test_split
from sklearn.neighbors import  KNeighborsClassifier as knn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import auc, accuracy_score

from IPython.core.display import HTML

# Read Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# <h3 style=>Univariate Search</h3>

In [53]:
def multi_table(table_list):
    return HTML(
        f"<table><tr> {''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list])} </tr></table>")

In [54]:
data = pd.concat((train, test)).drop(['Survived'], axis=1).reset_index(drop=True)

In [55]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [57]:
# dependent variable: Survived
set(train.columns) - set(test.columns)

{'Survived'}

In [58]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [59]:
multi_table([pd.DataFrame(data[i].value_counts()) for i in data.columns])

Unnamed: 0_level_0,PassengerId,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0
Unnamed: 0_level_1,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Unnamed: 0_level_2,Name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Unnamed: 0_level_3,Sex,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Unnamed: 0_level_4,Age,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
Unnamed: 0_level_5,SibSp,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Unnamed: 0_level_6,Parch,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6
Unnamed: 0_level_7,Ticket,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Unnamed: 0_level_8,Fare,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8
Unnamed: 0_level_9,Cabin,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Unnamed: 0_level_10,Embarked,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10
1,1,,,,,,,,,
861,1,,,,,,,,,
879,1,,,,,,,,,
878,1,,,,,,,,,
877,1,,,,,,,,,
...,...,,,,,,,,,
436,1,,,,,,,,,
435,1,,,,,,,,,
434,1,,,,,,,,,
433,1,,,,,,,,,

Unnamed: 0,PassengerId
1,1
861,1
879,1
878,1
877,1
...,...
436,1
435,1
434,1
433,1

Unnamed: 0,Pclass
3,709
1,323
2,277

Unnamed: 0,Name
"Connolly, Miss. Kate",2
"Kelly, Mr. James",2
"Goldsmith, Mr. Frank John",1
"Carr, Miss. Jeannie",1
"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",1
...,...
"Goodwin, Miss. Jessie Allis",1
"Edvardsson, Mr. Gustaf Hjalmar",1
"Davison, Mrs. Thomas Henry (Mary E Finck)",1
"Birnbaum, Mr. Jakob",1

Unnamed: 0,Sex
male,843
female,466

Unnamed: 0,Age
24.00,47
22.00,43
21.00,41
30.00,40
18.00,39
...,...
20.50,1
11.50,1
22.50,1
0.33,1

Unnamed: 0,SibSp
0,891
1,319
2,42
4,22
3,20
8,9
5,6

Unnamed: 0,Parch
0,1002
1,170
2,113
3,8
4,6
5,6
6,2
9,2

Unnamed: 0,Ticket
CA. 2343,11
CA 2144,8
1601,8
S.O.C. 14879,7
347082,7
...,...
32302,1
3411,1
11752,1
STON/O 2. 3101291,1

Unnamed: 0,Fare
8.0500,60
13.0000,59
7.7500,55
26.0000,50
7.8958,49
...,...
4.0125,1
25.5875,1
12.2750,1
8.6833,1

Unnamed: 0,Cabin
C23 C25 C27,6
G6,5
B57 B59 B63 B66,5
B96 B98,4
F4,4
...,...
C148,1
B61,1
C49,1
C130,1

Unnamed: 0,Embarked
S,914
C,270
Q,123


In [60]:
# Check nominal variables and order variables.
# all order vairables(Pclass) are aleady numeric type.
nominal_vars = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
data[nominal_vars] = data[nominal_vars].astype('category')

In [61]:
# Have to check distribution (skew, histogram, barplot...)
# Have to check derivative possible variables

# Missing values and Create Derivative variables

In [62]:
miss_cnt = data.isnull().sum().sort_values(ascending=False)
miss_pct = miss_cnt / data.shape[0]
miss_table = pd.DataFrame([miss_cnt, miss_pct], index=['count', 'percent']).T
miss_table = miss_table[miss_table['count'] > 0]
miss_table

Unnamed: 0,count,percent
Cabin,1014.0,0.774637
Age,263.0,0.200917
Embarked,2.0,0.001528
Fare,1.0,0.000764


Cabin: delete columns

In [63]:
data.drop(['Cabin'], axis=1, inplace=True)

Embarked: fill na by mode

In [64]:
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

Fare: fill na by median / create derivative variable(FareBin)

In [65]:
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['FareBin']= pd.qcut(data['Fare'], 5)

Name: create Derivative variable

In [66]:
titles = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
value_count_title = titles.value_counts()
other_titles = value_count_title.index[value_count_title.values < 10]
data['Title'] = titles
data['Title'] = data['Title'].replace(other_titles, 'Other')

Age: fill na median by Title / create derivative variables (AgeBin)

In [67]:
data.loc[data['Age'].isnull(), 'Age'] = data.groupby('Title').Age.transform('median')
data['AgeBin'] = 0
data.loc[data['Age'] <= 10, 'AgeBin'] = 0
data.loc[(data['Age'] > 10) & (data['Age'] <= 16), 'AgeBin'] = 1
data.loc[(data['Age'] > 16) & (data['Age'] <= 20), 'AgeBin'] = 2
data.loc[(data['Age'] > 20) & (data['Age'] <= 26), 'AgeBin'] = 3
data.loc[(data['Age'] > 26) & (data['Age'] <= 30), 'AgeBin'] = 4
data.loc[(data['Age'] > 30) & (data['Age'] <= 36), 'AgeBin'] = 5
data.loc[(data['Age'] > 36) & (data['Age'] <= 40), 'AgeBin'] = 6
data.loc[(data['Age'] > 40) & (data['Age'] <= 46), 'AgeBin'] = 7
data.loc[(data['Age'] > 46) & (data['Age'] <= 50), 'AgeBin'] = 8
data.loc[(data['Age'] > 50) & (data['Age'] <= 60), 'AgeBin'] = 9
data.loc[(data['Age'] > 60), 'AgeBin'] = 10

SibSp, Parch: create derivative variable(Family)

In [68]:
data['Family'] = data['SibSp'] + data['Parch']

Family: create derivative variable(Solo)

In [69]:
data['Solo'] = data['Family'] == 1
data['Solo'] = data['Solo'].astype('int')

# Select Variables and Transform

In [70]:
# delete some variables
data.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Age', 'Fare'], axis=1, inplace=True)

In [71]:
# label encoding
data['Sex'] = data.Sex.cat.codes
data['FareBin'] = data.FareBin.cat.codes

In [72]:
# onehot encoding
data = pd.get_dummies(data)

In [73]:
X_train = data.iloc[:train.shape[0], :]
X_test = data.iloc[train.shape[0]:, :]
y_train = train.Survived

In [74]:
X_train.shape, y_train.shape, X_test.shape

((891, 14), (891,), (418, 14))

# Modeling

In [89]:
allow_tuning = True

In [75]:
x_train, x_test, y_tmp_train, y_tmp_test = train_test_split(X_train, y_train, test_size=.2, random_state=42)

XGBoost

In [76]:
def xgb_gridsearch(params_grid_xgb, features, values, last=False):
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)

    model_xgb = XGBClassifier(use_label_encoder = False, 
                              objective = 'binary:logistic')
    
    for i in range(len(features)):
        params_grid_xgb[features[i]] = values[i]
    search_xgb = GridSearchCV(model_xgb, params_grid_xgb, verbose = 0,
                              scoring = 'neg_log_loss', cv = cv).fit(x_train, y_tmp_train, early_stopping_rounds = 15, 
                                  eval_set = [[x_test, y_tmp_test]], 
                                  eval_metric = 'logloss', verbose = False)
    for i in range(len(features)):
        print(f"{features[i]}: {search_xgb.best_params_[features[i]]}")
    for k, v in search_xgb.best_params_.items():
        search_xgb.best_params_[k] = [v]
    if last:
        for k, v in search_xgb.best_params_.items():
            search_xgb.best_params_[k] = v[0]
    return search_xgb, search_xgb.best_params_

In [77]:
if allow_tuning:
    params_knn = {
        'n_neighbors' : range(1, 10),
        'weights' : ['uniform', 'distance'],
        'algorithm' : ['auto', 'ball_tree','kd_tree'],
        'p' : [1,2]
    }
    model_knn = knn()
    search_knn = GridSearchCV(model_knn, params_knn, cv=5, scoring='accuracy', n_jobs=-1, verbose=0).fit(X_train, y_train)
    search_knn.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}

In [78]:
if allow_tuning:
    params_logistic = {
        'max_iter': [2000],
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['liblinear']
    }
    model_logistic = LogisticRegression()
    search_logistic = GridSearchCV(model_logistic, params_logistic, cv=5, scoring='accuracy', n_jobs=-1, verbose=0).fit(X_train, y_train)
    search_logistic.best_params_

{'C': 4.281332398719396,
 'max_iter': 2000,
 'penalty': 'l1',
 'solver': 'liblinear'}

In [79]:
if allow_tuning:
    params_svc = [{'kernel': ['rbf'], 'gamma': [.01, .1, .5, 1, 2, 5, 10], 'C': [.1, 1, 10, 100, 1000], 'probability': [True]},
                  {'kernel': ['linear'], 'C': [.01, .1, 1, 10, 100, 1000], 'probability': [True]},
                  {'kernel': ['poly'], 'degree' : [2, 3, 4, 5], 'C': [.01, .1, 1, 10, 100, 1000], 'probability': [True]}]
    model_svc = SVC()
    search_svc = GridSearchCV(model_svc, params_svc, cv=5, scoring='accuracy', n_jobs=-1, verbose=0).fit(X_train, y_train)
    search_svc.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True}

In [80]:
if allow_tuning:
    params_rf = {
        'n_estimators': range(100, 500, 1000),
        'criterion':['gini','entropy'],
        'bootstrap': [True],
        'max_depth': [5, 10, 15, 20, 25],
        'max_features': ['auto','sqrt', 10],
        'min_samples_leaf': [2, 3],
        'min_samples_split': [2, 3]}
    model_rf = RandomForestClassifier()
    search_rf = GridSearchCV(model_rf, params_rf, cv=5, scoring='accuracy', n_jobs=-1, verbose=0).fit(X_train, y_train)
    search_rf.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

In [81]:
if allow_tuning:
    params_xgb = {'n_estimators': [1000],
                  'learning_rate': [0.1],
                  'max_depth': [5],
                  'min_child_weight': [1],
                  'gamma': [0],
                  'subsample': [0.8],
                  'colsample_bytree': [0.8],
                  'n_jobs': [-1],
                  'objective': ['binary:logistic'],
                  'use_label_encoder': [False],
                  'eval_metric': ['logloss'],
                  'scale_pos_weight': [1]}

    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['learning_rate'], [[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.15, 0.2]])
    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['max_depth', 'min_child_weight'], [range(3, 10), range(1, 6)])
    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['gamma'], [[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2]])
    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['subsample', 'colsample_bytree'], [[i/100.0 for i in range(75,90,5)], [i/100.0 for i in range(75,90,5)]])
    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['reg_alpha'], [[1e-5, 1e-2, 0.1, 1, 100]])
    params_xgb['n_estimators'] = [5000]
    search_xgb, params_xgb = xgb_gridsearch(params_xgb, ['learning_rate'], [[0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.15, 0.2]], last=True)

    model_xgb = XGBClassifier(**params_xgb)
    model_xgb = model_xgb.fit(x_train, y_tmp_train, eval_set=[(x_test, y_tmp_test)], eval_metric=['logloss'], early_stopping_rounds=15, verbose=0)
    params_xgb['n_estimators'] = model_xgb.best_iteration

learning_rate's best value: 0.01
max_depth's best value: 3
min_child_weight's best value: 1
gamma's best value: 0.7
subsample's best value: 0.8
colsample_bytree's best value: 0.8
reg_alpha's best value: 1e-05
learning_rate's best value: 0.01


each models

In [211]:
if allow_tuning:
    model_knn = search_knn.best_estimator_
    model_logistic = search_logistic.best_estimator_
    model_svc = search_svc.best_estimator_
    model_rf = search_rf.best_estimator_
    model_xgb = XGBClassifier(**params_xgb)
else:
    model_knn = knn(algorithm='ball_tree', 
                    n_neighbors=7,
                    p=2, 
                    weights='uniform')
    model_logistic = LogisticRegression(C=4.281332398719396,
                                        max_iter=2000, 
                                        penalty='l1', 
                                        solver='liblinear')
    model_svc = SVC(C=100,
                    gamma=0.01,
                    kernel='rbf',
                    probability=True)
    model_rf = RandomForestClassifier(bootstrap=True,
                                      criterion='entropy',
                                      max_depth=10,
                                      max_features=10,
                                      min_samples_leaf=3,
                                      min_samples_split=3,
                                      n_estimators=100)
    model_xgb = XGBClassifier(learning_rate=0.01,
                              max_depth=3,
                              min_child_weight=1,
                              gamma=0.7,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              reg_alpha=1e-05)
models = {
    'knn': model_knn,
    'logistic': model_logistic,
    'svc': model_svc,
    'rf': model_rf,
    'xgb': model_xgb
}

voting models

In [212]:
import copy

def select_models(start, cnt, goal, estimators, voting):
    if cnt == goal:
        estimators_copy = copy.deepcopy(estimators)
        voting_name = f'{voting}_' + '_'.join([i[0] for i in list(estimators_copy)])
        models[voting_name] = VotingClassifier(estimators=estimators_copy, voting=voting)
        return
    for i in range(start, 5):
        estimators.append(list(models.items())[i])
        select_models(i + 1, cnt + 1, goal, estimators, voting)
        estimators.pop()
        

In [213]:
select_models(0, 0, 3, [], 'hard')
select_models(0, 0, 4, [], 'hard')
select_models(0, 0, 5, [], 'hard')

select_models(0, 0, 3, [], 'soft')
select_models(0, 0, 4, [], 'soft')
select_models(0, 0, 5, [], 'soft')

cross validation scores

In [221]:
result_by_model = pd.DataFrame({'model name': models.keys(), 'model': models.values(), 'score': 0})

In [222]:
for name, model in models.items():
    result_by_model.loc[result_by_model['model name'] == name, 'score'] = cross_val_score(model, X_train,y_train,cv=5).mean()

In [232]:
result_by_model.sort_values('score', ascending=False)

Unnamed: 0,model name,models,score
18,hard_knn_svc_rf_xgb,"VotingClassifier(estimators=[('knn',\n ...",0.84399
26,soft_knn_rf_xgb,"VotingClassifier(estimators=[('knn',\n ...",0.840625
15,hard_knn_logistic_svc_rf,"VotingClassifier(estimators=[('knn',\n ...",0.840619
8,hard_knn_svc_rf,"VotingClassifier(estimators=[('knn',\n ...",0.840606
17,hard_knn_logistic_rf_xgb,"VotingClassifier(estimators=[('knn',\n ...",0.839508
6,hard_knn_logistic_rf,"VotingClassifier(estimators=[('knn',\n ...",0.839502
35,soft_logistic_svc_rf_xgb,"VotingClassifier(estimators=[('logistic',\n ...",0.839489
22,soft_knn_logistic_rf,"VotingClassifier(estimators=[('knn',\n ...",0.838378
27,soft_logistic_svc_rf,"VotingClassifier(estimators=[('logistic',\n ...",0.838365
24,soft_knn_svc_rf,"VotingClassifier(estimators=[('knn',\n ...",0.838365


In [246]:
models['rf'].fit(X_train, y_train)
y_pred = models['rf'].predict(X_test)

In [247]:
my_submission = pd.DataFrame({'PassengerId': test.PassengerId, 
                              'Survived': y_pred})

my_submission.to_csv('submission.csv', index = False)