In [14]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

from sklearn.ensemble import VotingClassifier
import collections
from collections import Counter

from sklearn.model_selection import KFold
from joblib import dump, load

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

In [3]:
# preprocessing
# 여러개의 이상치를 가진애들 제거
def detect_outliers(df, n, features) :
    outlier_indices = []
    for col in features :
        Q1 = np.percentile(df[col],25)
        Q3 = np.percentile(df[col],75)
        # inter quartile range
        IQR = Q3 - Q1
        
        outlier_step = 1.5* IQR
        
        # 아래나 위로 튀는 이상치에 대한 index 값을 가져옴
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        
        outlier_indices.extend(outlier_list_col)
    # outlier couting
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    
    return multiple_outliers


In [4]:
Outliers_to_drop = detect_outliers(data, 2, ['Age','SibSp','Parch','Fare'])

data = data.drop(Outliers_to_drop, axis=0).reset_index(drop=True)

  interpolation=interpolation)


In [5]:
data['Fare'] = data['Fare'].apply(lambda x : np.log(x) if x>0 else 0)

data['Embarked'] = data['Embarked'].fillna('S')

In [7]:
index_NaN_age = list(data[data['Age'].isnull()].index)

Age_idx = data.columns.get_loc('Age')
SibSp_idx = data.columns.get_loc('SibSp')
Parch_idx = data.columns.get_loc('Parch')
Pclass_idx = data.columns.get_loc('Pclass')

for i in index_NaN_age :
    age_med = data['Age'].median()
    age_pred = data['Age'][((data['SibSp'] == data.iloc[i]['SibSp']) 
                           & (data['Parch'] == data.iloc[i]['Parch'])
                           & (data['Pclass'] == data.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred) :
        data.iat[i, Age_idx] = age_pred
    else :
        data.iat[i,Age_idx] = age_med

In [8]:
data['Cabin'] = data['Cabin'].fillna('X')
data['Cabin'] = data['Cabin'].apply(lambda x : str(x)[:1])


In [10]:
data['Fsize'] = data['SibSp'] + data['Parch'] +1
# Fsize 값으로 원핫인코딩함
data['Single'] = data['Fsize'].apply(lambda x : 1 if x==1 else 0)
data['SmallF'] = data['Fsize'].apply(lambda x : 1 if x==2 else 0)
data['MedF'] = data['Fsize'].apply(lambda x : 1 if 3<=x<=4 else 0)
data['LargeF'] = data['Fsize'].apply(lambda x : 1 if x>=5 else 0)

In [11]:
def titanic_fillna(df) :
    df.Age.fillna(data['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return(df)

def drop_feature(df) :
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

# 전처리 기본 2 : lambda 이용해서 Cabin의 값 1글자만 사용
def format_features(df) :
    
    features = ['Cabin','Sex','Embarked']
    for feature in features :
        le = preprocessing.LabelEncoder()
        le = le.fit(df.loc[:,feature])
        df.loc[:,feature] = le.transform(df.loc[:,feature])
        
    # Scalar 사용할 수 있음
        
    return(df)

def titanic_transform(df) :
    df = titanic_fillna(df)
    df = drop_feature(df)
    df = format_features(df)
    return(df)

In [12]:
data_train = titanic_transform(data)

In [15]:
y_titanic_train = data_train.loc[:,'Survived']
X_titanic_train = data_train.drop('Survived',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, y_titanic_train,
                                                    stratify = y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

In [40]:
kfold = KFold(n_splits=7,random_state=0,shuffle=True)

In [None]:
# RFC

In [41]:
rfc = RandomForestClassifier(n_jobs=-1,random_state=1234)

In [44]:
parameters = {
    'n_estimators':np.arange(50,1000,50),
    'max_features':np.arange(1,8),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(2,15)
}

n_iter_search=10
rfc_rgs = RandomizedSearchCV(rfc, param_distributions=parameters, cv=kfold,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
rfc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    4.1s finished


RandomizedSearchCV(cv=KFold(n_splits=7, random_state=0, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_lea...
                   param_distributions={'max_features': array([1, 2, 3, 4,

In [45]:
rfc_rgs.best_params_

{'n_estimators': 450,
 'min_samples_split': 3,
 'max_leaf_nodes': 7,
 'max_features': 5}

In [47]:
rfc_rgs.best_score_

0.8252840909090909

In [48]:
parameters = {
    'n_estimators':np.arange(425,475),
    'max_features':np.arange(4,7),
    'min_samples_split':np.arange(2,5),
    'max_leaf_nodes':np.arange(5,10)
}

n_iter_search=10
grid_rfc_clf = GridSearchCV(rfc, param_grid=parameters, cv=kfold,scoring='accuracy', n_jobs=-1)

grid_rfc_clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [21]:
grid_rfc_clf.best_params_

{'max_features': 5,
 'max_leaf_nodes': 11,
 'min_samples_split': 2,
 'n_estimators': 216}

In [22]:
rfc_best = grid_rfc_clf.best_estimator_
rfc_predictions = rfc_best.predict(X_test)
accuracy = accuracy_score(y_test,rfc_predictions)
print('rfc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

rfc에서 GridSearchCV로 찾은 결과 : 0.7627


In [None]:
# xgboost

In [49]:
XGBC = XGBClassifier(verbosity=1, n_jobs=-1)

In [52]:
xgc_param_grid = {
    'n_estimators':np.arange(50,700,50),
    'max_depth':np.arange(2,10),
    'colsample_bytree':np.arange(0,1,0.1)    
    
}

n_iter_search=10
xgbc_rgs = RandomizedSearchCV(XGBC, param_distributions=xgc_param_grid, cv=kfold,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
xgbc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    4.9s finished


RandomizedSearchCV(cv=KFold(n_splits=7, random_state=0, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective...
                                           verbosity=1),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'colsample_bytree': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                 

In [53]:
xgbc_rgs.best_params_

{'n_estimators': 50, 'max_depth': 7, 'colsample_bytree': 0.6000000000000001}

In [54]:
xgc_param_grid = {
    'n_estimators':np.arange(125,175),
    'max_depth':np.arange(5,8),
    'colsample_bytree':np.arange(0.5,0.8,0.1)    
    
}

n_iter_search=10
xgbc_gs = GridSearchCV(XGBC, param_grid=xgc_param_grid, cv=kfold,verbose=1,
                            scoring='accuracy', n_jobs=-1)

xgbc_gs.fit(X_train, y_train)

Fitting 7 folds for each of 600 candidates, totalling 4200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4200 out of 4200 | elapsed:  4.6min finished


GridSearchCV(cv=KFold(n_splits=7, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='bina...
             param_grid={'colsample_bytree': array([0.5, 0.6, 0.7, 0.8]),
                         'max_depth': array([5, 6, 7]),
                         'n_estimators': array([125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
       138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
       151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162

In [59]:
xgbc_gs.best_score_

0.8267045454545454

In [55]:
xgbc_gs.best_params_

{'colsample_bytree': 0.6, 'max_depth': 5, 'n_estimators': 127}

In [65]:
xgbc_best = xgbc_gs.best_estimator_

xgbc_predictions = xgbc_best.predict(X_test)
accuracy = accuracy_score(y_test,xgbc_predictions)
print('xgbc에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

xgbc에서 GridSearchCV로 찾은 결과 : 0.7853


In [None]:
# lightgboost

In [66]:
LGB = LGBMClassifier(random_sate=1234, n_jobs=-1)

In [67]:
lb_param_grid = {
    'n_estimators':np.arange(50,700,50),
    'learning_rate':np.arange(0.5,0,-0.05),
    'colsample_bytree':np.arange(0.1,1,0.1)
}

n_iter_search=10
lgbc_rgs = RandomizedSearchCV(LGB, param_distributions=lb_param_grid, cv=kfold,
                            scoring='accuracy', n_jobs=-1,
                            verbose=1,random_state=1234,
                            n_iter=n_iter_search)
lgbc_rgs.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.8s finished


RandomizedSearchCV(cv=KFold(n_splits=7, random_state=0, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, object...
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'colsample_bytree': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                             

In [68]:
lgbc_rgs.best_params_

{'n_estimators': 200,
 'learning_rate': 0.0500000000000001,
 'colsample_bytree': 0.1}

In [69]:
lb_param_grid = {
    'n_estimators':np.arange(175,226),
    # 러닝레이트는 내려가면서
    'learning_rate':np.arange(0.07,0.02,-0.01),
    'colsample_bytree':np.arange(0.1,1,0.1)
}


n_iter_search=10
lgbc_gs = GridSearchCV(LGB, param_grid=lb_param_grid, cv=kfold,verbose=1,
                            scoring='accuracy', n_jobs=-1)

lgbc_gs.fit(X_train, y_train)

Fitting 7 folds for each of 2295 candidates, totalling 16065 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1696 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 4196 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 7696 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 12196 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 16065 out of 16065 | elapsed:  2.0min finished


GridSearchCV(cv=KFold(n_splits=7, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=No...
                         'learning_rate': array([0.07, 0.06, 0.05, 0.04, 0.03]),
                         'n_estimators': array([175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
       201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,


In [71]:
lgbc_gs.best_params_

{'colsample_bytree': 0.8, 'learning_rate': 0.03, 'n_estimators': 195}

In [74]:
lgbc_gs.best_score_

0.8309659090909091

In [72]:
lgbc_best = lgbc_gs.best_estimator_

lgbc_predictions = lgbc_best.predict(X_test)
accuracy = accuracy_score(y_test,lgbc_predictions)
print('lgbm에서 GridSearchCV로 찾은 결과 : {0:.4f}'.format(accuracy))

lgbm에서 GridSearchCV로 찾은 결과 : 0.7910


In [None]:
#Voting

In [73]:
accuracy_rf = accuracy_score(y_test, rfc_best.predict(X_test))
accuracy_xg = accuracy_score(y_test, xgbc_best.predict(X_test))
accuracy_lg = accuracy_score(y_test, lgbc_best.predict(X_test))

In [70]:
eclf = VotingClassifier(estimators= [('rfc',rfc_best),('xgb',xgbc_best), ('lgbc',lgbc_best)],
                       voting='hard')
# voting = 'hard' (voting별로 같은 가중치)

eclf.fit(X_train, y_train)
eclf_pred = eclf.predict(X_test)
accuracy = accuracy_score(y_test, eclf_pred)
print('Hard Voting 에서로 찾은 결과 : {0:.4f}'.format(accuracy))

Hard Voting 에서로 찾은 결과 : 0.7571


In [None]:
test_Survived_RFC = pd.Series(rfc_best.predict(X_test),name='RFC')
test_Survived_XGB = pd.Series(xgbc_best.predict(X_test),name='XGB')
test_Survived_LGB = pd.Series(lgbc_best.predict(X_test),name='LGB')

ensemble_results = pd.concat([est_Survived_RFC,test_Survived_XGB,test_Survived_LGB],axis=1)
sns.heatmap(ensemble_results.corr().annot=True)

