In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import pickle
import gc

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb
%matplotlib inline

In [10]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### dataset

In [19]:
x_train, y_train, id_train = df_train[['Pclass', 'Fare', 'Age']], df_train[['Survived']], df_train[['PassengerId']]

### learning, cross-validation

In [18]:
# Hyper parameter
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

def train_cv(input_x, input_y, input_id, params, n_splits=5):
    metrics = []
    imp = pd.DataFrame()
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    
    for nfold in np.arange(n_splits):
        # dataset
        print('-'*20, nfold, '-'*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, :]
        x_va, y_va = input_x.loc[idx_va, :], input_y.loc[idx_va, :]
        print(f'x_tr:{x_tr.shape}, y_tr:{y_tr.shape}')
        print(f'x_va:{x_va.shape}, y_va:{y_va.shape}')
        print("input_y['Survived']_mean:{:.3f}, y_tr['Survived']_mean:{:.3f}, y_va['Survived']_mean{:.3f}".format(
            input_y['Survived'].mean(), y_tr['Survived'].mean(), y_va['Survived'].mean()
        ))
        
        # learning
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr, y_tr, 
                  eval_set=[(x_tr, y_tr), (x_va, y_va)], 
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=10)]
                 )
        
        # evaluation
        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        metric_tr = accuracy_score(y_tr, y_tr_pred)
        metric_va = accuracy_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print(f'[Accuracy]tr:{metric_tr}, [Accuracy]va:{metric_va}')
        
        # DataFrame(feature importance)
        _imp = pd.DataFrame({
            'col': input_x.columns,
            'imp': model.feature_importances_,
            'nfold': nfold
        })
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
        
        # result
        print('-'*20, 'result', '-'*20)
        metrics = np.array(metrics)
        print(metrics)
        print('[cv]  tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}'.format(metrics[:, 1].mean(), metrics[:, 1].std(),
                                                                    metrics[:, 2].mean(), metrics[:, 2].std(),
                                                                   ))
        imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
        imp.columns = ['imp', 'imp_std']
        imp = imp.reset_index(drop=False)
        print('Done')
        
        return imp, metrics

In [21]:
imp, metrics = train_cv(x_train, y_train, id_train, params, n_splits=5)

-------------------- 0 --------------------
x_tr:(712, 3), y_tr:(712, 1)
x_va:(179, 3), y_va:(179, 1)
input_y['Survived']_mean:0.384, y_tr['Survived']_mean:0.383, y_va['Survived']_mean0.385
[1]	training's auc: 0.782093	valid_1's auc: 0.657708
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.795214	valid_1's auc: 0.669236
[3]	training's auc: 0.805598	valid_1's auc: 0.67859
[4]	training's auc: 0.810237	valid_1's auc: 0.674045
[5]	training's auc: 0.809636	valid_1's auc: 0.676943
[6]	training's auc: 0.822132	valid_1's auc: 0.696904
[7]	training's auc: 0.825949	valid_1's auc: 0.704743
[8]	training's auc: 0.82743	valid_1's auc: 0.701383
[9]	training's auc: 0.828135	valid_1's auc: 0.703755
[10]	training's auc: 0.834706	valid_1's auc: 0.700264
[11]	training's auc: 0.839153	valid_1's auc: 0.699736
[12]	training's auc: 0.840522	valid_1's auc: 0.704348
[13]	training's auc: 0.842729	valid_1's auc: 0.704348
[14]	training's auc: 0.844823	valid_1's auc: 0.705138
[15

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[110]	training's auc: 0.935818	valid_1's auc: 0.721014
[111]	training's auc: 0.935943	valid_1's auc: 0.720487
[112]	training's auc: 0.936227	valid_1's auc: 0.720224
[113]	training's auc: 0.936494	valid_1's auc: 0.72141
[114]	training's auc: 0.936819	valid_1's auc: 0.721146
[115]	training's auc: 0.937253	valid_1's auc: 0.721278
[116]	training's auc: 0.937854	valid_1's auc: 0.721673
[117]	training's auc: 0.938363	valid_1's auc: 0.721937
[118]	training's auc: 0.938705	valid_1's auc: 0.721805
[119]	training's auc: 0.939181	valid_1's auc: 0.722596
[120]	training's auc: 0.939264	valid_1's auc: 0.722991
[121]	training's auc: 0.939665	valid_1's auc: 0.723518
[122]	training's auc: 0.939932	valid_1's auc: 0.723913
[123]	training's auc: 0.940507	valid_1's auc: 0.723386
[124]	training's auc: 0.9409	valid_1's auc: 0.722464
[125]	training's auc: 0.940749	valid_1's auc: 0.7222
[126]	training's auc: 0.941208	valid_1's auc: 0.722069
[127]	training's auc: 0.941434	valid_1's auc: 0.722991
[128]	training'

In [22]:
imp

Unnamed: 0,col,imp,imp_std
0,Age,785.061933,
1,Fare,898.232602,
2,Pclass,416.319329,


In [23]:
metrics

array([[0.        , 0.8244382 , 0.70391061]])