In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import pickle
import gc

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb
%matplotlib inline

In [10]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### dataset

In [27]:
x_train, y_train, id_train = df_train[['Pclass', 'Fare', 'Age']], df_train[['Survived']], df_train[['PassengerId']]

In [32]:
x_train[x_train['Age'].isna()]

Unnamed: 0,Pclass,Fare,Age
5,3,8.4583,
17,2,13.0000,
19,3,7.2250,
26,3,7.2250,
28,3,7.8792,
...,...,...,...
859,3,7.2292,
863,3,69.5500,
868,3,9.5000,
878,3,7.8958,


### learning, cross-validation

In [18]:
# Hyper parameter
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

def train_cv(input_x, input_y, input_id, params, n_splits=5):
    metrics = []
    imp = pd.DataFrame()
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    
    for nfold in np.arange(n_splits):
        # dataset
        print('-'*20, nfold, '-'*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, :]
        x_va, y_va = input_x.loc[idx_va, :], input_y.loc[idx_va, :]
        print(f'x_tr:{x_tr.shape}, y_tr:{y_tr.shape}')
        print(f'x_va:{x_va.shape}, y_va:{y_va.shape}')
        print("input_y['Survived']_mean:{:.3f}, y_tr['Survived']_mean:{:.3f}, y_va['Survived']_mean{:.3f}".format(
            input_y['Survived'].mean(), y_tr['Survived'].mean(), y_va['Survived'].mean()
        ))
        
        # learning
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr, y_tr, 
                  eval_set=[(x_tr, y_tr), (x_va, y_va)], 
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=10)]
                 )
        
        # evaluation
        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        metric_tr = accuracy_score(y_tr, y_tr_pred)
        metric_va = accuracy_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print(f'[Accuracy]tr:{metric_tr}, [Accuracy]va:{metric_va}')
        
        # DataFrame(feature importance)
        _imp = pd.DataFrame({
            'col': input_x.columns,
            'imp': model.feature_importances_,
            'nfold': nfold
        })
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
        
        # result
        print('-'*20, 'result', '-'*20)
        metrics = np.array(metrics)
        print(metrics)
        print('[cv]  tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}'.format(metrics[:, 1].mean(), metrics[:, 1].std(),
                                                                    metrics[:, 2].mean(), metrics[:, 2].std(),
                                                                   ))
        imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
        imp.columns = ['imp', 'imp_std']
        imp = imp.reset_index(drop=False)
        print('Done')
        
        return imp, metrics

In [25]:
imp, metrics = train_cv(x_train, y_train, id_train, params, n_splits=5)

-------------------- 0 --------------------
x_tr:(712, 2), y_tr:(712, 1)
x_va:(179, 2), y_va:(179, 1)
input_y['Survived']_mean:0.384, y_tr['Survived']_mean:0.383, y_va['Survived']_mean0.385
[1]	training's auc: 0.762985	valid_1's auc: 0.729381
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.763607	valid_1's auc: 0.730237
[3]	training's auc: 0.763607	valid_1's auc: 0.730237
[4]	training's auc: 0.777045	valid_1's auc: 0.732411
[5]	training's auc: 0.775702	valid_1's auc: 0.735046
[6]	training's auc: 0.777383	valid_1's auc: 0.72747
[7]	training's auc: 0.774955	valid_1's auc: 0.738274
[8]	training's auc: 0.776974	valid_1's auc: 0.739394
[9]	training's auc: 0.780612	valid_1's auc: 0.729644
[10]	training's auc: 0.788155	valid_1's auc: 0.736957
[11]	training's auc: 0.792636	valid_1's auc: 0.739723
[12]	training's auc: 0.793779	valid_1's auc: 0.740382
[13]	training's auc: 0.793958	valid_1's auc: 0.735903
[14]	training's auc: 0.794342	valid_1's auc: 0.737088
[1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



[81]	training's auc: 0.839846	valid_1's auc: 0.719433
[82]	training's auc: 0.840547	valid_1's auc: 0.715217
[83]	training's auc: 0.840801	valid_1's auc: 0.718841
[84]	training's auc: 0.841102	valid_1's auc: 0.71581
[85]	training's auc: 0.841611	valid_1's auc: 0.713966
[86]	training's auc: 0.841677	valid_1's auc: 0.716601
[87]	training's auc: 0.842662	valid_1's auc: 0.716469
[88]	training's auc: 0.842261	valid_1's auc: 0.71726
[89]	training's auc: 0.842303	valid_1's auc: 0.717128
[90]	training's auc: 0.843037	valid_1's auc: 0.716206
[91]	training's auc: 0.843596	valid_1's auc: 0.716601
[92]	training's auc: 0.843179	valid_1's auc: 0.718182
[93]	training's auc: 0.84406	valid_1's auc: 0.720553
[94]	training's auc: 0.84421	valid_1's auc: 0.72029
[95]	training's auc: 0.844535	valid_1's auc: 0.721212
[96]	training's auc: 0.844702	valid_1's auc: 0.720553
[97]	training's auc: 0.844552	valid_1's auc: 0.720685
[98]	training's auc: 0.844952	valid_1's auc: 0.719368
[99]	training's auc: 0.844902	va

In [26]:
imp

Unnamed: 0,col,imp,imp_std
0,Fare,474.747741,
1,Pclass,371.877959,


In [23]:
metrics

array([[0.        , 0.8244382 , 0.70391061]])