# Overview
- LightGBM を使って Titanic の問題を解く
- data は、`15_pytorch_NN` 内にある

# Import everything I need :)

In [88]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Preparation

## load data

In [46]:
# Training split: read the CSV into a DataFrame (still contains the
# 'Survived' label column at this point).
path = '15_pytorch_NN/train.csv'
train = pd.read_csv(path)

# Test split: same loader; note that `path` is reused and now points at
# the test file.
path = '15_pytorch_NN/test.csv'
test = pd.read_csv(path)

In [47]:
# Pull the label out as a NumPy array, then keep only the feature columns
# in `train` so it has the same columns as `test`.
target = train['Survived'].to_numpy()
train = train.drop(columns=['Survived'])

# EDA

data shape

In [48]:
# Report (rows, columns) for both splits, one line each.
print(
    'data shape',
    f'train: {train.shape}',
    f'test:  {test.shape}',
    sep='\n',
)

data shape
train: (891, 11)
test:  (418, 11)


features

In [49]:
# Preview the first rows of the training features (label already removed).
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


check null

In [50]:
# Training split: number of missing values per column (sum over rows).
train.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [51]:
# Test split: number of missing values per column (sum over rows).
test.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# FeatureEngineering

NameとTicket はいらないだろう

In [52]:
def delete_name(df):
    """Return a copy of ``df`` without the 'Name' and 'Ticket' columns.

    Despite the function name, both columns are removed. The input frame
    is not modified; a ``KeyError`` is raised if either column is absent.
    """
    unused_columns = ['Name', 'Ticket']
    return df.drop(columns=unused_columns)

# Apply the same column removal to both splits so their schemas stay aligned.
train, test = delete_name(train), delete_name(test)

In [53]:
# Check that 'Name' and 'Ticket' are gone from the training features.
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.25,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.925,,S
3,4,1,female,35.0,1,0,53.1,C123,S
4,5,3,male,35.0,0,0,8.05,,S


ラベルエンコーディング

In [57]:
# Label-encode the listed columns into integer codes.
# Each encoder is fitted on train + test together so that a category that
# appears in only one split still gets a code and transform() cannot fail
# on an unseen label.
for f in ['Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']:
    if f in train.columns:
        train_col, test_col = train[f], test[f]
        if not pd.api.types.is_numeric_dtype(train_col):
            # String columns such as 'Cabin' / 'Embarked' contain NaN.
            # LabelEncoder cannot order a mix of str and float NaN (it raises
            # TypeError in current scikit-learn), so missing values are turned
            # into an explicit 'missing' category first.
            train_col = train_col.fillna('missing').astype(str)
            test_col = test_col.fillna('missing').astype(str)
        lbl = LabelEncoder()
        lbl.fit(list(train_col.values) + list(test_col.values))
        train[f] = lbl.transform(list(train_col.values))
        test[f] = lbl.transform(list(test_col.values))

In [58]:
# Check that the encoded columns now hold integer codes.
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,1,22.0,1,0,7.25,186,2
1,2,1,0,38.0,1,0,71.2833,106,0
2,3,3,0,26.0,0,0,7.925,186,2
3,4,1,0,35.0,1,0,53.1,70,2
4,5,3,1,35.0,0,0,8.05,186,2


# Train

In [61]:
# Conventional modelling names; these are aliases of the existing objects,
# not copies.
X, y, X_test = train, target, test

In [95]:
# 3-fold cross-validation. n_splits is passed explicitly because the
# KFold() default is 5 in current scikit-learn, which would not match
# the intended 3 folds.
kf = KFold(n_splits=3)
# Per-fold validation accuracy (in percent) and the fitted model of each fold
score_list = []
models = []
for fold_, (train_index, valid_index) in enumerate(kf.split(X, y)):
    # X is a DataFrame (positional row selection via iloc); y is a NumPy array.
    train_x = X.iloc[train_index]
    valid_x = X.iloc[valid_index]
    train_y = y[train_index]
    valid_y = y[valid_index]
    print(f'fold{fold_ + 1} start')

    gbm = lgb.LGBMClassifier(objective='binary', n_estimators=100)
    # Early stopping and periodic logging are configured through callbacks:
    # the early_stopping_rounds= / verbose= keyword arguments of fit() are
    # not accepted by current LightGBM releases.
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
            callbacks=[lgb.early_stopping(stopping_rounds=20),
                       lgb.log_evaluation(period=10)])

    # Out-of-fold class predictions using the best iteration found by
    # early stopping.
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)

    score_list.append(round(accuracy_score(valid_y, oof)*100,2))
    models.append(gbm)  # keep the trained model of this fold for later use
    print(f'fold{fold_ + 1} end\n' )

print(score_list, '平均score', np.mean(score_list))

fold1 start
Training until validation scores don't improve for 20 rounds.
[10]	valid_0's binary_logloss: 0.505473
[20]	valid_0's binary_logloss: 0.464627
[30]	valid_0's binary_logloss: 0.457694
[40]	valid_0's binary_logloss: 0.468137
Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.456209
fold1 end

fold1 start
Training until validation scores don't improve for 20 rounds.
[10]	valid_0's binary_logloss: 0.459207
[20]	valid_0's binary_logloss: 0.422734
[30]	valid_0's binary_logloss: 0.429598
[40]	valid_0's binary_logloss: 0.435298
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.420867
fold2 end

fold1 start
Training until validation scores don't improve for 20 rounds.
[10]	valid_0's binary_logloss: 0.470931
[20]	valid_0's binary_logloss: 0.42772
[30]	valid_0's binary_logloss: 0.400176
[40]	valid_0's binary_logloss: 0.385825
[50]	valid_0's binary_logloss: 0.388337
[60]	valid_0's binary_logloss: 0.390696
Early stopping, best iteration is:
[40]	valid_0