In [70]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

In [71]:
df_train = pd.read_csv('./input/train.csv')
df_test = pd.read_csv('./input/test.csv')
df_gender_submission = pd.read_csv('./input/gender_submission.csv')

In [72]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [73]:
genders = {'male': 0, 'female': 1} # 辞書を作成
# Sexをgendersを用いて変換
df_train['Sex'] = df_train['Sex'].map(genders)
df_test['Sex'] = df_test['Sex'].map(genders)

In [74]:
# ダミー変数化
df_train = pd.get_dummies(df_train, columns=['Embarked'])
df_test = pd.get_dummies(df_test, columns = ['Embarked'])

In [75]:
# 不要な列の削除
df_train.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
df_test.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

In [76]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1


In [77]:
X_train = df_train.iloc[:,1:]
Y_train = df_train['Survived'].values

In [78]:
# 3分割交差検証を指定し、インスタンス化
skf = StratifiedKFold(n_splits=3)

params = {
        'objective': 'binary',
        'learning_rate': 0.1,
        'num_leaves' : 300
}


# skf.split(X_train.Ytrain)で、X_trainとY_trainを3分割し、交差検証をする
for train_index, test_index in skf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]
    y_cv_train = Y_train[train_index]
    y_cv_test = Y_train[test_index]
    
    lgb_train = lgb.Dataset(X_cv_train,y_cv_train)
    lgb_eval = lgb.Dataset(X_cv_test,y_cv_test)
    
    gbm = lgb.train(params = params,
            train_set = lgb_train,
            num_boost_round=50,
            valid_sets=lgb_eval,
            early_stopping_rounds=20)
    
    y_pred = gbm.predict(X_cv_test, num_iteration=gbm.best_iteration)

    # acuuracyを表示
    preds = np.round(gbm.predict(X_cv_test))
    print(round(accuracy_score(y_cv_test,preds)*100,2))

[1]	valid_0's binary_logloss: 0.653809
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's binary_logloss: 0.622564
[3]	valid_0's binary_logloss: 0.597825
[4]	valid_0's binary_logloss: 0.575612
[5]	valid_0's binary_logloss: 0.557721
[6]	valid_0's binary_logloss: 0.542859
[7]	valid_0's binary_logloss: 0.531736
[8]	valid_0's binary_logloss: 0.522472
[9]	valid_0's binary_logloss: 0.514633
[10]	valid_0's binary_logloss: 0.509292
[11]	valid_0's binary_logloss: 0.504134
[12]	valid_0's binary_logloss: 0.499229
[13]	valid_0's binary_logloss: 0.496218
[14]	valid_0's binary_logloss: 0.491775
[15]	valid_0's binary_logloss: 0.490236
[16]	valid_0's binary_logloss: 0.488807
[17]	valid_0's binary_logloss: 0.486969
[18]	valid_0's binary_logloss: 0.486387
[19]	valid_0's binary_logloss: 0.482192
[20]	valid_0's binary_logloss: 0.479107
[21]	valid_0's binary_logloss: 0.477294
[22]	valid_0's binary_logloss: 0.476172
[23]	valid_0's binary_logloss: 0.475082
[24]	valid_0's binary_loglo

# scikit-learnを使う

In [69]:
# 3分割交差検証を指定し、インスタンス化
skf = StratifiedKFold(n_splits=3)

params = {
        'objective': 'binary',
        'metric' : 'binary_error',
        'learning_rate': 0.1,
        'num_leaves' : 300,
}

for train_index, test_index in skf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]
    y_cv_train = Y_train[train_index]
    y_cv_test = Y_train[test_index]

    gbm = lgb.LGBMClassifier(objective='binary',
                        num_leaves = 300,
                        learning_rate=0.1,
                        )
    gbm.fit(X_cv_train, y_cv_train,
        eval_set = [(X_cv_test, y_cv_test)],
        early_stopping_rounds=10,)
    
    y_pred = gbm.predict(X_cv_test, num_iteration=gbm.best_iteration_)
    print(gbm.best_iteration_)
    print(round(accuracy_score(y_cv_test,y_pred)*100,2))

[1]	valid_0's binary_logloss: 0.653809
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.622564
[3]	valid_0's binary_logloss: 0.597825
[4]	valid_0's binary_logloss: 0.575612
[5]	valid_0's binary_logloss: 0.557721
[6]	valid_0's binary_logloss: 0.542859
[7]	valid_0's binary_logloss: 0.531736
[8]	valid_0's binary_logloss: 0.522472
[9]	valid_0's binary_logloss: 0.514633
[10]	valid_0's binary_logloss: 0.509292
[11]	valid_0's binary_logloss: 0.504134
[12]	valid_0's binary_logloss: 0.499229
[13]	valid_0's binary_logloss: 0.496218
[14]	valid_0's binary_logloss: 0.491775
[15]	valid_0's binary_logloss: 0.490236
[16]	valid_0's binary_logloss: 0.488807
[17]	valid_0's binary_logloss: 0.486969
[18]	valid_0's binary_logloss: 0.486387
[19]	valid_0's binary_logloss: 0.482192
[20]	valid_0's binary_logloss: 0.479107
[21]	valid_0's binary_logloss: 0.477294
[22]	valid_0's binary_logloss: 0.476172
[23]	valid_0's binary_logloss: 0.475082
[24]	valid_0's binary_loglo

  if diff:
  if diff:
  if diff:
