In [33]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [34]:
df_train = pd.read_csv('./input/train.csv')
df_test = pd.read_csv('./input/test.csv')
df_gender_submission = pd.read_csv('./input/gender_submission.csv')

In [35]:
genders = {'male': 0, 'female': 1} # 辞書を作成
# Sexをgendersを用いて変換
df_train['Sex'] = df_train['Sex'].map(genders)
df_test['Sex'] = df_test['Sex'].map(genders)

# ダミー変数化
df_train = pd.get_dummies(df_train, columns=['Embarked'])
df_test = pd.get_dummies(df_test, columns = ['Embarked'])

# 不要な列の削除
df_train.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
df_test.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

In [36]:
df_train.head(6)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1
5,0,3,0,,0,0,8.4583,0,1,0


In [37]:
X_train = df_train.iloc[:,1:]
Y_train = df_train['Survived']

# 3分割交差検証を指定し、インスタンス化
skf = KFold(n_splits=3)

params = {
        'objective': 'binary',
        'learning_rate': 0.1,
        'num_leaves' : 300
}


# skf.split(X_train.Ytrain)で、X_trainとY_trainを3分割し、交差検証をする
for train_index, test_index in skf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]
    y_cv_train = Y_train[train_index]
    y_cv_test = Y_train[test_index]
    
    lgb_train = lgb.Dataset(X_cv_train,y_cv_train)
    lgb_eval = lgb.Dataset(X_cv_test,y_cv_test)
    
    gbm = lgb.train(params = params,
            train_set = lgb_train,
            num_boost_round=50,
            valid_sets=lgb_eval,
            early_stopping_rounds=20)
    
    y_pred = gbm.predict(X_cv_test, num_iteration=gbm.best_iteration)

    # acuuracyを表示
    preds = np.round(gbm.predict(X_cv_test))
    print(round(accuracy_score(y_cv_test,preds)*100,2))

[1]	valid_0's binary_logloss: 0.620239
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's binary_logloss: 0.593798
[3]	valid_0's binary_logloss: 0.572528
[4]	valid_0's binary_logloss: 0.554015
[5]	valid_0's binary_logloss: 0.539266
[6]	valid_0's binary_logloss: 0.527844
[7]	valid_0's binary_logloss: 0.518635
[8]	valid_0's binary_logloss: 0.510455
[9]	valid_0's binary_logloss: 0.504106
[10]	valid_0's binary_logloss: 0.498639
[11]	valid_0's binary_logloss: 0.493381
[12]	valid_0's binary_logloss: 0.490238
[13]	valid_0's binary_logloss: 0.486524
[14]	valid_0's binary_logloss: 0.484927
[15]	valid_0's binary_logloss: 0.482375
[16]	valid_0's binary_logloss: 0.481922
[17]	valid_0's binary_logloss: 0.480232
[18]	valid_0's binary_logloss: 0.475995
[19]	valid_0's binary_logloss: 0.47274
[20]	valid_0's binary_logloss: 0.469822
[21]	valid_0's binary_logloss: 0.468911
[22]	valid_0's binary_logloss: 0.469166
[23]	valid_0's binary_logloss: 0.469686
[24]	valid_0's binary_loglos

# scikit-learnを使う

In [39]:
# 3分割交差検証を指定し、インスタンス化
skf = KFold(n_splits=3)

params = {
        'objective': 'binary',
        'metric' : 'binary_error',
        'learning_rate': 0.1,
        'num_leaves' : 300,
        'random_seed' : 1
}

for train_index, test_index in skf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]
    y_cv_train = Y_train[train_index]
    y_cv_test = Y_train[test_index]

    gbm = lgb.LGBMClassifier(objective='binary',
                        num_leaves = 300,
                        learning_rate = 0.1,
                        )
    gbm.fit(X_cv_train, y_cv_train,
        eval_set = [(X_cv_test, y_cv_test)],
        early_stopping_rounds=10,)
    
    y_pred = gbm.predict(X_cv_test, num_iteration=gbm.best_iteration_)
    print(round(accuracy_score(y_cv_test,y_pred)*100,2))

[1]	valid_0's binary_logloss: 0.620239
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.593798
[3]	valid_0's binary_logloss: 0.572528
[4]	valid_0's binary_logloss: 0.554015
[5]	valid_0's binary_logloss: 0.539266
[6]	valid_0's binary_logloss: 0.527844
[7]	valid_0's binary_logloss: 0.518635
[8]	valid_0's binary_logloss: 0.510455
[9]	valid_0's binary_logloss: 0.504106
[10]	valid_0's binary_logloss: 0.498639
[11]	valid_0's binary_logloss: 0.493381
[12]	valid_0's binary_logloss: 0.490238
[13]	valid_0's binary_logloss: 0.486524
[14]	valid_0's binary_logloss: 0.484927
[15]	valid_0's binary_logloss: 0.482375
[16]	valid_0's binary_logloss: 0.481922
[17]	valid_0's binary_logloss: 0.480232
[18]	valid_0's binary_logloss: 0.475995
[19]	valid_0's binary_logloss: 0.47274
[20]	valid_0's binary_logloss: 0.469822
[21]	valid_0's binary_logloss: 0.468911
[22]	valid_0's binary_logloss: 0.469166
[23]	valid_0's binary_logloss: 0.469686
[24]	valid_0's binary_loglos

In [40]:
X_train = df_train.iloc[:,1:]
Y_train = df_train['Survived']

In [41]:
# 3分割交差検証を指定し、インスタンス化

params = {
        'objective': 'binary',
        'metric' : 'binary_error',
        'learning_rate': 0.1,
        'num_leaves' : 300,
        'random_seed' : 1
}



gbm = lgb.LGBMClassifier(objective='binary',
                    num_leaves = 300,
                    learning_rate = 0.1,
                    )
gbm.fit(df_train.iloc[:,1:],df_train['Survived'])

y_pred = gbm.predict(df_test)

In [None]:
    eval = lgb.cv(lgbm_params,
        lgb_train,
        nfold=3,
        stratified=True,
        num_boost_round=20000,
        early_stopping_rounds=40,
        verbose_eval=100,
        seed = 0,
        show_stdv=True,
        )

In [28]:
gbm.get_params

<bound method LGBMModel.get_params of LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=300, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)>

In [26]:
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [30]:
df_gender_submission['Survived'] = y_pred

In [32]:
df_gender_submission.to_csv('lightgbm.csv',index = False)