In [1]:
import os

import numpy as np
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
# Single random seed shared by the Optuna sampler and the LightGBM parameters below.
SEED = 0

In [3]:
# Load the train / validation arrays stored under the `cv1_*` file names.
# NOTE(review): a later cell defines `def train(cv_idx)`, which rebinds the
# name `train`; the cells that index `train[...]` only work when they are
# executed before that definition.
train = np.load('cv_files/cv1_train.npy')
valid = np.load('cv_files/cv1_valid.npy')
# Column names; the position of a name in this array is used below as the
# column index into `train` / `valid`.
# allow_pickle=True unpickles the stored object array -- only load this file
# from a trusted source.
column = np.load('cv_files/features_all_cv.npy', allow_pickle=True)

In [4]:
# Display the column names loaded above.
column

array(['row_id', 'user_id', 'content_id', 'content_type_id',
       'answered_correctly', 'prior_question_elapsed_time',
       'prior_question_had_explanation', 'answered_correctly_avg_c',
       'answered_correctly_sum_u', 'count_u', 'answered_correctly_avg_u',
       'prior_question_elapsed_time_mean', 'question_id', 'part'],
      dtype=object)

In [9]:
# Position of the label column inside the data arrays.
target_col = np.flatnonzero(column == 'answered_correctly')[0]
# Features given to the model, in the order the model receives them.
feature_name = [
    'answered_correctly_avg_u',
    'answered_correctly_sum_u',
    'count_u',
    'answered_correctly_avg_c',
    'part',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
]
# Map every feature name to the first column position carrying that name.
feature_col = np.array([np.flatnonzero(column == name)[0] for name in feature_name])

In [10]:
# Display the resolved feature column indices and the label column index.
feature_col, target_col

(array([10,  8,  9,  7, 13,  6,  5], dtype=int64), 4)

In [11]:
# Wrap the selected feature columns and the label column of each split
# in LightGBM datasets.
lgb_train = lgb.Dataset(
    data=train[:, feature_col],
    label=train[:, target_col],
)
lgb_valid = lgb.Dataset(
    data=valid[:, feature_col],
    label=valid[:, target_col],
)

In [12]:
# Baseline run: binary objective with otherwise default parameters, at most
# 10000 boosting rounds, early stopping with a patience of 10 rounds and a
# metric line printed every 100 rounds.
model = lgb.train(
    params={'objective': 'binary'},
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    early_stopping_rounds=10,
    verbose_eval=100,
)

[LightGBM] [Info] Number of positive: 63676135, number of negative: 33141405
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 96817540, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.657692 -> initscore=0.653026
[LightGBM] [Info] Start training from score 0.653026
Training until validation scores don't improve for 10 rounds
[100]	training's binary_logloss: 0.548592	valid_1's binary_logloss: 0.55389
[200]	training's binary_logloss: 0.548165	valid_1's binary_logloss: 0.553372
[300]	training's binary_logloss: 0.547963	valid_1's binary_logloss: 0.553143
[400]	training's binary_logloss: 0.547843	valid_1's binary_logloss: 0.553021
[500]	training's binary_logloss: 0.547741	valid_1's binary_logloss: 0.552928
[600]	training's binary_logloss: 0.547655	valid_1's binary_logloss: 0.55285
[700]	training's bi

In [13]:
# ROC AUC of the baseline model on the validation split.
print(
    'auc:',
    roc_auc_score(
        y_true=valid[:, target_col],
        y_score=model.predict(valid[:, feature_col]),
    ),
)

auc: 0.7570404951690736


In [14]:
# Persist the baseline model, truncated at the best early-stopping iteration.
# save_model returns the Booster itself; the trailing semicolon keeps that
# repr out of the cell output.
model.save_model('model/cv_process/sample.txt', num_iteration=model.best_iteration);

<lightgbm.basic.Booster at 0x207e5565848>

In [9]:
def read_data(cv_idx):
    """Load the train and validation arrays of one CV fold.

    Reads ``cv_files/cv<cv_idx>_train.npy`` and ``cv_files/cv<cv_idx>_valid.npy``
    (relative to the working directory) and returns them as a
    ``(train, valid)`` tuple.
    """
    fold_paths = (
        'cv_files/cv{0}_train.npy'.format(cv_idx),
        'cv_files/cv{0}_valid.npy'.format(cv_idx),
    )
    fold_train, fold_valid = (np.load(path) for path in fold_paths)
    return fold_train, fold_valid

def get_dataset(cv_idx):
    """Build the LightGBM datasets and raw validation arrays for one CV fold.

    Parameters
    ----------
    cv_idx
        Fold number, forwarded to ``read_data``.

    Returns
    -------
    tuple
        ``(lgb_train, lgb_valid, x_valid, y_valid)``: the training and
        validation ``lgb.Dataset`` objects, followed by the validation
        feature matrix and validation labels as NumPy arrays.
    """
    # allow_pickle=True unpickles the stored object array -- trusted files only.
    column = np.load('cv_files/features_all_cv.npy', allow_pickle=True)
    target_col = np.where(column == 'answered_correctly')[0][0]
    feature_name = ['answered_correctly_avg_u', 'answered_correctly_sum_u', 'count_u', 'answered_correctly_avg_c', 'part', 'prior_question_had_explanation', 'prior_question_elapsed_time']
    feature_col = np.array([np.where(column == col)[0][0] for col in feature_name])
    train_arr, valid_arr = read_data(cv_idx)
    # Index the validation split once: selecting columns with an index array
    # copies the data, so the same arrays are reused for the Dataset and for
    # the return value instead of being materialised twice.
    x_valid = valid_arr[:, feature_col]
    y_valid = valid_arr[:, target_col]
    lgb_train = lgb.Dataset(train_arr[:, feature_col], train_arr[:, target_col])
    # Tie the validation set to the training set so both share the same binning.
    lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
    return lgb_train, lgb_valid, x_valid, y_valid

def create_model(trial):
    """Train one LightGBM model with hyper-parameters sampled from an Optuna trial.

    Reads the module-level ``lgb_train`` / ``lgb_valid`` datasets and ``SEED``.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    The trained booster returned by ``lgb.train``.
    """
    # Search space; every sampled value is recorded under its name in the
    # trial's params and therefore ends up in ``study.best_params``.
    num_leaves = trial.suggest_int('num_leaves', 26, 32)
    n_estimators = trial.suggest_int('n_estimators', 280, 350)
    max_depth = trial.suggest_int('max_depth', 7, 9)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 0.5)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 25, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.1, 1.0)
    # LightGBM ignores bagging_fraction while bagging_freq is 0 (its default),
    # so the fraction above would otherwise be tuned without any effect.
    # The frequency is "sampled" from a single-value range so that it is stored
    # with the other trial params and is replayed by code that trains from
    # ``best_params``.
    bagging_freq = trial.suggest_int('bagging_freq', 1, 1)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.1, 1.0)
    model = lgb.train(
        {
            'objective': 'binary',
            'num_leaves': num_leaves,
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'min_data_in_leaf': min_data_in_leaf,
            'bagging_fraction': bagging_fraction,
            'bagging_freq': bagging_freq,
            'feature_fraction': feature_fraction,
            'random_state': SEED
        },
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=-1,
        # `n_estimators` is an alias of the boosting-round count: when it was
        # passed inside the params dict it overrode the former
        # num_boost_round=10000 (with a warning). Passing the sampled value
        # here keeps the same round limit without the conflicting settings.
        num_boost_round=n_estimators,
        early_stopping_rounds=10
    )
    return model

def objective(trial):
    """Optuna objective: validation ROC AUC of the model trained for ``trial``.

    Scores against the module-level ``x_valid`` / ``y_valid`` arrays.
    """
    booster = create_model(trial)
    valid_pred = booster.predict(x_valid)
    return roc_auc_score(y_valid, valid_pred)

def train(cv_idx):
    """Train, save and score the model for one CV fold.

    Reads the module-level ``params``, ``lgb_train``, ``lgb_valid``,
    ``x_valid`` and ``y_valid``; they must be set for the fold before calling.

    Parameters
    ----------
    cv_idx
        Fold number; only used in the output file name and the printed line.

    Side effects
    ------------
    Writes ``model/cv_process/baseline_cv<cv_idx>.txt`` and prints the
    validation ROC AUC. Returns ``None``.
    """
    model_path = 'model/cv_process/baseline_cv{0}.txt'.format(cv_idx)
    # Make sure the output directory exists *before* the (long) training run,
    # so a missing folder cannot throw away a finished model at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=100,
        num_boost_round=10000,
        early_stopping_rounds=10
    )
    # Only keep the trees up to the best early-stopping iteration.
    model.save_model(model_path, num_iteration=model.best_iteration)
    print('--- cv_idx : {0}, auc : {1}'.format(cv_idx, roc_auc_score(y_valid, model.predict(x_valid))))


In [11]:
# Set to True to run an Optuna search per fold before the final training;
# with False every fold is trained with LightGBM defaults.
optimize = False
for cv_idx in np.arange(1, 6):
    sampler = TPESampler(seed=SEED)
    # These names are module-level on purpose: `objective`, `create_model`
    # and `train` read them as globals.
    lgb_train, lgb_valid, x_valid, y_valid = get_dataset(cv_idx)
    if not optimize:
        params = {}
    else:
        optim = optuna.create_study(sampler=sampler, direction="maximize")
        optim.optimize(objective, n_trials=50)
        params = optim.best_params
    # Settings applied regardless of how the other parameters were obtained.
    params.update({'objective': 'binary', 'random_seed': SEED})
    train(cv_idx)

[LightGBM] [Info] Number of positive: 63676135, number of negative: 33141405
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 96817540, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.657692 -> initscore=0.653026
[LightGBM] [Info] Start training from score 0.653026
Training until validation scores don't improve for 10 rounds
[100]	training's binary_logloss: 0.548573	valid_1's binary_logloss: 0.553858
[200]	training's binary_logloss: 0.548166	valid_1's binary_logloss: 0.553376
[300]	training's binary_logloss: 0.547969	valid_1's binary_logloss: 0.553161
[400]	training's binary_logloss: 0.547824	valid_1's binary_logloss: 0.553013
[500]	training's binary_logloss: 0.547726	valid_1's binary_logloss: 0.552917
[600]	training's binary_logloss: 0.547644	valid_1's binary_logloss: 0.552839
[700]	training's 