In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import time, gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve

# Raw competition tables. `order_id` is forced to str in both files; it is the
# merge key used to attach the per-order features below.
# The frames are bound to `train` / `test` because the following cells read
# those names (`train.merge(...)`, `test.merge(...)`, `test.iloc[...]`,
# `test.to_csv(...)`); the merged frames are what get called train_1 / test_1.
train = pd.read_csv('training-set.csv', dtype={'order_id': str})
test = pd.read_csv('testing-set.csv', dtype={'order_id': str})
# Pre-computed per-order feature table.
# NOTE: unpickling executes arbitrary code -- only load pickle files you trust.
order = pd.read_pickle('order1421_newfillna.pkl')

In [None]:
ts = time.time()
# Attach the per-order features to the raw rows (default inner join on order_id).
train_1=train.merge(order,on='order_id')
test_1=test.merge(order,on='order_id')
# The submission cells write predictions made on test_1 back into `test` by
# position, so the merge must neither drop, duplicate nor reorder test rows.
# Fail here, with a clear message, rather than at submission time.
assert np.array_equal(test_1['order_id'].values, test['order_id'].values), \
    "merging `order` changed the test rows; predictions would not line up with `test`"
# One shared splitter (fixed seed) reused by every training cell below.
folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=666)
# One column per level-1 model (six models are trained below).
oof_preds = np.zeros((train_1.shape[0],6))
sub_preds = np.zeros((test_1.shape[0],6))
feature_importance_df = pd.DataFrame()
# Every column except the id, the target and the three date columns is a feature.
feats = [f for f in train_1.columns if f not in ['order_id','deal_or_not','order_date','begin_date','return_date']]
print ('feats:' + str(len(feats)),feats)
time.time()-ts

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 0: DART booster, uniform_drop=False, xgboost_dart_mode=False.
# Trains one booster per CV fold; out-of-fold predictions go to column 0 of
# `oof_preds`, the fold-averaged test predictions to column 0 of `sub_preds`.
ts = time.time()
model_col = 0  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 100,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
    # parameters for dart (drop_rate, skip_drop and drop_seed left at library defaults)
    'max_drop': 100,
    'uniform_drop': False,
    'xgboost_dart_mode': False,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
# NOTE(review): LightGBM's early-stopping callback warns that early stopping is
# not available in dart mode -- confirm that `best_iteration` is meaningful here.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.753232 0.757929 0.753538 0.755814 0.756462

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 1: DART booster, uniform_drop=False, xgboost_dart_mode=True.
# Trains one booster per CV fold; out-of-fold predictions go to column 1 of
# `oof_preds`, the fold-averaged test predictions to column 1 of `sub_preds`.
ts = time.time()
model_col = 1  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 100,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
    # parameters for dart (drop_rate, skip_drop and drop_seed left at library defaults)
    'max_drop': 100,
    'uniform_drop': False,
    'xgboost_dart_mode': True,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
# NOTE(review): LightGBM's early-stopping callback warns that early stopping is
# not available in dart mode -- confirm that `best_iteration` is meaningful here.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.751736 0.755271 0.75038 0.753552 0.754325

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 2: DART booster, uniform_drop=True, xgboost_dart_mode=False.
# Trains one booster per CV fold; out-of-fold predictions go to column 2 of
# `oof_preds`, the fold-averaged test predictions to column 2 of `sub_preds`.
ts = time.time()
model_col = 2  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 100,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
    # parameters for dart (drop_rate, skip_drop and drop_seed left at library defaults)
    'max_drop': 100,
    'uniform_drop': True,
    'xgboost_dart_mode': False,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
# NOTE(review): LightGBM's early-stopping callback warns that early stopping is
# not available in dart mode -- confirm that `best_iteration` is meaningful here.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.752523 0.757315 0.753433 0.756181 0.756143

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 3: DART booster, uniform_drop=True, xgboost_dart_mode=True.
# Trains one booster per CV fold; out-of-fold predictions go to column 3 of
# `oof_preds`, the fold-averaged test predictions to column 3 of `sub_preds`.
ts = time.time()
model_col = 3  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 100,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
    # parameters for dart (drop_rate, skip_drop and drop_seed left at library defaults)
    'max_drop': 100,
    'uniform_drop': True,
    'xgboost_dart_mode': True,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
# NOTE(review): LightGBM's early-stopping callback warns that early stopping is
# not available in dart mode -- confirm that `best_iteration` is meaningful here.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.75104 0.755004 0.750773 0.753097 0.753949

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 4: plain GBDT booster (min_data_in_leaf=20).
# Trains one booster per CV fold; out-of-fold predictions go to column 4 of
# `oof_preds`, the fold-averaged test predictions to column 4 of `sub_preds`.
ts = time.time()
model_col = 4  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 20,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.750171 0.7544 0.75046 0.753191 0.75456

In [None]:
# Run a full garbage-collection pass before the next training cell; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Level-1 model 5: LightGBM random-forest mode ('rf') with row bagging.
# Trains one model per CV fold; out-of-fold predictions go to column 5 of
# `oof_preds`, the fold-averaged test predictions to column 5 of `sub_preds`.
ts = time.time()
model_col = 5  # column of oof_preds / sub_preds owned by this cell

# Identical for every fold, so built once instead of inside the loop.
params = {
    'nthread': 32,
    'boosting_type': 'rf',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'max_depth': 8,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.9,
    'bagging_freq': 100,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 20,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
}

# Test predictions are accumulated with `+=` below. Clear this model's column
# first so that re-running the cell does not add onto the previous run.
sub_preds[:, model_col] = 0

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0 in favour of
# callbacks) -- confirm the installed version.
fold_importances = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_1[feats], train_1['deal_or_not'])):
    train_x, train_y = train_1[feats].iloc[train_idx], train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = train_1[feats].iloc[valid_idx], train_1['deal_or_not'].iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=5000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=100)
    oof_preds[valid_idx, model_col] = bst.predict(valid_x, num_iteration=bst.best_iteration)
    sub_preds[:, model_col] += bst.predict(test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

    # Per-fold feature importance; gain is rescaled to percent of the fold total.
    gain = bst.feature_importance('gain')
    fold_importances.append(pd.DataFrame({
        'feature': bst.feature_name(),
        'split': bst.feature_importance('split'),
        'gain': 100*gain/gain.sum(),
        'fold': n_fold,
    }).sort_values('gain', ascending=False))
    # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx, model_col])))

# Single concat instead of growing the frame once per fold.
feature_importance_df = pd.concat([feature_importance_df] + fold_importances, axis=0)
time.time()-ts

In [None]:
# 0.701442 0.70563 0.699814 0.701301 0.706263

In [None]:
# Run a full garbage-collection pass after the last level-1 model; the
# returned count of unreachable objects is shown as the cell output.
gc.collect()

In [None]:
# Overall out-of-fold AUC of each level-1 model, one value per column of
# `oof_preds`, printed on a single line.
# `oof_preds` is created as a NumPy array and only becomes a DataFrame after
# the pickle-reload cell further down, so `.iloc` raised AttributeError when
# the notebook was run top to bottom. np.asarray accepts both forms.
oof_matrix = np.asarray(oof_preds)
print(*(roc_auc_score(train_1['deal_or_not'], oof_matrix[:, col])
        for col in range(oof_matrix.shape[1])))

In [None]:
# Submission built from level-1 model 0 only: its fold-averaged test
# predictions overwrite the second column of `test` (presumably the target
# placeholder -- confirm against testing-set.csv), then the frame is written out.
# np.asarray keeps this working when `sub_preds` has been rebound to the
# DataFrame reloaded from 'sub_preds.pkl'; `sub_preds[:,0]` only works on an ndarray.
test.iloc[:,1] = np.asarray(sub_preds)[:,0]
test.to_csv('submission1421_dartlrate1.csv', index=False)

In [None]:
# Checkpoint the level-1 outputs so the stacking step can be re-run without
# retraining: wrap both prediction matrices in DataFrames and pickle them.
a, b = pd.DataFrame(oof_preds), pd.DataFrame(sub_preds)

a.to_pickle('oof_preds.pkl')   # out-of-fold predictions on the training rows
b.to_pickle('sub_preds.pkl')   # fold-averaged predictions on the test rows

In [None]:
# Reload the checkpointed level-1 predictions written by the cell above.
# NOTE: this rebinds `oof_preds` / `sub_preds` from the NumPy arrays created
# at setup to the pickled DataFrames, so array-style indexing such as
# `sub_preds[:, 0]` no longer works on these names after this cell has run.
oof_preds = pd.read_pickle('oof_preds.pkl')
sub_preds = pd.read_pickle('sub_preds.pkl')

In [None]:
# Level-2 (stacking) model: a small DART booster trained on the level-1
# out-of-fold predictions, using the same `folds` splitter as level 1.
# Produces `oof_pred` (out-of-fold stacked prediction per training row) and
# `sub_pred` (fold-averaged stacked prediction per test row).
ts = time.time()

target = train_1['deal_or_not']
meta_train = pd.DataFrame(oof_preds)   # level-1 OOF predictions as features
meta_test = pd.DataFrame(sub_preds)    # level-1 test predictions as features

oof_pred = np.zeros(train_1.shape[0])
sub_pred = np.zeros(test_1.shape[0])

# Same parameters for every fold; dart-specific options are left at library defaults.
params = {
    'nthread': 32,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 15,
    'max_depth': 4,
    'feature_fraction': .9,
    'bagging_fraction': 0.6,
    'bagging_freq': 10,
    'min_split_gain': 0.1,
    'min_data_in_leaf': 100,
    'min_child_weight': 0.01,
    'reg_alpha': 10,
    'reg_lambda': 10,
}

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (removed from lgb.train in 4.0) and LightGBM warns
# that early stopping is not available in dart mode -- confirm both.
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(meta_train, target)):
    train_x, valid_x = meta_train.iloc[train_idx], meta_train.iloc[valid_idx]
    train_y, valid_y = target.iloc[train_idx], target.iloc[valid_idx]
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
        params, dtrain, num_boost_round=8000,
        valid_sets=[dval], early_stopping_rounds=500, verbose_eval=200)

    valid_pred = bst.predict(valid_x, num_iteration=bst.best_iteration)
    oof_pred[valid_idx] = valid_pred
    sub_pred += bst.predict(meta_test, num_iteration=bst.best_iteration) / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, valid_pred)))
time.time()-ts

In [None]:
# Overall AUC of the stacked model's out-of-fold predictions on all training rows.
print(roc_auc_score(train_1['deal_or_not'], oof_pred))

In [None]:
# Submission from the stacked model: overwrite the second column of `test`
# (presumably the target placeholder -- confirm against testing-set.csv) with
# the fold-averaged level-2 predictions and write it out.
# The "參數" suffix in the file name is Chinese for "parameters".
test.iloc[:,1]=sub_pred
test.to_csv('submission1421_stacking_dart_參數.csv', index=False)

In [None]:
# 0.754755 0.760061 0.754866 0.758261 0.758392
# 0.7570996115387698
# params = {
#     'nthread': 32,
#     'boosting_type': 'dart',
#     'objective': 'binary',
#     'metric': 'auc',
#     'learning_rate': 0.05,
#     'num_leaves': 15,
#     'max_depth': 4,
#     'feature_fraction': .9,
#     'bagging_fraction':0.6,
#     'bagging_freq':10,
#     'min_split_gain': 0.1,
#     'min_data_in_leaf': 100,
#     'min_child_weight': 0.01,
#     'reg_alpha': 10,
#     'reg_lambda': 10,
# # parameters for dart
# #     'drop_rate':0.1,
# #     'skip_drop':0.5,
# #     'max_drop':100
# #     'uniform_drop':True,
# #     'xgboost_dart_mode':True,
# #     'drop_seed':4
#     }

In [None]:
# 0.754725 0.760029 0.754830 0.758150 0.758237
# 0.7570709264350741
#     'num_leaves': 15,$$
#     'max_depth': 4,$$
#     'feature_fraction': .9,$$
#     'bagging_fraction':0.9,
#     'bagging_freq':100,

In [None]:
# 0.754716 0.760015 0.754880 0.758131 0.758209
# 0.7569722112854734
# params = {
#     'nthread': 32,
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'auc',
#     'learning_rate': 0.05,
#     'num_leaves': 15,
#     'max_depth': 4,
#     'feature_fraction': .9,
#     'bagging_fraction':0.9,
#     'bagging_freq':100,
#     'min_split_gain': 0.1,
#     'min_data_in_leaf': 20,
#     'min_child_weight': 0.01,
#     'reg_alpha': 10,
#     'reg_lambda': 10

In [None]:
# 0.754618 0.759785 0.754342 0.757714 0.758126
# 0.7565652445288027
#     'feature_fraction': .9,$$
#     'bagging_fraction':0.9,
#     'bagging_freq':100,

In [None]:
# 0.754495 0.759810 0.754173 0.757745 0.758077
# 0.7556803729754726
#     'feature_fraction': .9,$$

In [None]:
# 0.754492 0.759718 0.754349 0.757744 0.758124
# 0.7545011673800177
# params = {
#     'nthread': 32,
#     'boosting_type': 'dart',
#     'objective': 'binary',
#     'metric': 'auc',
#     'learning_rate': 0.05,
#     'num_leaves': 100,
#     'max_depth': 8,
#     'feature_fraction': 0.7,
#     'min_split_gain': 0.1,
#     'min_data_in_leaf': 100,
#     'min_child_weight': 0.01,
#     'reg_alpha': 10,
#     'reg_lambda': 10 }

In [None]:
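# Five scores recorded after each level-1 training cell above (presumably the
# per-fold validation AUC, folds 1-5). One row per model, in oof_preds column order:
# 0.753232 0.757929 0.753538 0.755814 0.756462   <- col 0: dart, uniform_drop=False, xgboost_dart_mode=False
# 0.751736 0.755271 0.75038 0.753552 0.754325    <- col 1: dart, uniform_drop=False, xgboost_dart_mode=True
# 0.752523 0.757315 0.753433 0.756181 0.756143   <- col 2: dart, uniform_drop=True,  xgboost_dart_mode=False
# 0.75104 0.755004 0.750773 0.753097 0.753949    <- col 3: dart, uniform_drop=True,  xgboost_dart_mode=True
# 0.750171 0.7544 0.75046 0.753191 0.75456       <- col 4: gbdt
# 0.701442 0.70563 0.699814 0.701301 0.706263    <- col 5: rf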