In [None]:
from module.prepare import *

#### LGB

In [80]:
def LGBTuning(X_train,X_test,Y_train,Y_test):
    
    params = {'boosting_type': 'dart',
              'max_depth' : -1,
              'objective': 'binary',
              'n_jobs': 3, # Updated from nthread
              'num_leaves': 400,
              'learning_rate': 0.1,
              'max_bin': 512,
              'subsample_for_bin': 200,
              'subsample': 0.8,
              'subsample_freq': 1,
              'colsample_bytree': 0.8,
              'reg_alpha': 0,
              'reg_lambda': 0,
              'min_split_gain': 0.5,
              'min_child_weight': 1,
              'min_child_samples': 5,
              'scale_pos_weight': 1,
              'num_boost_round':3000,
              'num_iterations':1000,
              'metric' : 'binary_error'}
    
    clf = lgb.LGBMClassifier(boosting_type=params['boosting_type'],
                             max_depth=params['max_depth'],
                             objective=params['objective'],
                             n_jobs=params['n_jobs'],
                             num_leaves=params['num_leaves'],
                             learning_rate=params['learning_rate'],
                             max_bin=params['max_bin'],
                             subsample_for_bin=params['subsample_for_bin'],
                             subsample=params['subsample'],
                             subsample_freq=params['subsample_freq'],
                             colsample_bytree=params['colsample_bytree'],
                             reg_alpha=params['reg_alpha'],
                             reg_lambda=params['reg_lambda'],
                             min_split_gain=params['min_split_gain'],
                             min_child_weight=params['min_child_weight'],
                             min_child_samples=params['min_child_samples'],
                             scale_pos_weight=params['scale_pos_weight'],
                             metric=params['metric'],
                             num_boost_round=params['num_boost_round'],
                             num_iterations=params['num_iterations'],
                             silent=False,
                             verbose=1
                            )
    
    gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [seed], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

    print('default params\n',clf.get_params())

    grid = GridSearchCV(clf, gridParams,
                    verbose=3,
                    cv=5,
                    n_jobs=1)

    grid.fit(X_train, Y_train)

    # Print the best parameters found
    print(grid.best_params_)
    print(grid.best_score_)
    
    return params

In [91]:
def LGBTuningSingle(X_train,X_test,Y_train,Y_test):
    
    [X_train, X_test, Y_train, Y_test] = train_test_split(X_train,Y_train,test_size=0.3)
    traindata = lgb.Dataset(X_train,Y_train)
    testdata = lgb.Dataset(X_test,Y_test)
    
    params = {'boosting_type': 'gbdt',
              'max_depth' : -1,
              'objective': 'binary',
              'n_jobs': 3, # Updated from nthread
              'num_leaves': 400,
              'learning_rate': 0.1,
              'max_bin': 512,
              'subsample_for_bin': 200,
              'subsample': 0.8,
              'subsample_freq': 1,
              'colsample_bytree': 0.8,
              'reg_alpha': 0,
              'reg_lambda': 0,
              'min_split_gain': 0.5,
              'min_child_weight': 1,
              'min_child_samples': 5,
              'scale_pos_weight': 1,
              'num_boost_round':3000,
              'num_iterations':1000,
              'n_estimators':500,
              'metric' : 'binary_error'}
    
    clf = lgb.LGBMClassifier(boosting_type=params['boosting_type'],
                             max_depth=params['max_depth'],
                             objective=params['objective'],
                             n_jobs=params['n_jobs'],
                             num_leaves=params['num_leaves'],
                             learning_rate=params['learning_rate'],
                             max_bin=params['max_bin'],
                             subsample_for_bin=params['subsample_for_bin'],
                             subsample=params['subsample'],
                             subsample_freq=params['subsample_freq'],
                             colsample_bytree=params['colsample_bytree'],
                             reg_alpha=params['reg_alpha'],
                             reg_lambda=params['reg_lambda'],
                             min_split_gain=params['min_split_gain'],
                             min_child_weight=params['min_child_weight'],
                             min_child_samples=params['min_child_samples'],
                             scale_pos_weight=params['scale_pos_weight'],
                             metric=params['metric'],
                             num_boost_round=params['num_boost_round'],
                             num_iterations=params['num_iterations'],
                             n_estimators=params['n_estimators'],
                             silent=False,
                             verbose=4,
#                              eval_set=(X_test,Y_test)
                            )
    
    gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [seed], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

    print('default params\n',clf.get_params())

    clf.fit(X_train,Y_train,verbose=4)
    print('train',clf.score(X_train,Y_train))
    print('test',clf.score(X_test,Y_test))
    return clf

In [82]:
def LGBFit(X_train,X_test,Y_train,Y_test):
    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_test = lgb.Dataset(X_test, Y_test, reference=lgb_train)

    params = {    
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'acc',
                'nthread':6,
#                 'learning_rate':0.08,
                'num_leaves':300, 
                'max_depth': -1,   
                'subsample': 0.9, 
                'colsample_bytree': 0.9, 
                'feature_fraction': 1,
                'bagging_freq': 8,
#                 'num_iterations':300,
                'min_data_in_leaf':2,
                'bagging_fraction': 0.8,
                'num_boost_round':3000,
        
            }

    cv_results = lgb.cv(params, lgb_train, nfold=5  
                        ,stratified=False, shuffle=True
                        ,seed=seed,
                        metrics=['auc','binary_logloss','mae']
                        ,verbose_eval=1)
    print('best n_estimators:', len(cv_results['auc-mean']))
    for k,v in cv_results.items():
        print('best cv score:', k, pd.Series(cv_results[k]).max())
    return [lgb,cv_results]

# lgbclf = lgb.LGBMClassifier(learning_rate=0.045,
#                            max_depth=-1,
#                            objective='binary',
#                             num_leaves=1000,
#                             min_child_samples=10,
#                             n_estimators=1000,
#                             subsample=0.9,
#                             random_state=42
#                            )
# lgbclf.fit(X_train,Y_train)
# print('train',lgbclf.score(X_train,Y_train))
# print('test',lgbclf.score(X_test,Y_test))

# from sklearn.metrics import mean_squared_error
# Y_pred = lgb.predict(X_test, num_iteration=gbm.best_iteration)

#### main

In [None]:
[data,T] = ReadData()
[X,Y] = ToMatrix(data,'dense')

In [None]:
[X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y)
[X_train,X_test,Y_train,Y_test] = \
    RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test)

#### test

In [None]:
# DecisionTreePrefit(X_train,X_test,Y_train,Y_test)
[lgb,cv_results] = LGBFit(X_train,X_test,Y_train,Y_test)
WriteResult(DATASET,cv_results,conf,commons)

#### tuning

In [92]:
clf = LGBTuningSingle(X_train,X_test,Y_train,Y_test)

default params
 {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 5, 'min_child_weight': 1, 'min_split_gain': 0.5, 'n_estimators': 500, 'n_jobs': 3, 'num_leaves': 400, 'objective': 'binary', 'random_state': None, 'reg_alpha': 0, 'reg_lambda': 0, 'silent': False, 'subsample': 0.8, 'subsample_for_bin': 200, 'subsample_freq': 1, 'max_bin': 512, 'scale_pos_weight': 1, 'metric': 'binary_error', 'num_boost_round': 3000, 'num_iterations': 1000, 'verbose': 4}


  if diff:


train 0.9946487376509331
test 0.9481434058898848


  if diff:


1