In [None]:
from module.prepare import *

In [None]:
help(GridSearchCV)

#### LGB

In [None]:
def LGBTuning(X_train,X_test,Y_train,Y_test):
    
    clf = lgb.LGBMClassifier(objective='binary',
                             silent=False,
                             verbose=1,
                             random_state=seed,
                             n_jobs=3,
#                              class_weight
                            )
    
    gridParams = {
        # step 1
#     'learning_rate': [0.01,0.05,0.1],
#     'boosting_type':['gbdt','goss'],
#     'n_estimators': [50,200,500],
        # step 1 fixed
    'learning_rate': [0.05,0.01],
    'boosting_type':['gbdt'], ### goss>gbdt
    'n_estimators': [500],
        # step 2
#     'num_leaves': [200,500,800],
#     'max_bin':[127,255,511],
        # step 2 fixed
    'num_leaves': [800],
    'max_bin':[256],
        # step 3
    'num_iterations':[200,400,1000],
    'colsample_bytree' : [0.6,0.8,0.9],
    'subsample_freq':[1,3],
    'subsample' : [0.8,1],
#     'reg_alpha' : [0,0.5],
#     'reg_lambda' : [0,0.5],
    }

    print('default params\n',clf.get_params())

    grid = GridSearchCV(clf, gridParams,
                    scoring='roc_auc',
#                     refit=False,
                    verbose=3,
                    cv=5,
                    n_jobs=1)

    grid.fit(X_train, Y_train)

    # Print the best parameters found
#     print(grid.best_params_)
#     print(grid.best_score_)
    
    return grid

In [None]:
def LGBTuningSingle(X_train,X_test,Y_train,Y_test):
    
    [X_train, X_test, Y_train, Y_test] = train_test_split(X_train,Y_train,test_size=0.3)
    traindata = lgb.Dataset(X_train,Y_train)
    testdata = lgb.Dataset(X_test,Y_test)
    
    params = {'boosting_type': 'gbdt',
              'max_depth' : -1,
              'objective': 'binary',
              'n_jobs': 3, # Updated from nthread
              'num_leaves': 400,
              'learning_rate': 0.1,
              'max_bin': 512,
              'subsample_for_bin': 200,
              'subsample': 0.8,
              'subsample_freq': 1,
              'colsample_bytree': 0.8,
              'reg_alpha': 0,
              'reg_lambda': 0,
              'min_split_gain': 0.5,
              'min_child_weight': 1,
              'min_child_samples': 5,
              'scale_pos_weight': 1,
              'num_boost_round':3000,
              'num_iterations':1000,
              'n_estimators':500,
              'metric' : 'binary_error'}
    
    clf = lgb.LGBMClassifier(boosting_type=params['boosting_type'],
                             max_depth=params['max_depth'],
                             objective=params['objective'],
                             n_jobs=params['n_jobs'],
                             num_leaves=params['num_leaves'],
                             learning_rate=params['learning_rate'],
                             max_bin=params['max_bin'],
                             subsample_for_bin=params['subsample_for_bin'],
                             subsample=params['subsample'],
                             subsample_freq=params['subsample_freq'],
                             colsample_bytree=params['colsample_bytree'],
                             reg_alpha=params['reg_alpha'],
                             reg_lambda=params['reg_lambda'],
                             min_split_gain=params['min_split_gain'],
                             min_child_weight=params['min_child_weight'],
                             min_child_samples=params['min_child_samples'],
                             scale_pos_weight=params['scale_pos_weight'],
                             metric=params['metric'],
                             num_boost_round=params['num_boost_round'],
                             num_iterations=params['num_iterations'],
                             n_estimators=params['n_estimators'],
                             silent=False,
                             verbose=4,
#                              eval_set=(X_test,Y_test)
                            )
    
    gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [seed], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

    print('default params\n',clf.get_params())

    clf.fit(X_train,Y_train,verbose=4)
    print('train',clf.score(X_train,Y_train))
    print('test',clf.score(X_test,Y_test))
    return clf

In [None]:
def LGBFit(X_train,X_test,Y_train,Y_test):
    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_test = lgb.Dataset(X_test, Y_test, reference=lgb_train)

    params = {    
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'acc',
#                 'nthread':6,
                'learning_rate':0.08,
                'num_leaves':300, 
                'max_depth': -1,   
                'subsample': 0.9, 
                'colsample_bytree': 0.9, 
                'feature_fraction': 1,
#                 'bagging_freq': 8,
# #                 'num_iterations':300,
#                 'min_data_in_leaf':2,
#                 'bagging_fraction': 0.8,
#                 'num_boost_round':3000,
            }

    cv_results = lgb.cv(params, lgb_train, nfold=5  
                        ,stratified=False, shuffle=True
                        ,seed=seed,
                        metrics=['auc','binary_logloss','mae']
                        ,verbose_eval=1)
    print('best n_estimators:', len(cv_results['auc-mean']))
    for k,v in cv_results.items():
        print('best cv score:', k, pd.Series(cv_results[k]).max())
    return [lgb,cv_results]

#### main

In [None]:
[data,T] = ReadData()
[X,Y] = ToMatrix(data,'dense')

In [None]:
[X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y)
[X_train,X_test,Y_train,Y_test] = \
    RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test)

In [None]:
arr = ToMatrix(data,'sparse')
# ENTROPY_IM = 1e-1
[X,Y] = MutualInformationFeatureSelection(arr,data)
[X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y)
[X_train,X_test,Y_train,Y_test] = \
    RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test)

#### test

In [None]:
# DecisionTreePrefit(X_train,X_test,Y_train,Y_test)
[lgb,cv_results] = LGBFit(X_train,X_test,Y_train,Y_test)
WriteResult(DATASET,cv_results,conf,commons)

#### tuning

In [None]:
clf1 = LGBTuningSingle(X_train,X_test,Y_train,Y_test)

In [59]:
clf2 = LGBTuning(X_train,X_test,Y_train,Y_test)

[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1, score=0.978, total=  58.8s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1, score=0.981, total=  53.6s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1, score=0.982, total=  52.9s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1, score=0.979, total=  53.0s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=1, score=0.980, total=  53.7s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3, score=0.978, total=  52.5s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3, score=0.981, total=  53.2s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3, score=0.982, total=  54.4s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3, score=0.979, total=  53.3s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=200, num_leaves=800, subsample=1, subsample_freq=3, score=0.980, total=  53.2s
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.980, total= 1.3min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.982, total= 1.3min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.983, total= 1.7min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.980, total= 1.7min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.982, total= 1.7min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3, score=0.980, total= 1.6min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3, score=0.983, total= 1.6min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3, score=0.983, total= 1.6min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3, score=0.980, total= 1.6min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=0.8, subsample_freq=3, score=0.982, total= 1.6min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1, score=0.980, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1, score=0.983, total= 2.0min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1, score=0.983, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1, score=0.980, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=1, score=0.982, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3, score=0.980, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3, score=0.983, total= 2.0min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3, score=0.983, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3, score=0.980, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=400, num_leaves=800, subsample=1, subsample_freq=3, score=0.982, total= 2.1min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.983, total= 4.0min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.985, total= 3.9min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.985, total= 4.0min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1 




[CV]  boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1, score=0.982, total= 3.9min
[CV] boosting_type=gbdt, colsample_bytree=0.6, learning_rate=0.01, max_bin=256, n_estimators=500, num_iterations=1000, num_leaves=800, subsample=0.8, subsample_freq=1 




KeyboardInterrupt: 

In [None]:
# rts = clf2.cv_results_['rank_test_score']
# flogging(str(clf2.cv_results_))



In [None]:
a = np.vstack([np.array(params),np.array(rank),np.array(score)])

a = a.T

a = np.array( list(sorted(a,key=lambda x:x[2],reverse=True)) )

a