In [None]:
from module.prepare import *

In [None]:
# Interactive documentation lookup for GridSearchCV. The call only prints
# help text; its result is not assigned, so no later cell depends on it.
help(GridSearchCV)

#### LGB

In [None]:
def LGBTuning(X_train,X_test,Y_train,Y_test):
    """Grid-search LightGBM classifier hyper-parameters with 5-fold CV.

    Every candidate is scored on ROC AUC and accuracy; ROC AUC decides which
    candidate is reported as best and refitted on the full training data.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, Y_test
        Not used by this function; kept so the signature matches the other
        LGB* helpers in this notebook.

    Returns
    -------
    list
        ``[grid, best_params]``: the fitted ``GridSearchCV`` object and the
        parameter combination it ranked first on ROC AUC.
    """
    clf = lgb.LGBMClassifier(objective='binary',
                             silent=False,
                             verbose=1,
                             random_state=seed,
                             n_jobs=1,
#                              class_weight
                            )

    gridParams = {
        'learning_rate': [0.01, 0.05, 0.1],
        # NOTE(review): LightGBM's 'rf' mode needs bagging settings that are
        # not supplied here -- confirm these candidates actually fit.
        'boosting_type': ['gbdt', 'goss', 'rf'],
        # Only `num_iterations` is searched. LightGBM documents `n_estimators`
        # as an alias of it, so searching both multiplied the number of fits
        # without changing the trained models.
        'num_iterations': [200, 400, 800],
#         'num_leaves': [200,500,800],
#         'max_bin':[127,255,511],
#         'colsample_bytree' : [0.65, 0.66],
#         'subsample_freq':[1,2,3],
#         'subsample' : [0.6,0.8,1],
#         'reg_alpha' : [0,0.5,1],
#         'reg_lambda' : [0,0.5,1],
    }

    print('default params\n',clf.get_params())

    grid = GridSearchCV(clf, gridParams,
                        scoring=['roc_auc','accuracy'],
                        # With several scorers, best_params_/best_score_ are
                        # only populated when `refit` names the deciding one.
                        refit='roc_auc',
                        verbose=3,
                        cv=5,
                        n_jobs=1)

    grid.fit(X_train, Y_train)

    # Print the best parameters found
    print(grid.best_params_)
    print(grid.best_score_)

    # The original returned an undefined name `params`; the best parameter
    # set is the value that matches that name.
    return [grid, grid.best_params_]

In [None]:
def LGBTuningSingle(X_train,X_test,Y_train,Y_test):
    """Fit one LightGBM classifier with fixed settings and print its scores.

    The incoming training data is re-split 70/30. The classifier is fitted on
    the 70% part and ``clf.score`` is printed for both parts.

    Parameters
    ----------
    X_train, Y_train
        Features and labels that are split into the fit and hold-out parts.
    X_test, Y_test
        Ignored: both names are overwritten by the hold-out part of the split.

    Returns
    -------
    The fitted ``lgb.LGBMClassifier``.
    """
    # Seeded so that repeated runs score the same hold-out rows.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_train, Y_train, test_size=0.3, random_state=seed)

    # NOTE(review): LightGBM documents `n_estimators` as an alias of
    # `num_iterations`, so the two entries below compete -- confirm which
    # tree count is intended.
    params = {'boosting_type': 'gbdt',
              'max_depth' : -1,
              'objective': 'binary',
              'n_jobs': 3, # Updated from nthread
              'num_leaves': 500,
              'learning_rate': 0.05,
              'max_bin': 512,
              'num_iterations':1000,
              'n_estimators':500,
              'metric' : 'binary_error'}

    # Every key in `params` is a constructor keyword, so the dict is passed
    # straight through instead of being copied entry by entry.
    clf = lgb.LGBMClassifier(**params,
                             silent=False,
                             verbose=4,
#                              eval_set=(X_test,Y_test)
                            )

    print('default params\n',clf.get_params())

    # Verbosity is already set on the constructor. The fit-level `verbose`
    # keyword is not accepted by LightGBM 4.x, so it is not passed here.
    clf.fit(X_train, Y_train)
    print('train',clf.score(X_train,Y_train))
    print('test',clf.score(X_test,Y_test))
    return clf

In [None]:
def LGBFit(X_train,X_test,Y_train,Y_test):
    """Run 5-fold LightGBM cross-validation on the training data and print a summary.

    Parameters
    ----------
    X_train, Y_train
        Features and labels wrapped in the ``lgb.Dataset`` that is cross-validated.
    X_test, Y_test
        Not used by this function; kept so the signature matches the other
        LGB* helpers in this notebook.

    Returns
    -------
    list
        ``[lgb, cv_results]``: the ``lgb`` module object itself (no model is
        fitted here) and the dict returned by ``lgb.cv``.
    """
    lgb_train = lgb.Dataset(X_train, Y_train)

    # NOTE(review): LightGBM documents `n_estimators` as an alias of
    # `num_iterations`, so the two entries below compete -- confirm which
    # round count is intended.
    params = {'boosting_type': 'gbdt',
              'max_depth' : -1,
              'objective': 'binary',
              'n_jobs': 3, # Updated from nthread
              'num_leaves': 500,
              'learning_rate': 0.06,
              'max_bin': 512,
              'num_iterations':1000,
              'n_estimators':500,
              'metric' : 'binary_error'}

    cv_results = lgb.cv(params, lgb_train, nfold=5,
                        stratified=False, shuffle=True,
                        seed=seed,
                        metrics=['auc','binary_logloss','mae'],
                        verbose_eval=1)
    print('best n_estimators:', len(cv_results['auc-mean']))
    for name, history in cv_results.items():
        series = pd.Series(history)
        # Only the AUC mean improves upwards. The loss/error means and the
        # per-fold spread (stdv) series improve downwards, so their best
        # value is the minimum, not the maximum.
        best = series.max() if name.endswith('auc-mean') else series.min()
        print('best cv score:', name, best)
    return [lgb,cv_results]

#### main

In [None]:
# Read the data, then convert it with ToMatrix in 'dense' mode; X and Y feed
# the split in the next cell. ReadData/ToMatrix are not defined in this
# notebook (presumably they come from the module.prepare star import).
data, T = ReadData()
X, Y = ToMatrix(data, 'dense')

In [None]:
# Split X/Y, then pass all four parts through
# RandomForestDimensionalityReduction, which rebinds the same four names.
X_train, X_test, Y_train, Y_test = SplitDataset(X, Y)
X_train, X_test, Y_train, Y_test = RandomForestDimensionalityReduction(
    X_train, X_test, Y_train, Y_test)

In [None]:
# Sparse variant of the preparation above. Running this cell rebinds X, Y and
# the four split variables assigned by the previous cells, so whichever
# preparation cell ran last determines what the later cells train on.
arr = ToMatrix(data, 'sparse')
# ENTROPY_IM = 1e-1
X, Y = MutualInformationFeatureSelection(arr, data)
X_train, X_test, Y_train, Y_test = SplitDataset(X, Y)
X_train, X_test, Y_train, Y_test = RandomForestDimensionalityReduction(
    X_train, X_test, Y_train, Y_test)

#### test

In [None]:
# DecisionTreePrefit(X_train,X_test,Y_train,Y_test)
# LGBFit returns [lgb, cv_results]. Only the CV results are taken here, so the
# global `lgb` module alias is never rebound from a function return value.
cv_results = LGBFit(X_train,X_test,Y_train,Y_test)[1]
WriteResult(DATASET,cv_results,conf,commons)

#### tuning

In [None]:
# clf1 is the fitted classifier returned by LGBTuningSingle. The X_test/Y_test
# passed here are replaced inside the function by a hold-out split of X_train.
clf1 = LGBTuningSingle(X_train,X_test,Y_train,Y_test)

In [None]:
# LGBTuning returns a two-item list whose first element is the GridSearchCV
# object, so clf2 is that list rather than a classifier.
clf2 = LGBTuning(X_train,X_test,Y_train,Y_test)

In [8]:
# LGBFit returns [lgb, cv_results], so clf3 is a two-item list, not a fitted model.
# NOTE(review): the saved output under this cell (two sparse matrices) does not
# match that return value -- it looks stale; re-run the cell to refresh it.
clf3 = LGBFit(X_train,X_test,Y_train,Y_test)

[<16659x2547 sparse matrix of type '<class 'numpy.int32'>'
 	with 6854953 stored elements in Compressed Sparse Row format>,
 <4165x2547 sparse matrix of type '<class 'numpy.int32'>'
 	with 1719975 stored elements in Compressed Sparse Row format>]