In [16]:
from lightgbm import LGBMClassifier
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, f1_score
from joblib import load
import matplotlib.pyplot as plt

## Загружаем энкодер класса

In [18]:
class_enc = load('processing/binaries/class_encoder.joblib')

In [4]:
df = pd.read_csv('boost_ready.csv')

## Балансируем классы, чтобы модель не переобучалась под 'others'

In [7]:
df1 = pd.concat([df[df['class'] != 3], df[df['class'] == 3].sample(30000)])

In [12]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.metrics import make_scorer

## Подбираем параметры классификатора

In [13]:
def gini_lgb(truth, predictions):
    score = gini(truth, predictions) / gini(truth, truth)
    return 'gini', score, True


def objective(params):
    params = {
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.01,
        **params
    )
    
    score = cross_val_score(clf, 
                            df.drop(columns=['class']), 
                            df['class'], 
                            scoring='f1_macro', 
                            cv=KFold(n_splits=5)).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

space = {
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

Gini 0.922 params {'num_leaves': 108, 'colsample_bytree': '0.572'}                              
Gini 0.921 params {'num_leaves': 96, 'colsample_bytree': '0.587'}                               
Gini 0.908 params {'num_leaves': 20, 'colsample_bytree': '0.378'}                               
Gini 0.923 params {'num_leaves': 94, 'colsample_bytree': '0.783'}                               
Gini 0.913 params {'num_leaves': 22, 'colsample_bytree': '0.880'}                               
Gini 0.915 params {'num_leaves': 120, 'colsample_bytree': '0.329'}                              
Gini 0.922 params {'num_leaves': 100, 'colsample_bytree': '0.663'}                              
Gini 0.915 params {'num_leaves': 28, 'colsample_bytree': '0.917'}                               
Gini 0.922 params {'num_leaves': 104, 'colsample_bytree': '0.926'}                              
Gini 0.923 params {'num_leaves': 98, 'colsample_bytree': '0.844'}                               
100%|████████████████████████|

In [14]:
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'colsample_bytree': 0.37846198822882576, 'num_leaves': 20.0}


In [16]:
lgbt = LGBMClassifier(num_leaves=20, colsample_bytree=0.378)

In [17]:
lgbt.fit(df1.drop(columns=['class']), df1['class'])

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.378,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=20, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Сохраняем модель

In [13]:
import pickle
# now you can save it to a file
with open('LGBMClassifier.pkl', 'wb') as f:
    pickle.dump(lgbt, f)

In [20]:
with open('processing/binaries/LGBMClassifier.pkl', 'rb') as f:
    
    lgbt = pickle.load(f)