In [54]:
import joblib
import mlflow
import pandas as pd
import numpy as np
import lightgbm as lgb
from kedro.io import PickleLocalDataSet
from catboost import CatBoostClassifier, cv, Pool
from twitter_bot_detection.helpers import log_running_time
from eli5 import explain_prediction, explain_weights

from sklearn.metrics import accuracy_score, classification_report, f1_score

In [55]:
# Load the pre-split model inputs (features and labels for train/test) from
# the pickles stored under data/05_model_input, in one unpacking step.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    (
        "data/05_model_input/X_train.pkl",
        "data/05_model_input/X_test.pkl",
        "data/05_model_input/y_train.pkl",
        "data/05_model_input/y_test.pkl",
    ),
)

In [56]:
def _log_lightgbm_run(features, params, num_round, early_stopping_rounds, f1):
    """Log one training run to MLflow as a child of the parent 'lightgbm' run.

    Args:
        features: Iterable of feature names used for training.
        params: LightGBM parameter dict passed to ``lgb.train``.
        num_round: Maximum number of boosting rounds requested.
        early_stopping_rounds: Early-stopping patience used for training.
        f1: Weighted F1 score measured on the test split.
    """
    mlflow.set_tracking_uri("databricks")
    mlflow.set_experiment("/Users/firefly.eugene@gmail.com/twitter-bot-detection")

    # run_view_type=1 is MLflow's ViewType.ACTIVE_ONLY (skip deleted runs).
    parent_runs = mlflow.search_runs(
        experiment_ids=["3889491181315524"],
        filter_string="tags.`mlflow.runName`='lightgbm'",
        run_view_type=1,
    )
    if len(parent_runs) > 0:
        # Resume the existing parent run; .iloc avoids relying on index labels.
        parent_run = mlflow.start_run(run_id=parent_runs["run_id"].iloc[0], nested=False)
    else:
        # No parent run yet: create it instead of failing on an empty lookup.
        parent_run = mlflow.start_run(run_name='lightgbm', nested=False)

    # Both runs are context-managed so they are always closed; a leaked active
    # run would make the next start_run(nested=False) call fail.
    with parent_run:
        with mlflow.start_run(nested=True):
            mlflow.set_tags({
                "lib": "lightgbm",
                # A plain comma-separated string; str() of a numpy array wraps
                # lines and may abbreviate long arrays.
                "features": ", ".join(str(name) for name in features),
            })

            mlflow.log_params(params)
            mlflow.log_param("num_round", num_round)
            mlflow.log_param("early_stopping_rounds", early_stopping_rounds)
            mlflow.log_metric("f1", f1, step=1)
            mlflow.log_artifact('data/05_model_input/X_test.pkl')


# @log_running_time
def train_lightgbm(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train, y_test, log: bool = False) -> "lgb.Booster":
    """Train a binary LightGBM model, save it, and optionally log the run to MLflow.

    The test split doubles as the early-stopping validation set. After training
    the model is dumped to ``data/06_models/lightgbm.pkl`` and a classification
    report for the test split is printed.

    Args:
        X_train: Training features; its column names are logged as the feature list.
        X_test: Test features, also used as the validation set.
        y_train: Training labels (passed to ``lgb.Dataset`` as ``label``).
        y_test: Test labels.
        log: When True, log tags, params, the weighted F1 score and the X_test
            pickle to the Databricks-hosted MLflow experiment.

    Returns:
        The trained ``lgb.Booster``.
    """
    features = X_train.columns.values
    train_data = lgb.Dataset(X_train, label=y_train)
    validation_data = train_data.create_valid(X_test, label=y_test)

    params = {
        'num_leaves': 31,
        'objective': 'binary',
        'metric': 'binary_logloss',
    }
    num_round = 2000
    early_stopping_rounds = 30

    # NOTE(review): early_stopping_rounds / verbose_eval keyword arguments belong
    # to the pre-4.0 LightGBM API; newer releases expect callbacks instead.
    model = lgb.train(
        params,
        train_data,
        num_round,
        valid_sets=[validation_data],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=200,
    )

    # Round the predicted scores to hard 0/1 labels for the sklearn metrics.
    y_pred = np.round(model.predict(X_test))
    f1 = f1_score(y_test, y_pred, average="weighted")

    joblib.dump(model, 'data/06_models/lightgbm.pkl')

    print(classification_report(y_test, y_pred, digits=5))

    if log:
        _log_lightgbm_run(features, params, num_round, early_stopping_rounds, f1)

    return model

In [57]:
# Train on the loaded splits with MLflow logging enabled (log=True) and keep
# the returned model in `m`; the trailing ';' suppresses the cell's echo.
m = train_lightgbm(X_train, X_test, y_train, y_test, log=True);

Training until validation scores don't improve for 30 rounds
[200]	valid_0's binary_logloss: 0.179344
Early stopping, best iteration is:
[250]	valid_0's binary_logloss: 0.178987
              precision    recall  f1-score   support

           0    0.92804   0.97063   0.94886      4903
           1    0.93404   0.84676   0.88826      2408

    accuracy                        0.92983      7311
   macro avg    0.93104   0.90870   0.91856      7311
weighted avg    0.93002   0.92983   0.92890      7311



In [53]:
# Display eli5's explanation for the trained model `m`.
# NOTE(review): explain_weights is the model-level explanation, yet a single
# test row (X_test.iloc[0]) is passed as its second positional argument;
# explain_prediction (also imported) looks like the intended call for a
# per-row explanation — confirm which one was meant.
explain_weights(m, X_test.iloc[0])

In [45]:
# Follow-up idea (presumably a TODO): interpret the model with eli5 and
# Shapley values. For now the cell only echoes the trained model object.
m

<lightgbm.basic.Booster at 0x7f294690d990>