In [3]:
import joblib
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from kedro.io import PickleLocalDataSet
from sklearn.preprocessing import StandardScaler
from eli5 import show_weights

from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

In [4]:
# Load the pre-split model inputs written by the earlier pipeline stage.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "data/05_model_input/X_train.pkl",
        "data/05_model_input/X_test.pkl",
        "data/05_model_input/y_train.pkl",
        "data/05_model_input/y_test.pkl",
    )
)

In [5]:
def _log_xgboost_run(params, features, f1):
    """Log one training run to MLflow as a nested child of the existing 'xgboost' run.

    Args:
        params: The XGBoost parameter dict that was used for training.
        features: Iterable of feature names used by the model.
        f1: Weighted F1 score measured on the test split.

    Raises:
        LookupError: If no active run named 'xgboost' exists in the experiment.
    """
    mlflow.set_tracking_uri("databricks")
    mlflow.set_experiment("/Users/firefly.eugene@gmail.com/twitter-bot-detection")

    parent_runs = mlflow.search_runs(
        experiment_ids=["3889491181315524"],
        filter_string="tags.`mlflow.runName`='xgboost'",
        run_view_type=1,  # 1 == ViewType.ACTIVE_ONLY
    )
    if parent_runs.empty:
        # Fail with an actionable message instead of an opaque pandas lookup error.
        raise LookupError(
            "No active MLflow run named 'xgboost' found in experiment 3889491181315524"
        )
    parent_run_id = parent_runs["run_id"].iloc[0]

    # Both runs are opened as context managers so they are always closed, even
    # when logging raises; a parent run left active would make the next
    # start_run(run_id=...) call fail.
    with mlflow.start_run(run_id=parent_run_id, nested=False):
        with mlflow.start_run(nested=True):
            mlflow.set_tags({
                "lib": "xgboost",
                "description": "",
                # Store the names themselves: str() of a pandas Index is its
                # repr, which elides entries with "..." when the index is long.
                "features": ", ".join(map(str, features)),
            })

            mlflow.log_params(params)
            mlflow.log_metric("f1", f1, step=1)
            mlflow.log_artifact('data/05_model_input/X_test.pkl')


def run_xgboost(X_train: PickleLocalDataSet, X_test: PickleLocalDataSet, y_train: PickleLocalDataSet, y_test: PickleLocalDataSet, log=False) -> PickleLocalDataSet:
    """Train a DART-boosted XGBoost binary classifier and evaluate it on the test split.

    Side effects: pickles the trained model to ``data/06_models/xgboost.pkl``,
    prints a classification report and, when ``log`` is true, records the
    parameters, weighted F1 score and test-set artifact in MLflow.

    NOTE(review): the ``PickleLocalDataSet`` annotations are kept for interface
    stability, but this notebook passes objects returned by ``pd.read_pickle``
    and the function returns the booster produced by ``xgb.train``.

    Args:
        X_train: Training features; must expose ``.columns`` (feature names).
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.
        log: When true, log the run to the Databricks-hosted MLflow experiment.

    Returns:
        The trained model returned by ``xgb.train``.

    Raises:
        LookupError: If ``log`` is true and the parent MLflow run cannot be found.
    """
    params = {
        'booster': 'dart',
        'max_depth': 20,
        'learning_rate': 0.02,
        'objective': 'binary:logistic',
        'sample_type': 'uniform',
        'normalize_type': 'tree',
        'tree_method': 'gpu_hist',
        'rate_drop': 0.1,
        'skip_drop': 0.5,
        'eval_metric': 'logloss',
        'nthread': 8,
    }
    features = X_train.columns

    xgb_train = xgb.DMatrix(X_train, label=y_train, feature_names=features)
    xgb_test = xgb.DMatrix(X_test, label=y_test, feature_names=features)
    num_round = 50
    model = xgb.train(params, xgb_train, num_round)

    # NOTE(review): older xgboost releases are documented to apply dropout in
    # predict() for DART boosters unless a tree limit is given -- confirm the
    # installed version evaluates all trees here.
    y_pred = model.predict(xgb_test)
    # 'binary:logistic' yields probabilities; rounding turns them into 0/1 labels.
    y_pred_labels = np.round(y_pred)

    f1 = f1_score(y_test, y_pred_labels, average="weighted")

    joblib.dump(model, 'data/06_models/xgboost.pkl')

    print(classification_report(y_test, y_pred_labels, digits=5))
    if log:
        _log_xgboost_run(params, features, f1)
    return model

In [6]:
%%time
m = run_xgboost(X_train, X_test, y_train, y_test, log=True);

              precision    recall  f1-score   support

           0    0.92148   0.96227   0.94143      4903
           1    0.91556   0.83306   0.87236      2408

    accuracy                        0.91971      7311
   macro avg    0.91852   0.89766   0.90690      7311
weighted avg    0.91953   0.91971   0.91868      7311



  from collections import (
  class ResultIterable(collections.Iterable):


CPU times: user 11.4 s, sys: 161 ms, total: 11.6 s
Wall time: 17.1 s


In [7]:
# Render eli5's feature-weight table for the trained model, limited to the top 50 features.
show_weights(m, top=50)

Weight,Feature
0.38,is_retweet_mean
0.1154,replies_mean
0.0463,hashtags_std
0.0428,account_active_for_days
0.0348,statuses_count
0.028,verified
0.0243,description_urls_count
0.023,tweets_to_faw_ratio
0.0229,mentions_std
0.0215,unique_sources
