In [52]:
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# MLflow tracking configuration: every run logged by this notebook goes to the
# Databricks tracking server, under the experiment named below.
MLFLOW_TRACKING_URI = "databricks"
MLFLOW_EXPERIMENT_PATH = "/Users/firefly.eugene@gmail.com/twitter-bot-detection"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_PATH)

In [53]:
# Pickled feature table used as model input (presumably written by an earlier
# feature-engineering step -- confirm). Only unpickle files from a trusted
# source: loading a pickle can execute arbitrary code.
USERS_FEATURES_PATH = "data/04_features/users.pkl"
users = pd.read_pickle(USERS_FEATURES_PATH)

In [70]:
def run_xgboost(users, cat_features=None):
    """Train a DART-boosted XGBoost classifier and log the run to MLflow.

    The frame is one-hot encoded on ``cat_features``, split 80/20 with a fixed
    seed, and a binary-logistic booster is trained for 50 rounds. Parameters,
    the weighted F1 score and the input pickle are logged to a child run nested
    under the existing MLflow run named ``xgboost``. The trained booster is
    written to ``data/06_models/xgboost.model`` and a classification report is
    printed.

    Parameters
    ----------
    users : pandas.DataFrame
        Feature table containing a ``label`` column (the target); every other
        column is used as a model input.
    cat_features : list of str, optional
        Names of the columns to one-hot encode. ``None`` (the default) or an
        empty list means no column is encoded.

    Returns
    -------
    float
        Weighted F1 score on the 20% hold-out split, with predicted
        probabilities rounded to class labels.

    Raises
    ------
    KeyError
        If no active run named ``xgboost`` exists in the target experiment.
    """
    # Avoid a shared mutable default. ``None`` must be mapped to an explicit
    # empty list because ``pd.get_dummies(columns=None)`` would encode every
    # object/category column rather than none of them.
    if cat_features is None:
        cat_features = []

    users = pd.get_dummies(users, columns=cat_features)
    X = users.drop(columns=["label"])
    y = users["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Look up the parent run that groups all xgboost experiments.
    # run_view_type=1 is mlflow.entities.ViewType.ACTIVE_ONLY.
    parent_runs = mlflow.search_runs(
        experiment_ids=["3889491181315524"],
        filter_string="tags.`mlflow.runName`='xgboost'",
        run_view_type=1,
    )
    if parent_runs.empty:
        raise KeyError(
            "No active MLflow run named 'xgboost' found in experiment 3889491181315524"
        )
    run_id = parent_runs["run_id"].iloc[0]

    # Close any run left active (e.g. by an interrupted earlier call) so the
    # parent run can be resumed as the top-level run.
    mlflow.end_run()

    # Both runs are context-managed so they are always closed, even when
    # training or logging raises; previously the parent run stayed active.
    with mlflow.start_run(run_id=run_id, nested=False):
        with mlflow.start_run(nested=True):
            mlflow.set_tags({
                "lib": "xgboost",
                "description": "listed_count_cat added",
                # Model inputs only (the target column is excluded), joined
                # into a plain string so the tag is not a NumPy array repr.
                "features": ", ".join(map(str, X.columns)),
            })

            params = {
                'booster': 'dart',
                'max_depth': 5,
                'learning_rate': 0.02,
                'objective': 'binary:logistic',
                'sample_type': 'uniform',
                'normalize_type': 'tree',
                'tree_method': 'gpu_hist',  # requires a GPU-enabled XGBoost build
                'rate_drop': 0.1,
                'skip_drop': 0.5,
                'eval_metric': 'logloss',
                'nthread': 8,
            }

            xgb_train = xgb.DMatrix(X_train, label=y_train)
            xgb_test = xgb.DMatrix(X_test, label=y_test, feature_names=X.columns)
            num_round = 50
            model = xgb.train(params, xgb_train, num_round)

            # binary:logistic yields probabilities; round once to get the
            # class labels and reuse them for the metric and the report.
            y_pred = model.predict(xgb_test)
            y_pred_labels = np.round(y_pred)

            f1 = f1_score(y_test, y_pred_labels, average="weighted")

            mlflow.log_param("cat_features", cat_features)
            mlflow.log_params(params)
            mlflow.log_metric("f1", f1, 1)
            mlflow.log_artifact('data/04_features/users.pkl')

            model.save_model('data/06_models/xgboost.model')
            print(classification_report(y_test, y_pred_labels))
    return f1

In [71]:
%%time
run_xgboost(users, cat_features=["created_at_time", "listed_count_cat"])

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      4948
           1       0.88      0.67      0.76      2368

    accuracy                           0.86      7316
   macro avg       0.87      0.81      0.83      7316
weighted avg       0.87      0.86      0.86      7316

CPU times: user 3.38 s, sys: 180 ms, total: 3.56 s
Wall time: 21.8 s


0.859392324633081