In [43]:
import mlflow
import pandas as pd
import catboost as ctb
from sklearn.metrics import accuracy_score, classification_report, f1_score
from catboost import CatBoostClassifier, cv, Pool
from sklearn.model_selection import train_test_split, StratifiedKFold
from eli5 import show_weights, explain_prediction, explain_weights_catboost
import shap

In [6]:
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/firefly.eugene@gmail.com/twitter-bot-detection")

In [7]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [25]:
users = pd.read_pickle("../data/04_features/user_features.pkl")
users.fillna()

In [26]:
X = users.drop(columns=["label"])
y = users["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify)

In [27]:
train_pool = Pool(X_train, y_train, cat_features=["created_at_time"])
train_pool_slice = train_pool.slice([2, 3])

In [None]:
mlflow.start_run(run_name="catboost", nested=False)

In [28]:
with mlflow.start_run(nested=True):
    mlflow.set_tags({
        "lib": "catboost",
        "description": "tweets per day aded",
    })

    params = {
        "iterations": 2000,
        "learning_rate": 0.02,
        "loss_function": 'Logloss',
        "random_seed": 1,
        "od_wait": 30,
        "od_type": "Iter",
        "thread_count": 8,
        "cat_features": ["created_at_time",]
    }
    model = CatBoostClassifier(**params)
    
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=200,
        plot=False,
    )
    
    y_pred = model.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average="weighted")

    
    mlflow.log_params(params)       
    mlflow.log_metric("f1", f1, 1)
    mlflow.log_artifact('../data/04_features/user_features.pkl')
    
    model.save_model('../data/06_models/catboost', format="cbm", export_parameters=None, pool=None)
    print(classification_report(y_test, y_pred))

0:	learn: 0.6803268	test: 0.6802663	best: 0.6802663 (0)	total: 12.6ms	remaining: 25.1s
200:	learn: 0.3064747	test: 0.3157352	best: 0.3157352 (200)	total: 2.55s	remaining: 22.8s
400:	learn: 0.2782812	test: 0.2936176	best: 0.2936176 (400)	total: 5.08s	remaining: 20.2s
600:	learn: 0.2642813	test: 0.2848709	best: 0.2848709 (600)	total: 7.64s	remaining: 17.8s
800:	learn: 0.2539880	test: 0.2797824	best: 0.2797824 (800)	total: 10.1s	remaining: 15.2s
1000:	learn: 0.2451079	test: 0.2771737	best: 0.2771737 (1000)	total: 12.7s	remaining: 12.7s
1200:	learn: 0.2372145	test: 0.2749893	best: 0.2749893 (1200)	total: 15.4s	remaining: 10.2s
1400:	learn: 0.2303988	test: 0.2737050	best: 0.2737050 (1400)	total: 17.9s	remaining: 7.66s
1600:	learn: 0.2239928	test: 0.2729066	best: 0.2729057 (1586)	total: 20.8s	remaining: 5.18s
1800:	learn: 0.2181514	test: 0.2720506	best: 0.2720462 (1799)	total: 23.5s	remaining: 2.6s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.271764233
bestIteration = 

In [41]:
explain_weights_catboost(model)

Weight,Feature
0.1477,followers_count
0.0908,faw_to_tweets_ratio
0.0865,tweets_per_day
0.0865,tweets_to_faw_ratio
0.0843,account_active_for_days
0.0811,friends_count
0.0809,favourites_count
0.0646,statuses_count
0.0643,verified
0.0511,fr_to_flw_ratio


In [19]:
mlflow.end_run()

In [10]:
sorted([(feature, round(score,2)) for score, feature in zip(model.feature_importances_, users.columns)], key=lambda x: x[1])
model.get_feature_importance(train_pool, "Interaction")[:5]

array([[ 1.        ,  5.        ,  2.82067909],
       [ 2.        ,  5.        ,  2.78316243],
       [ 1.        , 12.        ,  2.33374435],
       [ 1.        , 16.        ,  2.26089442],
       [ 1.        ,  2.        ,  2.22755059]])

In [None]:
import xgboost as xgb
# read in data
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map
param = {'booster': 'dart',
         'max_depth': 5, 'learning_rate': 0.1,
         'objective': 'binary:logistic',
         'sample_type': 'uniform',
         'normalize_type': 'tree',
         'rate_drop': 0.1,
         'skip_drop': 0.5}
num_round = 50
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)