# Modules import

In [1]:
import os.path
import pickle

import numpy as np
import optuna
import pandas as pd
from sklearnex import patch_sklearn

patch_sklearn()
import utils.optuna_utils as ou
from optuna import create_study

optuna.logging.set_verbosity(optuna.logging.WARNING)

TIMEOUT = 1200  # seconds
TRIALS = 100  # number of trials
N_SPLITS = 5

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
np.random.seed(42)

# Simple processing

In [3]:
DATA_PATH = os.path.join('..', 'data', 'bert_embeddings')
train = pd.read_pickle(os.path.join(DATA_PATH, 'train.pkl'))
validation = pd.read_pickle(os.path.join(DATA_PATH, 'validation.pkl'))
test = pd.read_pickle(os.path.join(DATA_PATH, 'test.pkl'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,bert_embeddings
0,bot#9,YEA now that note GOOD,bot,others,"[0.10614613, 0.0023416397, 0.18387558, 0.25720..."
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"[-0.15180907, 0.1564969, -0.10380695, 0.157478..."
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"[0.19033994, -0.039005734, -0.015785955, 0.235..."
3,bot#1,The decade in the significantly easier schedul...,bot,others,"[0.1858164, 0.07074168, 0.030424008, 0.2930759..."
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"[0.20630777, 0.35826805, 0.041690856, 0.272989..."


In [4]:
train["bert_embeddings"].iloc[0].shape

(768,)

In [5]:
train["bert_embeddings"].iloc[1].shape

(768,)

In [6]:
def get_x_y(df: pd.DataFrame) -> (pd.Series, pd.Series):
    """
    Get predictors and target from dataframe
    :param df: dataframe
    :return: predictors, target
    """
    x = np.concatenate([x.reshape(1, -1) for x in df["bert_embeddings"]])
    y = df["account.type"]
    y = np.where(y == "bot", 1, 0)
    return x, y


x_train, y_train = get_x_y(train)
x_validation, y_validation = get_x_y(validation)
x_train = np.concatenate([x_train, x_validation])
y_train = np.append(y_train, y_validation)
x_train.shape, y_train.shape

((23014, 768), (23014,))

In [7]:
x_test, y_test = get_x_y(test)
x_test.shape, y_test.shape

((2558, 768), (2558,))

In [8]:
preprocessed_path = os.path.join("..", "models", "bert_embeddings")
os.makedirs(preprocessed_path, exist_ok=True)

In [9]:
studies = []
results_optuna = []
results_test = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'bert_embeddings_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder=None, n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True, n_trials=TRIALS)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_test.append({"type": "bert_embeddings",
                         "model": model,
                         **ou.get_score(retrained_model, x_test, y_test)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results_optuna.append({
        "type": "bert_embeddings",
        "model": model,
        "score": study.best_value,
        "params": study.best_params
    })

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LGBM
Best accuracy: 0.8596951387811214
Best params: {'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth': 7, 'lgbm_n_estimators': 385, 'lgbm_subsample': 0.9695453366156187}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: XGB
Best accuracy: 0.8540899189752803
Best params: {'xgb_booster': 'dart', 'xgb_max_depth': 15, 'xgb_n_estimators': 244, 'xgb_subsample': 0.8931104403282367}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: RF
Best accuracy: 0.8111590133251474
Best params: {'rf_max_depth': 12, 'rf_n_estimators': 32, 'rf_criterion': 'gini', 'rf_min_samples_split': 0.012701770392299291}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: SVC
Best accuracy: 0.8783788335143825
Best params: {'svc_kernel': 'rbf', 'svc_C': 21.642802198313092}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LR
Best accuracy: 0.8450507674108376
Best params: {'lr_penalty': 'l1', 'lr_C': 11.802229858706795}


# Results

In [10]:
results_df = pd.DataFrame(results_optuna)
results_df.sort_values(by="score", ascending=False)

Unnamed: 0,type,model,score,params
3,bert_embeddings,SVC,0.878379,"{'svc_kernel': 'rbf', 'svc_C': 21.642802198313..."
0,bert_embeddings,LGBM,0.859695,"{'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth..."
1,bert_embeddings,XGB,0.85409,"{'xgb_booster': 'dart', 'xgb_max_depth': 15, '..."
4,bert_embeddings,LR,0.845051,"{'lr_penalty': 'l1', 'lr_C': 11.802229858706795}"
2,bert_embeddings,RF,0.811159,"{'rf_max_depth': 12, 'rf_n_estimators': 32, 'r..."


In [11]:
results_df.to_csv(os.path.join("results", "bert_optuna.csv"), index=False)

In [12]:
results_test_df = pd.DataFrame(results_test)
results_test_df.sort_values(by="balanced_accuracy", ascending=False)

Unnamed: 0,type,model,balanced_accuracy,f1_score,precision,recall
3,bert_embeddings,SVC,0.875681,0.876265,0.872868,0.879687
0,bert_embeddings,LGBM,0.856122,0.859004,0.842857,0.875781
1,bert_embeddings,XGB,0.851823,0.854622,0.839488,0.870313
4,bert_embeddings,LR,0.839317,0.841618,0.830418,0.853125
2,bert_embeddings,RF,0.806096,0.806854,0.804348,0.809375


In [13]:
results_test_df.sort_values(by="balanced_accuracy", ascending=False).to_csv(
    os.path.join("results", "bert.csv"), index=False)