# Modules import

In [1]:
import os.path
import pickle

import numpy as np
import optuna
import pandas as pd
from sklearnex import patch_sklearn

patch_sklearn()
import utils.optuna_utils as ou
from optuna import create_study

optuna.logging.set_verbosity(optuna.logging.WARNING)

TIMEOUT = 1200  # seconds
TRIALS = 100  # number of trials
N_SPLITS = 5

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
np.random.seed(42)

# Simple processing

In [3]:
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [4]:
def get_x_y(df: pd.DataFrame) -> (pd.Series, pd.Series):
    """
    Get predictors and target from dataframe
    :param df: dataframe
    :return: predictors, target
    """
    x = df["text"]
    y = df["account.type"]
    y = np.where(y == "bot", 1, 0)
    return x, y


x_train, y_train = get_x_y(train)
x_validation, y_validation = get_x_y(validation)
x_train = pd.concat([x_train, x_validation])
y_train = np.append(y_train, y_validation)
x_train.shape, y_train.shape

((23014,), (23014,))

In [5]:
x_test, y_test = get_x_y(test)
x_test.shape, y_test.shape

((2558,), (2558,))

In [6]:
preprocessed_path = os.path.join("..", "models", "tfidf", "preprocessed")
os.makedirs(preprocessed_path, exist_ok=True)

In [7]:
studies = []
results_optuna = []
results_test = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'simple_processing_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True, n_trials=TRIALS)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_test.append({"type": "preprocessed",
                         "model": model,
                         **ou.get_score(retrained_model, x_test, y_test)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results_optuna.append({
        "type": "preprocessed",
        "model": model,
        "score": study.best_value,
        "params": study.best_params
    })

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LGBM
Best accuracy: 0.8277647009516128
Best params: {'lgbm_boosting_type': 'dart', 'lgbm_max_depth': 8, 'lgbm_n_estimators': 395, 'lgbm_subsample': 0.9786672573591595, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 3719, 'tfidf_max_df': 0.9620582864667189, 'tfidf_min_df': 0.0004178886620714936}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: XGB
Best accuracy: 0.82046196059508
Best params: {'xgb_booster': 'dart', 'xgb_max_depth': 13, 'xgb_n_estimators': 103, 'xgb_subsample': 0.9150958883282474, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1466, 'tfidf_max_df': 0.9625476812915221, 'tfidf_min_df': 3.475395363315486e-05}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: RF
Best accuracy: 0.8078636342526254
Best params: {'rf_max_depth': 15, 'rf_n_estimators': 201, 'rf_criterion': 'log_loss', 'rf_min_samples_split': 0.013860937783493876, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 6481, 'tfidf_max_df': 0.9478755977460126, 'tfidf_min_df': 0.0005835201888275228}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: SVC
Best accuracy: 0.8279359142381717
Best params: {'svc_kernel': 'rbf', 'svc_C': 1.4318393113722407, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 2493, 'tfidf_max_df': 0.821402704543996, 'tfidf_min_df': 0.0007078433320654484}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]



Model: LR
Best accuracy: 0.8051664916587093
Best params: {'lr_penalty': 'l1', 'lr_C': 10.022385601516065, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 4565, 'tfidf_max_df': 0.9250257928550478, 'tfidf_min_df': 0.003670702795655745}


# Stemming

In [8]:
DATA_PATH = os.path.join('..', 'data', 'stemmed')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens,new_text
0,bot#9,YEA now that note GOOD,bot,others,"['yea', 'note', 'good']",yea note good
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"['listen', 'thi', 'charm', 'man', 'the', 'smit...",listen thi charm man the smith <url>
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"['wish', 'would', 'see', 'hoe', 'worst', 'part']",wish would see hoe worst part
3,bot#1,The decade in the significantly easier schedul...,bot,others,"['the', 'decad', 'significantli', 'easier', 's...",the decad significantli easier schedul i don't...
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"['""', 'theim', 'class', '=\\', '""', 'alignnon'...",""" theim class =\ "" alignnon size-ful wp-imag -..."


In [9]:
x_train, y_train = get_x_y(train)
x_validation, y_validation = get_x_y(validation)
x_train = pd.concat([x_train, x_validation])
y_train = np.append(y_train, y_validation)
x_train.shape, y_train.shape

((23014,), (23014,))

In [10]:
x_test, y_test = get_x_y(test)
x_test.shape, y_test.shape

((2558,), (2558,))

In [11]:
preprocessed_path = os.path.join("..", "models", "tfidf", "stemmed")
os.makedirs(preprocessed_path, exist_ok=True)

In [12]:
for model in ou.MODELS.keys():
    study = create_study(study_name=f'stemming_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True, n_trials=TRIALS)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_test.append({"type": "stemming",
                         "model": model,
                         **ou.get_score(retrained_model, x_test, y_test)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results_optuna.append({
        "type": "stemming",
        "model": model,
        "score": study.best_value,
        "params": study.best_params
    })

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LGBM
Best accuracy: 0.8243324116625151
Best params: {'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth': 14, 'lgbm_n_estimators': 47, 'lgbm_subsample': 0.6763114591400695, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 7896, 'tfidf_max_df': 0.8726995572865293, 'tfidf_min_df': 0.00010761923718688525}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: XGB
Best accuracy: 0.8175518633344548
Best params: {'xgb_booster': 'gbtree', 'xgb_max_depth': 3, 'xgb_n_estimators': 126, 'xgb_subsample': 0.6789374154333612, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 1993, 'tfidf_max_df': 0.8038501703381888, 'tfidf_min_df': 0.0018039544794470064}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: RF
Best accuracy: 0.8046936681101521
Best params: {'rf_max_depth': 15, 'rf_n_estimators': 117, 'rf_criterion': 'entropy', 'rf_min_samples_split': 0.026128551127656066, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1738, 'tfidf_max_df': 0.8149080290001827, 'tfidf_min_df': 5.672873705708315e-05}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: SVC
Best accuracy: 0.8273282005217389
Best params: {'svc_kernel': 'rbf', 'svc_C': 1.2769323445571867, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 8432, 'tfidf_max_df': 0.901358223945256, 'tfidf_min_df': 0.0003497466006027218}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LR
Best accuracy: 0.8168546067116212
Best params: {'lr_penalty': 'l2', 'lr_C': 1.476130789859466, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 9379, 'tfidf_max_df': 0.9000799027049124, 'tfidf_min_df': 1.7381024782369694e-05}


# Lemmatization

In [13]:
DATA_PATH = os.path.join('..', 'data', 'lemmatized')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens,new_text
0,bot#9,YEA now that note GOOD,bot,others,"['yea', 'note', 'good']",yea note good
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"['listen', 'thi', 'charm', 'man', 'the', 'smit...",listen thi charm man the smith <url>
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"['wish', 'would', 'see', 'hoe', 'worst', 'part']",wish would see hoe worst part
3,bot#1,The decade in the significantly easier schedul...,bot,others,"['the', 'decad', 'significantli', 'easier', 's...",the decad significantli easier schedul i don't...
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"['""', 'theim', 'class', '=\\', '""', 'alignnon'...",""" theim class =\ "" alignnon size-ful wp-imag -..."


In [14]:
x_train, y_train = get_x_y(train)
x_validation, y_validation = get_x_y(validation)
x_train = pd.concat([x_train, x_validation])
y_train = np.append(y_train, y_validation)
x_train.shape, y_train.shape

((23014,), (23014,))

In [15]:
x_test, y_test = get_x_y(test)
x_test.shape, y_test.shape

((2558,), (2558,))

In [16]:
preprocessed_path = os.path.join("..", "models", "tfidf", "lemmatization")
os.makedirs(preprocessed_path, exist_ok=True)

In [17]:
for model in ou.MODELS.keys():
    study = create_study(study_name=f'lemmatization_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True, n_trials=TRIALS)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_test.append({"type": "lemmatization",
                         "model": model,
                         **ou.get_score(retrained_model, x_test, y_test)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results_optuna.append({
        "type": "lemmatization",
        "model": model,
        "score": study.best_value,
        "params": study.best_params
    })

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LGBM
Best accuracy: 0.8295011310384824
Best params: {'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth': 12, 'lgbm_n_estimators': 333, 'lgbm_subsample': 0.9367393538754355, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 5697, 'tfidf_max_df': 0.8461080367889807, 'tfidf_min_df': 0.00012931054966621824}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: XGB
Best accuracy: 0.8141634298690065
Best params: {'xgb_booster': 'dart', 'xgb_max_depth': 1, 'xgb_n_estimators': 234, 'xgb_subsample': 0.6118808404517502, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1010, 'tfidf_max_df': 0.9170902863066774, 'tfidf_min_df': 0.0006777006523809902}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: RF
Best accuracy: 0.8015220594981745
Best params: {'rf_max_depth': 14, 'rf_n_estimators': 126, 'rf_criterion': 'log_loss', 'rf_min_samples_split': 0.07965827051163638, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1548, 'tfidf_max_df': 0.9551921262748005, 'tfidf_min_df': 0.0005103527159608934}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: SVC
Best accuracy: 0.8260242874042223
Best params: {'svc_kernel': 'rbf', 'svc_C': 1.3347440516526416, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 1743, 'tfidf_max_df': 0.9592307945556902, 'tfidf_min_df': 0.0005156788905434892}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LR
Best accuracy: 0.814377555031224
Best params: {'lr_penalty': 'l2', 'lr_C': 1.728445496973784, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 4147, 'tfidf_max_df': 0.9701699845498118, 'tfidf_min_df': 0.00023461362461710234}


# Results

In [18]:
results_df = pd.DataFrame(results_optuna)
results_df.sort_values(by="score", ascending=False)

Unnamed: 0,type,model,score,params
10,lemmatization,LGBM,0.829501,"{'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth..."
3,preprocessed,SVC,0.827936,"{'svc_kernel': 'rbf', 'svc_C': 1.4318393113722..."
0,preprocessed,LGBM,0.827765,"{'lgbm_boosting_type': 'dart', 'lgbm_max_depth..."
8,stemming,SVC,0.827328,"{'svc_kernel': 'rbf', 'svc_C': 1.2769323445571..."
13,lemmatization,SVC,0.826024,"{'svc_kernel': 'rbf', 'svc_C': 1.3347440516526..."
5,stemming,LGBM,0.824332,"{'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth..."
1,preprocessed,XGB,0.820462,"{'xgb_booster': 'dart', 'xgb_max_depth': 13, '..."
6,stemming,XGB,0.817552,"{'xgb_booster': 'gbtree', 'xgb_max_depth': 3, ..."
9,stemming,LR,0.816855,"{'lr_penalty': 'l2', 'lr_C': 1.476130789859466..."
14,lemmatization,LR,0.814378,"{'lr_penalty': 'l2', 'lr_C': 1.728445496973784..."


In [19]:
results_df.sort_values(by="score", ascending=False).to_csv(
    os.path.join("results", "tfidf_optuna.csv"), index=False)

In [20]:
results_test_df = pd.DataFrame(results_test)
results_test_df.sort_values(by="balanced_accuracy", ascending=False)

Unnamed: 0,type,model,balanced_accuracy,f1_score,precision,recall
13,lemmatization,SVC,0.831058,0.843251,0.787263,0.907813
3,preprocessed,SVC,0.828714,0.840727,0.786395,0.903125
8,stemming,SVC,0.825982,0.837294,0.786942,0.894531
9,stemming,LR,0.824428,0.834011,0.791579,0.88125
0,preprocessed,LGBM,0.82361,0.840241,0.768633,0.926562
10,lemmatization,LGBM,0.822456,0.835626,0.778677,0.901563
6,stemming,XGB,0.818936,0.83255,0.775084,0.899219
14,lemmatization,LR,0.818568,0.827637,0.788952,0.870313
5,stemming,LGBM,0.818528,0.835578,0.764591,0.921094
11,lemmatization,XGB,0.809153,0.82559,0.76087,0.902344


In [21]:
results_test_df.sort_values(by="balanced_accuracy", ascending=False).to_csv(
    os.path.join("results", "tfidf.csv"), index=False)