# Modules import

In [1]:
import os.path

import pickle
import numpy as np
import optuna
import pandas as pd
from sklearnex import patch_sklearn

patch_sklearn()
import utils.optuna_utils as ou
from optuna import create_study

optuna.logging.set_verbosity(optuna.logging.WARNING)

TIMEOUT = 1200  # seconds
TRIALS = 100  # number of trials
N_SPLITS = 5

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
np.random.seed(42)

# Simple processing

In [3]:
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [4]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)
x_validation, y_validation = get_x_y(validation)
y_validation = np.where(y_validation == "bot", 1, 0)

In [5]:
preprocessed_path = os.path.join("..", "models", "tfidf", "preprocessed")
os.makedirs(preprocessed_path, exist_ok=True)

In [None]:
studies = []
results = []
results_validation = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'simple_processing_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True, n_trials=TRIALS)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_validation.append({"type": "preprocessed",
                               "model": model,
                               **ou.get_score(retrained_model, x_validation, y_validation)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "preprocessed",
        "model": model,
        "score": study.best_value
    })

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: LGBM
Best accuracy: 0.8285694901979305
Best params: {'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth': 6, 'lgbm_n_estimators': 249, 'lgbm_subsample': 0.6827074964028326, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 7225, 'tfidf_max_df': 0.9635461108586068, 'tfidf_min_df': 0.0002199172290726914}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: XGB
Best accuracy: 0.8193477545538617
Best params: {'xgb_booster': 'gbtree', 'xgb_max_depth': 7, 'xgb_n_estimators': 25, 'xgb_subsample': 0.643613401965495, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 3071, 'tfidf_max_df': 0.843311343081232, 'tfidf_min_df': 0.0005725360114299726}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

Model: RF
Best accuracy: 0.8049660519287668
Best params: {'rf_max_depth': 12, 'rf_n_estimators': 341, 'rf_criterion': 'log_loss', 'rf_min_samples_split': 0.042375275245554914, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1414, 'tfidf_max_df': 0.8565717212703119, 'tfidf_min_df': 0.0006480440018553581}


  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

# Stemming

In [None]:
DATA_PATH = os.path.join('..', 'data', 'stemmed')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

In [None]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)
x_validation, y_validation = get_x_y(validation)
y_validation = np.where(y_validation == "bot", 1, 0)

In [None]:
preprocessed_path = os.path.join("..", "models", "tfidf", "stemmed")
os.makedirs(preprocessed_path, exist_ok=True)

In [None]:
for model in ou.MODELS.keys():
    study = create_study(study_name=f'stemming_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_validation.append({"type": "stemming",
                               "model": model,
                               **ou.get_score(retrained_model, x_validation, y_validation)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "stemming",
        "model": model,
        "score": study.best_value
    })

# Lemmatization

In [None]:
DATA_PATH = os.path.join('..', 'data', 'lemmatized')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

In [None]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)

In [None]:
preprocessed_path = os.path.join("..", "models", "tfidf", "lemmatization")
os.makedirs(preprocessed_path, exist_ok=True)

In [None]:
for model in ou.MODELS.keys():
    study = create_study(study_name=f'lemmatization_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True)
    studies.append(study)
    retrained_model = ou.get_best_model(study.best_params, x_train, y_train)
    with open(os.path.join(preprocessed_path, f"{model}.pickle"), "wb") as f:
        pickle.dump(retrained_model, f)
    results_validation.append({"type": "lemmatization",
                               "model": model,
                               **ou.get_score(retrained_model, x_validation, y_validation)})
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "lemmatization",
        "model": model,
        "score": study.best_value
    })

# Results

In [17]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="score", ascending=False)

Unnamed: 0,type,model,score
10,lemmatization,LGBM,0.832188
6,stemming,XGB,0.829194
5,stemming,LGBM,0.829049
0,preprocessed,LGBM,0.828569
13,lemmatization,SVC,0.827937
3,preprocessed,SVC,0.825715
8,stemming,SVC,0.824947
11,lemmatization,XGB,0.820843
1,preprocessed,XGB,0.819348
14,lemmatization,LR,0.817795


In [18]:
results_validation_df = pd.DataFrame(results_validation)
results_validation_df.sort_values(by="balanced_accuracy", ascending=False)

Unnamed: 0,type,model,balanced_accuracy,f1_score,precision,recall
3,preprocessed,SVC,0.830959,0.841677,0.792337,0.897569
10,lemmatization,LGBM,0.828781,0.840615,0.787121,0.90191
5,stemming,LGBM,0.824438,0.836305,0.784195,0.895833
13,lemmatization,SVC,0.824007,0.835433,0.785332,0.892361
0,preprocessed,LGBM,0.822252,0.836727,0.774575,0.909722
6,stemming,XGB,0.818792,0.830969,0.779468,0.889757
4,preprocessed,LR,0.817069,0.826535,0.786667,0.87066
8,stemming,SVC,0.816172,0.830868,0.7702,0.90191
14,lemmatization,LR,0.814035,0.822259,0.788217,0.859375
11,lemmatization,XGB,0.814003,0.828388,0.769747,0.896701
