# Modules import

In [5]:
import os.path

import numpy as np
import optuna
import pandas as pd
from sklearnex import patch_sklearn

patch_sklearn()
import utils.optuna_utils as ou
from optuna import create_study

optuna.logging.set_verbosity(optuna.logging.WARNING)

TIMEOUT = 30
N_SPLITS = 5

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Simple processing

In [6]:
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [7]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)

In [8]:
studies = []
results = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'simple_processing_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True)
    studies.append(study)
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "preprocessed",
        "model": model,
        "score": study.best_value
    })

  self._init_valid()


   0%|          | 00:00/00:30

Model: LGBM
Best accuracy: 0.8122062493121949
Best params: {'lgbm_boosting_type': 'dart', 'lgbm_max_depth': 9, 'lgbm_n_estimators': 80, 'lgbm_subsample': 0.6622662504219524, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 8754, 'tfidf_max_df': 0.8622308378312066, 'tfidf_min_df': 0.0031465759604621946}


  self._init_valid()


   0%|          | 00:00/00:30

Model: XGB
Best accuracy: 0.8144704400259372
Best params: {'xgb_booster': 'gbtree', 'xgb_max_depth': 12, 'xgb_n_estimators': 22, 'xgb_subsample': 0.8548761976278334, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 2450, 'tfidf_max_df': 0.8999962432624573, 'tfidf_min_df': 0.00035909771758639857}


  self._init_valid()


   0%|          | 00:00/00:30

Model: RF
Best accuracy: 0.7877788743483745
Best params: {'rf_max_depth': 10, 'rf_n_estimators': 74, 'rf_criterion': 'gini', 'rf_min_samples_split': 0.05950593651008745, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 1190, 'tfidf_max_df': 0.8151750312854288, 'tfidf_min_df': 0.08869793720586412}


  self._init_valid()


   0%|          | 00:00/00:30

Model: SVC
Best accuracy: 0.7901426203743924
Best params: {'svc_kernel': 'linear', 'svc_C': 89.02333367421613, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 1798, 'tfidf_max_df': 0.8294991187011957, 'tfidf_min_df': 0.01858793862354524}


  self._init_valid()


   0%|          | 00:00/00:30

Model: LR
Best accuracy: 0.8052435060626335
Best params: {'lr_penalty': 'l2', 'lr_C': 1.6522035086728435, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 2027, 'tfidf_max_df': 0.8550328639128291, 'tfidf_min_df': 0.0010453731753158335}


# Stemming

In [9]:
DATA_PATH = os.path.join('..', 'data', 'stemmed')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens,new_text
0,bot#9,YEA now that note GOOD,bot,others,"['yea', 'note', 'good']",yea note good
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"['listen', 'thi', 'charm', 'man', 'the', 'smit...",listen thi charm man the smith <url>
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"['wish', 'would', 'see', 'hoe', 'worst', 'part']",wish would see hoe worst part
3,bot#1,The decade in the significantly easier schedul...,bot,others,"['the', 'decad', 'significantli', 'easier', 's...",the decad significantli easier schedul i don't...
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"['""', 'theim', 'class', '=\\', '""', 'alignnon'...",""" theim class =\ "" alignnon size-ful wp-imag -..."


In [10]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)

In [11]:
studies = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'stemming_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True)
    studies.append(study)
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "stemming",
        "model": model,
        "score": study.best_value
    })

  self._init_valid()


   0%|          | 00:00/00:30

Model: LGBM
Best accuracy: 0.8069888827303331
Best params: {'lgbm_boosting_type': 'dart', 'lgbm_max_depth': 13, 'lgbm_n_estimators': 154, 'lgbm_subsample': 0.9707343137517281, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 8051, 'tfidf_max_df': 0.9262126823956709, 'tfidf_min_df': 0.012333529465341053}


  self._init_valid()


   0%|          | 00:00/00:30

Model: XGB
Best accuracy: 0.8131659293871939
Best params: {'xgb_booster': 'dart', 'xgb_max_depth': 15, 'xgb_n_estimators': 17, 'xgb_subsample': 0.9880881136232063, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 1527, 'tfidf_max_df': 0.9493639257383366, 'tfidf_min_df': 0.0031107890759746407}


  self._init_valid()


   0%|          | 00:00/00:30

Model: RF
Best accuracy: 0.7913044411771135
Best params: {'rf_max_depth': 7, 'rf_n_estimators': 368, 'rf_criterion': 'gini', 'rf_min_samples_split': 0.040479653163297186, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 8620, 'tfidf_max_df': 0.88220906074233, 'tfidf_min_df': 0.014001795496675485}


  self._init_valid()


   0%|          | 00:00/00:30

Model: SVC
Best accuracy: 0.794192781582564
Best params: {'svc_kernel': 'poly', 'svc_C': 0.6545254251277063, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 5171, 'tfidf_max_df': 0.9014141995356088, 'tfidf_min_df': 0.03747304027155967}


  self._init_valid()


   0%|          | 00:00/00:30

Model: LR
Best accuracy: 0.7915401131162263
Best params: {'lr_penalty': 'l1', 'lr_C': 0.32771730940086424, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 4066, 'tfidf_max_df': 0.991343155828387, 'tfidf_min_df': 0.031170045341059185}


# Lemmatization

In [12]:
DATA_PATH = os.path.join('..', 'data', 'lemmatized')
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens,new_text
0,bot#9,YEA now that note GOOD,bot,others,"['yea', 'note', 'good']",yea note good
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"['listen', 'thi', 'charm', 'man', 'the', 'smit...",listen thi charm man the smith <url>
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"['wish', 'would', 'see', 'hoe', 'worst', 'part']",wish would see hoe worst part
3,bot#1,The decade in the significantly easier schedul...,bot,others,"['the', 'decad', 'significantli', 'easier', 's...",the decad significantli easier schedul i don't...
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"['""', 'theim', 'class', '=\\', '""', 'alignnon'...",""" theim class =\ "" alignnon size-ful wp-imag -..."


In [13]:
def get_x_y(df):
    x = df["text"]
    y = df["account.type"]
    return x, y


x_train, y_train = get_x_y(train)
y_train = np.where(y_train == "bot", 1, 0)

In [14]:
studies = []
for model in ou.MODELS.keys():
    study = create_study(study_name=f'lemmatization_{model}', direction='maximize')
    study.optimize(lambda trial: ou.objective(trial, x_train, y_train, model=model, encoder="TFIDF", n_splits=N_SPLITS),
                   timeout=TIMEOUT, show_progress_bar=True)
    studies.append(study)
    print(f"Model: {model}")
    print(f"Best accuracy: {study.best_value}")
    print(f"Best params: {study.best_params}")
    results.append({
        "type": "stemming",
        "model": model,
        "score": study.best_value
    })

  self._init_valid()


   0%|          | 00:00/00:30

Model: LGBM
Best accuracy: 0.8110481115744799
Best params: {'lgbm_boosting_type': 'dart', 'lgbm_max_depth': 9, 'lgbm_n_estimators': 48, 'lgbm_subsample': 0.9373842683838434, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 1692, 'tfidf_max_df': 0.9452268989686466, 'tfidf_min_df': 0.002355054391130196}


  self._init_valid()


   0%|          | 00:00/00:30

Model: XGB
Best accuracy: 0.7959709666554187
Best params: {'xgb_booster': 'dart', 'xgb_max_depth': 13, 'xgb_n_estimators': 136, 'xgb_subsample': 0.8795264566829186, 'tfidf_ngram_range': 'digram', 'tfidf_max_features': 5051, 'tfidf_max_df': 0.8783184190610382, 'tfidf_min_df': 0.015782947549799278}


  self._init_valid()


   0%|          | 00:00/00:30

Model: RF
Best accuracy: 0.789470324541756
Best params: {'rf_max_depth': 15, 'rf_n_estimators': 76, 'rf_criterion': 'log_loss', 'rf_min_samples_split': 0.08581653299056421, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 3118, 'tfidf_max_df': 0.8952442038542965, 'tfidf_min_df': 0.013226049233646276}


  self._init_valid()


   0%|          | 00:00/00:30

Model: SVC
Best accuracy: 0.769372429688912
Best params: {'svc_kernel': 'rbf', 'svc_C': 38.648206973989694, 'tfidf_ngram_range': 'unigram', 'tfidf_max_features': 5465, 'tfidf_max_df': 0.8041768123131288, 'tfidf_min_df': 0.08010536021005206}


  self._init_valid()


   0%|          | 00:00/00:30



Model: LR
Best accuracy: 0.8043731724217416
Best params: {'lr_penalty': 'l2', 'lr_C': 76.85352668867269, 'tfidf_ngram_range': 'trigram', 'tfidf_max_features': 1102, 'tfidf_max_df': 0.9978600237617486, 'tfidf_min_df': 0.0021278601499148484}




In [17]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="score", ascending=False)

Unnamed: 0,type,model,score
1,preprocessed,XGB,0.81447
6,stemming,XGB,0.813166
0,preprocessed,LGBM,0.812206
10,stemming,LGBM,0.811048
5,stemming,LGBM,0.806989
4,preprocessed,LR,0.805244
14,stemming,LR,0.804373
11,stemming,XGB,0.795971
8,stemming,SVC,0.794193
9,stemming,LR,0.79154
