# XGB Tuning: Attacks

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from eda_functions import split_data

In [2]:
# Load the cleaned attack-comment data, using the revision id as the index.
df = pd.read_csv(
    '../data/attack_clean.csv',
    index_col='rev_id',
)

# Print the column dtypes / non-null counts, then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115563 entries, 37675 to 699897151
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  115563 non-null  object
 1   target   115563 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


Unnamed: 0_level_0,comment,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37675,This is not creative Those are the dictionar...,0
44816,the term standard model is itself less NPOV...,0
49851,True or false the situation as of March 2002...,0
89320,Next maybe you could work on being less conde...,0
93890,This page will need disambiguation,0


In [3]:
# Draw the train / test samples with the project helper `split_data`.
# NOTE(review): judging by the class-proportion table this cell displays,
# `pct_positive` rebalances only the training sample; the test sample keeps
# the imbalanced class mix -- confirm against `eda_functions.split_data`.
X_train, X_test, y_train, y_test = split_data(
    df,
    pct_positive=0.5,
    test_size=5_000,
    train_size=20_000,
)

# Side-by-side table of sample sizes (in the column headers) and the
# fraction of each class in the two splits.
train_class_props = y_train.value_counts(normalize=True)
test_class_props = y_test.value_counts(normalize=True)
pd.DataFrame({
    f'Train (n={y_train.shape[0]})': train_class_props,
    f'Test (n={y_test.shape[0]})': test_class_props,
})

Unnamed: 0,Train (n=20000),Test (n=5000)
0,0.5,0.868
1,0.5,0.132


**`Baseline Accuracy: 86.8%`** (the accuracy obtained by always predicting the majority class, 0, on the test set)

In [4]:
%%time
pipe = Pipeline([
    ('cvec', CountVectorizer(strip_accents='ascii', ngram_range=(1, 2), max_features=10_000)),
    ('xgb', XGBClassifier(eval_metric='error', use_label_encoder=False))
])

regex_list = [
    r'\w+|[A-Z]\w+',     # captures lowercase words, and words with any uppercase characters
    r'\w+|[A-Z]\w+|\S+'  # captures words along with consecutive puncuation such as !!, ???, etc.
]

params = {
    'cvec__token_pattern': regex_list,
    'cvec__max_df': [1.0, 0.95, 0.90, 0.80],
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [4, 5, 6]
}

gs = GridSearchCV(pipe, params, cv=3)
gs.fit(X_train, y_train)

Wall time: 1h 13min 58s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(max_features=10000,
                                                        ngram_range=(1, 2),
                                                        strip_accents='ascii')),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      eval_metric='error',
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                  

In [5]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_

{'cvec__max_df': 1.0,
 'cvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__max_depth': 6,
 'xgb__n_estimators': 150}

In [6]:
# Report the best mean cross-validation score from the search, followed by the
# score of the fitted search object on the training and test samples.
score_report = (
    ('Best GridSearchCV score: ', gs.best_score_),
    ('Train score: ', gs.score(X_train, y_train)),
    ('Test score: ', gs.score(X_test, y_test)),
)
for label, value in score_report:
    print(label, round(value, 4))

Best GridSearchCV score:  0.8579
Train score:  0.9248
Test score:  0.8904
