# XGB Tuning: Toxicity

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from eda_functions import split_data

In [5]:
df = pd.read_csv('../data/toxicity_cleaned.csv', index_col='rev_id').rename({'toxicity': 'target'}, axis=1).dropna()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159666 entries, 2232 to 699897151
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  159666 non-null  object
 1   target   159666 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


Unnamed: 0_level_0,comment,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2232,This::One can make an analogy in mathematical ...,0
4216,":Clarification for you (and Zundark's right, ...",0
8953,Elected or Electoral? JHK,0
26547,This is such a fun entry. DevotchkaI once ha...,0
28959,Please relate the ozone hole to increases in c...,0


In [6]:
# Split data
X_train, X_test, y_train, y_test = split_data(df, pct_positive=0.5, test_size=5_000, train_size=20_000)

# Check number of observations and class proportions 
pd.DataFrame({
    f'Train (n={y_train.shape[0]})': y_train.value_counts(normalize=True),
    f'Test (n={y_test.shape[0]})': y_test.value_counts(normalize=True)})

Unnamed: 0,Train (n=20000),Test (n=5000)
0,0.5,0.8806
1,0.5,0.1194


**`Baseline Accuracy: 88.2%`**

In [6]:
%%time
pipe = Pipeline([
    ('cvec', CountVectorizer(strip_accents='ascii', ngram_range=(1, 2), max_features=10_000)),
    ('xgb', XGBClassifier(eval_metric='error', use_label_encoder=False))
])

regex_list = [
    r'\w+|[A-Z]\w+',     # captures lowercase words, and words with any uppercase characters
    r'\w+|[A-Z]\w+|\S+'  # captures words along with consecutive puncuation such as !!, ???, etc.
]

params = {
    'cvec__token_pattern': regex_list,
    'cvec__max_df': [1.0, 0.95, 0.90, 0.80],
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [4, 5, 6]
}

gs = GridSearchCV(pipe, params, cv=3)
gs.fit(X_train, y_train)

Wall time: 1h 15min 21s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(max_features=10000,
                                                        ngram_range=(1, 2),
                                                        strip_accents='ascii')),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      eval_metric='error',
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                  

In [7]:
gs.best_params_

{'cvec__max_df': 1.0,
 'cvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__max_depth': 6,
 'xgb__n_estimators': 150}

In [8]:
print('Best GridSearchCV score: ', round(gs.best_score_, 4))
print('Train score: ', round(gs.score(X_train, y_train), 4))
print('Test score: ', round(gs.score(X_test, y_test), 4))

Best GridSearchCV score:  0.8706
Train score:  0.9314


ValueError: np.nan is an invalid document, expected byte or unicode string.