# Model XGBoost - Toxicity Data

In [1]:
#Imports
import pandas as pd
import numpy as np

from eda_functions import split_data

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read in cleaned toxicity file
toxicity_df = pd.read_csv('../data/toxicity_cleaned.csv')
toxicity_df.head()

Unnamed: 0,rev_id,comment,toxicity
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [4]:
# Renaname toxicity column
toxicity_df.rename(columns={'toxicity': 'target'}, inplace=True)
toxicity_df.head()

Unnamed: 0,rev_id,comment,target
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [5]:
# Apply the custom train test split function to balance the classes in the training data only
X_train, X_test, y_train, y_test = split_data(
    toxicity_df,
    pct_positive=0.5,
    random_state=42)

In [6]:
# View the split for the train and test data
pd.DataFrame({
    f'Train (n={y_train.shape[0]})': y_train.value_counts(normalize=True),
    f'Test (n={y_test.shape[0]})': y_test.value_counts(normalize=True)})

Unnamed: 0,Train (n=25624),Test (n=47680)
0,0.5,0.883284
1,0.5,0.116716


In [7]:
# Set up a pipeline with CountVectorizer and XGBClassifier
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgb', XGBClassifier())
])

In [8]:
# Set parameters
pipe_params = {
    'cvec__max_features': [4_000, 5_000, 6_000],
    'cvec__min_df'      : [2, 3],
    'cvec__max_df'      : [9, .95],
    'cvec__stop_words'  : ['english'],
    'cvec__token_pattern' : [r'\w+|[A-Z]\w+'],
    'cvec__strip_accents' : ['ascii'],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'xgb__colsample_bytree': [0.5, 0.6, 0.7],
    'xgb__n_estimators': [100, 200, 250]
}

In [9]:
# Instatiate Gridsearch
gs = GridSearchCV(estimator = pipe,
                     param_grid = pipe_params,
                     cv = 5)

In [1]:
#gs.fit(X_train, y_train)
#uncomment this code to run gridsearch

In [11]:
gs.best_score_

0.8703559947449591

In [12]:
print(f'Train score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Train score: 0.9164455198251639
Test score: 0.916736577181208


In [13]:
gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 5000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'cvec__strip_accents': 'ascii',
 'cvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__colsample_bytree': 0.7,
 'xgb__n_estimators': 250}

In [14]:
# Set up a pipeline with TfidVectorizer and XGBClassifier
pipe_tfid = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('xgb', XGBClassifier())
])

In [15]:
# Set parameters
pipe_tfid_params = {
    'tvec__max_features': [4_000, 5_000, 6_000],
    'tvec__min_df'      : [2, 3],
    'tvec__max_df'      : [9, .95],
    'tvec__stop_words'  : ['english'],
    'tvec__token_pattern' : [r'\w+|[A-Z]\w+'],
    'tvec__strip_accents' : ['ascii'],
    'tvec__ngram_range' : [(1,1), (1,2)],
    'xgb__colsample_bytree': [0.5, 0.75],
    'xgb__n_estimators': [100, 200, 300]
}

In [18]:
# Instatiate Gridsearch
gs_tfid = GridSearchCV(estimator = pipe_tfid,
                     param_grid = pipe_tfid_params,
                     cv = 5)

In [2]:
#gs_tfid.fit(X_train, y_train)
#uncomment this code to run gridsearch

In [20]:
gs_tfid.best_score_

0.8723853620456579

In [21]:
print(f'Train score: {gs_tfid.score(X_train, y_train)}')
print(f'Test score: {gs_tfid.score(X_test, y_test)}')

Train score: 0.9345925694661255
Test score: 0.9128775167785235


In [22]:
gs_tfid.best_params_

{'tvec__max_df': 0.95,
 'tvec__max_features': 6000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english',
 'tvec__strip_accents': 'ascii',
 'tvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__colsample_bytree': 0.75,
 'xgb__n_estimators': 300}