## Model XGBoost - Toxic Data

In [1]:
#Imports
import pandas as pd
import numpy as np

from eda_functions import split_data

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle

In [2]:
# Load the cleaned toxicity dataset from disk and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written out with the CSV - confirm whether it is needed.
toxicity_df = pd.read_csv(filepath_or_buffer='../data/toxicity_cleaned.csv')
toxicity_df.head(n=5)

Unnamed: 0,rev_id,comment,toxicity
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [3]:
# Rename the 'toxicity' column to the generic label name 'target'.
# The renamed frame is rebound to the same name instead of using
# inplace=True, so the cell reads as a plain input -> output transformation.
toxicity_df = toxicity_df.rename(columns={'toxicity': 'target'})
toxicity_df.head()

Unnamed: 0,rev_id,comment,target
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [4]:
# Split the data with the project's split_data helper (imported from
# eda_functions; its implementation is not shown in this notebook).
# Per the original note, it balances the classes in the training data only -
# presumably pct_positive=0.5 requests an even positive/negative mix.
split_kwargs = {'pct_positive': 0.5, 'random_state': 42}
X_train, X_test, y_train, y_test = split_data(toxicity_df, **split_kwargs)

In [5]:
# Two-step pipeline: a CountVectorizer ('cvec') feeding an XGBClassifier ('xgb').
# The step names are the prefixes used by the parameter grid keys.
vectorizer = CountVectorizer()
classifier = XGBClassifier(use_label_encoder=False)
pipe = Pipeline(steps=[('cvec', vectorizer), ('xgb', classifier)])

In [6]:
# Parameter grid for the pipeline, keyed as '<step name>__<parameter>'.
# Every list holds a single value, so the grid contains exactly one candidate.

# Settings for the 'cvec' (CountVectorizer) step.
# NOTE(review): in the token pattern the '\w+' alternative is tried first and
# already matches anything '[A-Z]\w+' would, so the second alternative looks
# redundant - confirm before simplifying.
cvec_grid = {
    'cvec__max_features': [5_000],
    'cvec__min_df': [2],
    'cvec__max_df': [.95],
    'cvec__stop_words': ['english'],
    'cvec__token_pattern': [r'\w+|[A-Z]\w+'],
    'cvec__strip_accents': ['ascii'],
    'cvec__ngram_range': [(1, 1)],
}

# Settings for the 'xgb' (XGBClassifier) step.
xgb_grid = {
    'xgb__colsample_bytree': [0.7],
    'xgb__n_estimators': [250],
}

# Merge the two groups; vectorizer keys come first, then classifier keys.
pipe_params = {**cvec_grid, **xgb_grid}

In [7]:
# Instantiate the grid search: 5-fold cross-validation of the pipeline over
# the parameter grid defined above.
gs = GridSearchCV(pipe, pipe_params, cv=5)

In [8]:
# Run the cross-validated search on the training split. Left as the cell's
# last expression so the fitted estimator's repr is displayed.
gs.fit(X=X_train, y=y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                   

In [9]:
# Score the fitted search on both splits, then report the two numbers.
# NOTE(review): no `scoring` was passed to GridSearchCV, so this is presumably
# the classifier's default metric (accuracy) - confirm before comparing runs.
train_score = gs.score(X_train, y_train)
test_score = gs.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')

Train score: 0.9141429909459882
Test score: 0.9174286912751678


In [10]:
# Serialize the fitted GridSearchCV object to disk for the app to load.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
with open('../model_for_app/xgboost_toxic.pkl', 'wb') as pickle_out:
    # pickle.dump returns None, so its result is deliberately not assigned;
    # assigning it would rebind the file-handle name to None inside the block.
    pickle.dump(gs, pickle_out)