## Model XGBoost - Attacks Data

In [1]:
#Imports
import pandas as pd
import numpy as np

from eda_functions import split_data

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle

In [2]:
# Load the cleaned attack-comment dataset and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# saved with the CSV; it is kept as-is because `split_data` may depend on the
# current column layout -- confirm before dropping it.
attack_csv_path = '../data/attack_clean.csv'
attack_df = pd.read_csv(attack_csv_path)
attack_df.head()

Unnamed: 0,rev_id,comment,target
0,37675,This is not creative Those are the dictionar...,0
1,44816,the term standard model is itself less NPOV...,0
2,49851,True or false the situation as of March 2002...,0
3,89320,Next maybe you could work on being less conde...,0
4,93890,This page will need disambiguation,0


In [3]:
# Split into train/test sets with the project's `split_data` helper.
# Per the notebook author, the helper balances the classes in the training
# data only; `pct_positive=0.5` presumably requests a 50/50 class mix there
# (confirm against eda_functions.split_data).
split_options = {'pct_positive': 0.5, 'random_state': 42}
X_train, X_test, y_train, y_test = split_data(attack_df, **split_options)

In [4]:
# Two-stage pipeline: token counts from CountVectorizer feed an XGBClassifier.
# The step names ('cvec', 'xgb') are the prefixes that hyperparameter keys
# must use when addressing each step (e.g. 'cvec__min_df').
vectorizer_step = ('cvec', CountVectorizer())
classifier_step = ('xgb', XGBClassifier(use_label_encoder=False))
pipe = Pipeline(steps=[vectorizer_step, classifier_step])

In [5]:
# Hyperparameter grid, keyed as '<step name>__<parameter>'.
# Every list holds a single value, so the grid contains exactly one candidate.

# CountVectorizer settings
vectorizer_grid = {
    'cvec__max_features': [6_000],        # cap on vocabulary size
    'cvec__min_df': [3],                  # drop terms seen in fewer than 3 documents
    'cvec__max_df': [.95],                # drop terms seen in more than 95% of documents
    'cvec__stop_words': ['english'],      # built-in English stop-word list
    # NOTE(review): the `[A-Z]\w+` alternative can never be selected -- `\w+`
    # is tried first and already matches any run that starts with a capital.
    'cvec__token_pattern': [r'\w+|[A-Z]\w+'],
    'cvec__strip_accents': ['ascii'],
    'cvec__ngram_range': [(1, 1)],        # unigrams only
}

# XGBClassifier settings
booster_grid = {
    'xgb__colsample_bytree': [0.6],
    'xgb__n_estimators': [250],
}

# Merge in the original order: vectorizer keys first, then classifier keys.
pipe_params = {**vectorizer_grid, **booster_grid}

In [6]:
# Instantiate the grid search: 5-fold cross-validation of `pipe` over
# `pipe_params`. No `scoring` is passed, so candidates are ranked with the
# estimator's own default `score` method.
gs = GridSearchCV(pipe, pipe_params, cv=5)

In [7]:
# Run the cross-validated search on the training split; the fitted search
# object is the cell's displayed value.
gs.fit(X=X_train, y=y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                   

In [8]:
# Score the fitted search on both splits, then report them.
# NOTE(review): if only the training data was rebalanced, the two splits have
# different class distributions, so these numbers are not directly comparable.
train_score = gs.score(X_train, y_train)
test_score = gs.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')

Train score: 0.9047640819312363
Test score: 0.9091092650282682


In [9]:
# Persist the fitted grid-search object (vectorizer + classifier) to disk.
# pickle.dump returns None, so its result is deliberately not assigned:
# rebinding `pickle_out` would replace the open file handle with None
# inside the `with` block.
with open('../model_for_app/xgboost_attacks.pkl', 'wb') as pickle_out:
    pickle.dump(gs, pickle_out)