# Model XGBoost - Aggression Data

In [29]:
#Imports
import pandas as pd
import numpy as np

from eda_functions import split_data

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read in cleaned aggression file
aggression_df = pd.read_csv('../data/aggression_clean_data.csv')
aggression_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
0,37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
1,44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
2,49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
3,89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
4,93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [3]:
# Drop unnecessary columns and rename label column
aggression_df = aggression_df[['rev_id','comment','label']]
aggression_df.rename(columns={'label':'target'}, inplace=True)
aggression_df.head()

Unnamed: 0,rev_id,comment,target
0,37675,This is not creative Those are the dictionary...,0
1,44816,the term standard model is itself less NPOV t...,0
2,49851,True or false the situation as of March 2002 w...,0
3,89320,Next maybe you could work on being less conde...,0
4,93890,This page will need disambiguation,0


In [5]:
# Apply the custom train test split function to balance the classes in the training data only
X_train, X_test, y_train, y_test = split_data(
    aggression_df,
    pct_positive=0.5,
    random_state=42)

In [6]:
# View the split for the train and test data
pd.DataFrame({
    f'Train (n={y_train.shape[0]})': y_train.value_counts(normalize=True),
    f'Test (n={y_test.shape[0]})': y_test.value_counts(normalize=True)})

Unnamed: 0,Train (n=23824),Test (n=34647)
0,0.5,0.853147
1,0.5,0.146853


In [7]:
# Set up a pipeline with CountVectorizer and XGBClassifier
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgb', XGBClassifier())
])

In [16]:
# Set parameters
pipe_params = {
    'cvec__max_features': [2_000, 3_000, 5_000],
    'cvec__min_df'      : [2,3],
    'cvec__max_df'      : [9, .95],
    'cvec__stop_words'  : ['english'],
    'cvec__token_pattern' : [r'\w+|[A-Z]\w+'],
    'cvec__strip_accents' : ['ascii'],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'xgb__colsample_bytree': [0.5, 0.75],
    'xgb__n_estimators': [100, 150, 200]
}

In [17]:
# Instatiate Gridsearch
gs = GridSearchCV(estimator = pipe,
                     param_grid = pipe_params,
                     cv = 5)

In [18]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                   

In [19]:
gs.best_score_

0.8354602417748364

In [20]:
print(f'Train score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Train score: 0.8891873740765615
Test score: 0.8932952347966635


In [21]:
gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 5000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'cvec__strip_accents': 'ascii',
 'cvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__colsample_bytree': 0.75,
 'xgb__n_estimators': 200}

In [22]:
# Set parameters
pipe_params = {
    'cvec__max_features': [2_000, 3_000, 5_000, 7_000],
    'cvec__min_df'      : [2, 3, 4],
    'cvec__max_df'      : [9, .95],
    'cvec__stop_words'  : ['english'],
    'cvec__token_pattern' : [r'\w+|[A-Z]\w+'],
    'cvec__strip_accents' : ['ascii'],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'xgb__colsample_bytree': [0.5, 0.75, 0.85],
    'xgb__n_estimators': [100, 150, 200, 250]
}

In [23]:
# Instatiate Gridsearch
gs = GridSearchCV(estimator = pipe,
                     param_grid = pipe_params,
                     cv = 5)

In [25]:
gs.fit(X_train, y_train)





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                   

In [28]:
gs.best_score_

0.8381886446353951

In [26]:
print(f'Train score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Train score: 0.8978760913364674
Test score: 0.8937570352411465


In [27]:
gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 5000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'cvec__strip_accents': 'ascii',
 'cvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__colsample_bytree': 0.75,
 'xgb__n_estimators': 250}

In [35]:
# Set up a pipeline with TfidVectorizer and XGBClassifier
pipe_tfid = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('xgb', XGBClassifier())
])

In [36]:
# Set parameters
pipe_tfid_params = {
    'tvec__max_features': [4_000, 5_000, 6_000],
    'tvec__min_df'      : [2, 3],
    'tvec__max_df'      : [9, .95],
    'tvec__stop_words'  : ['english'],
    'tvec__token_pattern' : [r'\w+|[A-Z]\w+'],
    'tvec__strip_accents' : ['ascii'],
    'tvec__ngram_range' : [(1,1), (1,2)],
    'xgb__colsample_bytree': [0.5, 0.75, 0.85],
    'xgb__n_estimators': [100, 200, 300]
}

In [37]:
# Instatiate Gridsearch
gs_tfid = GridSearchCV(estimator = pipe_tfid,
                     param_grid = pipe_tfid_params,
                     cv = 5)

In [38]:
gs_tfid.fit(X_train, y_train)





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                   

In [39]:
gs_tfid.best_score_

0.8392379009059727

In [40]:
print(f'Train score: {gs_tfid.score(X_train, y_train)}')
print(f'Test score: {gs_tfid.score(X_test, y_test)}')

Train score: 0.9157991940899933
Test score: 0.8887926804629549


In [41]:
gs_tfid.best_params_

{'tvec__max_df': 0.95,
 'tvec__max_features': 5000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english',
 'tvec__strip_accents': 'ascii',
 'tvec__token_pattern': '\\w+|[A-Z]\\w+',
 'xgb__colsample_bytree': 0.5,
 'xgb__n_estimators': 300}