# XGB Tuning: Aggression

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from eda_functions import split_data

In [2]:
# Read the aggression CSV keyed by revision id, exposing the `label` column as `target`
df = (
    pd.read_csv('../data/aggression_clean_data.csv', index_col='rev_id')
    .rename(columns={'label': 'target'})
)

# Column dtypes / null counts, followed by a peek at the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115493 entries, 37675 to 699897151
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   comment           115493 non-null  object 
 1   year              115493 non-null  int64  
 2   logged_in         115493 non-null  bool   
 3   ns                115493 non-null  object 
 4   sample            115493 non-null  object 
 5   split             115493 non-null  object 
 6   aggression        115493 non-null  float64
 7   aggression_score  115493 non-null  float64
 8   target            115493 non-null  int64  
dtypes: bool(1), float64(2), int64(2), object(4)
memory usage: 8.0+ MB


Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,aggression,aggression_score,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [3]:
# Partition into train / test sets.
# NOTE(review): the table below shows train balanced 50/50 while test is not, so
# `pct_positive` presumably only rebalances the training set — confirm in eda_functions.split_data.
X_train, X_test, y_train, y_test = split_data(
    df,
    pct_positive=0.5,
    test_size=5_000,
    train_size=20_000,
)

# One column per partition: header carries the row count, cells hold the class shares
pd.DataFrame({
    f'{name} (n={len(labels)})': labels.value_counts(normalize=True)
    for name, labels in (('Train', y_train), ('Test', y_test))
})

Unnamed: 0,Train (n=20000),Test (n=5000)
0,0.5,0.855
1,0.5,0.145


**`Baseline Accuracy: 85.5%`** — the accuracy obtained by always predicting the majority class (`target = 0`) on the test set.

In [None]:
%%time
pipe = Pipeline([
    ('cvec', CountVectorizer(strip_accents='ascii', ngram_range=(1, 2), max_features=10_000)),
    ('xgb', XGBClassifier(eval_metric='error', use_label_encoder=False))
])

regex_list = [
    r'\w+|[A-Z]\w+',     # captures lowercase words, and words with any uppercase characters
    r'\w+|[A-Z]\w+|\S+'  # captures words along with consecutive puncuation such as !!, ???, etc.
]

params = {
    'cvec__token_pattern': regex_list,
    'cvec__max_df': [1.0, 0.95, 0.90, 0.80],
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [4, 5, 6]
}

gs = GridSearchCV(pipe, params, cv=3)
gs.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the grid search
gs.best_params_

In [None]:
# Cross-validated score of the best candidate, then the refit model's score on each partition
for caption, score in (
    ('Best GridSearchCV score: ', gs.best_score_),
    ('Train score: ', gs.score(X_train, y_train)),
    ('Test score: ', gs.score(X_test, y_test)),
):
    print(caption, round(score, 4))