# Model Metrics

In cyberbullying text detection, we should optimize for sensitivity (recall). We want to maximize the number of true positives predicted and minimize the number of false negatives. This reduces the risk of a cyberbullying comment going undetected.

In [1]:
#Imports
import pandas as pd
import numpy as np

import sys
sys.path.append('../eda_cleaning/')
from eda_functions import split_data

from sklearn.metrics import recall_score
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

## Aggression

In [2]:
# Load the cleaned aggression dataset and preview its first rows
aggression_csv = '../clean data/aggression_clean_data.csv'
aggression_df = pd.read_csv(aggression_csv)
aggression_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
0,37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
1,44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
2,49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
3,89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
4,93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [3]:
# Keep only the id, text and label columns, exposing the label as 'target'.
# The rename runs first and the selection uses the new name, with no
# inplace=True, so the cell can be re-run safely: rename ignores a missing
# 'label' column by default, making a second run a no-op instead of a KeyError.
aggression_df = (
    aggression_df
    .rename(columns={'label': 'target'})
    [['rev_id', 'comment', 'target']]
)

In [4]:
# Custom train/test split from eda_functions; pct_positive=0.5 balances the
# classes in the training data only
splits = split_data(aggression_df, pct_positive=0.5, random_state=42)
X_train, X_test, y_train, y_test = splits

In [5]:
# Side-by-side class proportions for the train and test targets,
# with each split's row count shown in its column header
train_col = f'Train (n={y_train.shape[0]})'
test_col = f'Test (n={y_test.shape[0]})'

train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({train_col: train_share, test_col: test_share})

Unnamed: 0,Train (n=23824),Test (n=34647)
0,0.5,0.853147
1,0.5,0.146853


### XGBoost

In [6]:
# CountVectorizer settings selected by the XGBoost gridsearch.
# NOTE(review): in token_pattern the second alternative ([A-Z]\w+) looks
# unreachable because \w+ already matches any such token — confirm before
# simplifying.
xg_cvec_params = {
    'max_df': 0.95,
    'max_features': 5000,
    'min_df': 3,
    'ngram_range': (1, 1),
    'stop_words': 'english',
    'strip_accents': 'ascii',
    'token_pattern': '\\w+|[A-Z]\\w+',
}

# Learn the vocabulary from the training comments only
cvec = CountVectorizer(**xg_cvec_params).fit(X_train)

# Encode both splits with that vocabulary
X_train_cvec_xg = cvec.transform(X_train)
X_test_cvec_xg = cvec.transform(X_test)

In [7]:
# XGBoost classifier configured with the gridsearch-selected hyperparameters
xg = XGBClassifier(
    n_estimators=200,
    colsample_bytree=0.75,
)

In [8]:
# Train XGBoost on the count-vectorized training comments. fit() is the
# cell's last expression, so the notebook displays the fitted estimator's repr.
xg.fit(X_train_cvec_xg, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
# Recall of the XGBoost predictions on the test comments
xg_test_pred = xg.predict(X_test_cvec_xg)
recall_score(y_test, xg_test_pred)

0.7696540880503144

### Naive Bayes

In [10]:
# Re-create the train/test split with the same arguments used for XGBoost
splits = split_data(aggression_df, pct_positive=0.5, random_state=42)
X_train, X_test, y_train, y_test = splits

# CountVectorizer settings selected by the Naive Bayes gridsearch;
# the vocabulary is learned from the training comments only
mnb_cvec_params = {'ngram_range': (1, 1), 'stop_words': None}
cvec = CountVectorizer(**mnb_cvec_params).fit(X_train)

# Encode both splits with that vocabulary
X_train_cvec_mnb = cvec.transform(X_train)
X_test_cvec_mnb = cvec.transform(X_test)

In [11]:
# Instantiate a multinomial Naive Bayes classifier; no arguments are passed,
# so the library defaults apply
mnb = MultinomialNB()

In [12]:
# Train Naive Bayes on the count-vectorized training comments. fit() is the
# cell's last expression, so the notebook displays the fitted estimator's repr.
mnb.fit(X_train_cvec_mnb, y_train)

MultinomialNB()

In [13]:
# Recall of the Naive Bayes predictions on the test comments.
# NOTE(review): the recorded output for this cell matches the XGBoost recall
# to every digit — confirm it is not a stale output before reporting it.
mnb_test_pred = mnb.predict(X_test_cvec_mnb)
recall_score(y_test, mnb_test_pred)

0.7696540880503144

### Logistic Regression

In [14]:
# Re-create the train/test split with the same arguments used for the other models
splits = split_data(aggression_df, pct_positive=0.5, random_state=42)
X_train, X_test, y_train, y_test = splits

# CountVectorizer settings selected by the Logistic Regression gridsearch
lr_cvec_params = {
    'stop_words': None,
    'max_df': 0.95,
    'max_features': 15000,
    'min_df': 3,
    'ngram_range': (1, 1),
}

# Learn the vocabulary from the training comments only
cvec = CountVectorizer(**lr_cvec_params).fit(X_train)

# Encode both splits with that vocabulary
X_train_cvec_lr = cvec.transform(X_train)
X_test_cvec_lr = cvec.transform(X_test)

In [15]:
# Instantiate cross-validated Logistic Regression; no arguments are passed,
# so the library defaults apply
lr = LogisticRegressionCV()

In [16]:
# Train Logistic Regression on the count-vectorized training comments. fit()
# is the cell's last expression, so the notebook displays the estimator's repr.
lr.fit(X_train_cvec_lr, y_train)

LogisticRegressionCV()

In [17]:
# Recall of the Logistic Regression predictions on the test comments
lr_test_pred = lr.predict(X_test_cvec_lr)
recall_score(y_test, lr_test_pred)

0.8294025157232704

### SVC

In [18]:
# Re-create the train/test split with the same arguments used for the other models
splits = split_data(aggression_df, pct_positive=0.5, random_state=42)
X_train, X_test, y_train, y_test = splits

# CountVectorizer settings selected by the SVC gridsearch
svc_cvec_params = {
    'max_features': 4000,
    'ngram_range': (1, 1),
    'stop_words': 'english',
}

# Learn the vocabulary from the training comments only
cvec = CountVectorizer(**svc_cvec_params).fit(X_train)

# Encode both splits with that vocabulary
X_train_cvec_svc = cvec.transform(X_train)
X_test_cvec_svc = cvec.transform(X_test)

In [19]:
# Instantiate a support vector classifier; no arguments are passed,
# so the library defaults apply
svc = SVC()

In [20]:
# Train the SVC on the count-vectorized training comments. fit() is the
# cell's last expression, so the notebook displays the fitted estimator's repr.
svc.fit(X_train_cvec_svc, y_train)

SVC()

In [21]:
# Recall of the SVC predictions on the test comments
svc_test_pred = svc.predict(X_test_cvec_svc)
recall_score(y_test, svc_test_pred)

0.6035770440251572