# Model Metrics

In cyberbullying text detection, we should optimize for sensitivity (recall). We want to maximize the number of true positives predicted and minimize the number of false negatives. This reduces the risk of a cyberbullying comment going undetected.

In [1]:
#Imports
import pandas as pd
import numpy as np

import sys
sys.path.append('../eda_cleaning/')
from eda_functions import split_data

from sklearn.metrics import recall_score
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Create the results table and a helper that scores one fitted model
recall_df = pd.DataFrame(columns = ['topic', 'model', 'recall_score'])

def add_scores(topic, name, model, X=None, y=None):
    """Return a one-row DataFrame holding a fitted model's recall score.

    Parameters
    ----------
    topic : str
        Value stored in the 'topic' column (e.g. 'aggression').
    name : str
        Value stored in the 'model' column (e.g. 'XGBoost').
    model : fitted estimator
        Any object exposing ``predict``.
    X : optional
        Vectorized features to predict on. When omitted, the notebook-level
        ``X_test_cvec`` that exists at call time is used.
    y : optional
        True labels matching ``X``. When omitted, the notebook-level
        ``y_test`` that exists at call time is used.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'topic', 'model' and 'recall_score'.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working
    if X is None:
        X = X_test_cvec
    if y is None:
        y = y_test

    score = recall_score(y, model.predict(X))

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    return pd.DataFrame(
        [{'topic': topic, 'model': name, 'recall_score': score}],
        columns=['topic', 'model', 'recall_score'])

## Aggression

In [3]:
# Read in cleaned aggression file
aggression_df = pd.read_csv('../clean data/aggression_clean_data.csv')
aggression_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
0,37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
1,44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
2,49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
3,89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
4,93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [4]:
# Keep only the columns needed for modelling and rename the label column to 'target'.
# Selecting and renaming in one chained expression returns a new DataFrame, so the
# rename is never applied in place to a slice of the loaded frame (which can raise
# pandas' SettingWithCopyWarning).
aggression_df = (
    aggression_df[['rev_id', 'comment', 'label']]
    .rename(columns={'label': 'target'})
)

In [5]:
# Apply the custom train test split function to balance the classes in the training data only
X_train, X_test, y_train, y_test = split_data(
    aggression_df,
    pct_positive=0.5,
    random_state=42)

In [6]:
# Show the class proportions of the train and test targets side by side
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({f'Train (n={y_train.shape[0]})': train_share,
              f'Test (n={y_test.shape[0]})': test_share})

Unnamed: 0,Train (n=23824),Test (n=34647)
0,0.5,0.853147
1,0.5,0.146853


### XGBoost

In [7]:
# CountVectorizer configured with the best parameters from the XGBoost gridsearch.
# NOTE(review): in token_pattern the `[A-Z]\w+` alternative can never be chosen,
# because `\w+` is tried first and already matches any token starting with A-Z.
cvec = CountVectorizer(
    max_df=0.95,
    max_features=5000,
    min_df=3,
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='ascii',
    token_pattern='\\w+|[A-Z]\\w+',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [8]:
# Instantiate XGBoost with best parameters found from gridsearch
xg = XGBClassifier(colsample_bytree = 0.75, n_estimators = 200)

In [9]:
# Fit the model
xg.fit(X_train_cvec, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
# Recall score
recall_score(y_test, xg.predict(X_test_cvec))

0.7682783018867925

In [11]:
# Add score
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index keeps
# the results table on a clean 0..n-1 index like the other score cells
recall_df = pd.concat([recall_df, add_scores('aggression', 'XGBoost', xg)], ignore_index=True)

In [12]:
# Preview the one-row DataFrame that add_scores returns for this model.
# The result is only displayed here; it is not stored in recall_df.
add_scores('aggression', 'XGBoost', xg)

Unnamed: 0,topic,model,recall_score
0,aggression,XGBoost,0.768278


### Naive Bayes

In [13]:
# Re-create the aggression split (same arguments and seed as the first split cell)
X_train, X_test, y_train, y_test = split_data(aggression_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Naive Bayes gridsearch
cvec = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [14]:
# Instantiate Naive Bayes
mnb = MultinomialNB()

In [15]:
# Fit the model
mnb.fit(X_train_cvec, y_train)

MultinomialNB()

In [16]:
# Recall score
recall_score(y_test, mnb.predict(X_test_cvec))

0.7635613207547169

In [17]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('aggression', 'Naive Bayes', mnb)], ignore_index=True)

### Logistic Regression

In [18]:
# Re-create the aggression split (same arguments and seed as the first split cell)
X_train, X_test, y_train, y_test = split_data(aggression_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Logistic Regression gridsearch
cvec = CountVectorizer(
    stop_words=None,
    max_df=0.95,
    max_features=15000,
    min_df=3,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [19]:
# Instantiate Logistic Regression
lr = LogisticRegressionCV()

In [20]:
# Fit the model
lr.fit(X_train_cvec, y_train)

LogisticRegressionCV()

In [21]:
# Recall score
recall_score(y_test, lr.predict(X_test_cvec))

0.8294025157232704

In [22]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('aggression', 'Logistic Regression', lr)], ignore_index=True)

### SVC

In [23]:
# Re-create the aggression split (same arguments and seed as the first split cell)
X_train, X_test, y_train, y_test = split_data(aggression_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the SVC gridsearch
cvec = CountVectorizer(
    max_features=4000,
    ngram_range=(1, 1),
    stop_words='english',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [24]:
# Instantiate SVC
svc = SVC()

In [25]:
# Fit the model
svc.fit(X_train_cvec, y_train)

SVC()

In [26]:
# Recall score
recall_score(y_test, svc.predict(X_test_cvec))

0.6191037735849056

In [27]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('aggression', 'SVC', svc)], ignore_index=True)

## Toxicity

In [28]:
# Read in cleaned toxicity file
toxicity_df = pd.read_csv('../clean data/toxicity_cleaned.csv')
toxicity_df.head()

Unnamed: 0,rev_id,comment,toxicity
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [29]:
# Rename the toxicity column to 'target' and preview the result
toxicity_df = toxicity_df.rename(columns={'toxicity': 'target'})
toxicity_df.head()

Unnamed: 0,rev_id,comment,target
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [30]:
# Apply the custom train test split function to balance the classes in the training data only
X_train, X_test, y_train, y_test = split_data(
    toxicity_df,
    pct_positive=0.5,
    random_state=42)

In [31]:
# Show the class proportions of the train and test targets side by side
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({f'Train (n={y_train.shape[0]})': train_share,
              f'Test (n={y_test.shape[0]})': test_share})

Unnamed: 0,Train (n=25624),Test (n=47680)
0,0.5,0.883284
1,0.5,0.116716


### XGBoost

In [32]:
# CountVectorizer configured with the best parameters from the XGBoost gridsearch.
# NOTE(review): in token_pattern the `[A-Z]\w+` alternative can never be chosen,
# because `\w+` is tried first and already matches any token starting with A-Z.
cvec = CountVectorizer(
    max_df=0.95,
    max_features=5000,
    min_df=2,
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='ascii',
    token_pattern='\\w+|[A-Z]\\w+',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [33]:
# Instantiate XGBoost with best parameters found from gridsearch
xg = XGBClassifier(colsample_bytree = 0.7, n_estimators = 250)

In [34]:
# Fit the model
xg.fit(X_train_cvec, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [35]:
# Recall score
recall_score(y_test, xg.predict(X_test_cvec))

0.7960467205750225

In [36]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('toxicity', 'XGBoost', xg)], ignore_index=True)

### Naive Bayes

In [37]:
# Re-create the toxicity split (same arguments and seed as the first toxicity split cell)
X_train, X_test, y_train, y_test = split_data(toxicity_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Naive Bayes gridsearch
cvec = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [38]:
# Instantiate Naive Bayes
mnb = MultinomialNB()

In [39]:
# Fit the model
mnb.fit(X_train_cvec, y_train)

MultinomialNB()

In [40]:
# Recall score
recall_score(y_test, mnb.predict(X_test_cvec))

0.8361185983827494

In [41]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('toxicity', 'Naive Bayes', mnb)], ignore_index=True)

### Logistic Regression

In [42]:
# Re-create the toxicity split (same arguments and seed as the first toxicity split cell)
X_train, X_test, y_train, y_test = split_data(toxicity_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Logistic Regression gridsearch
cvec = CountVectorizer(
    stop_words=None,
    max_df=0.85,
    max_features=13000,
    min_df=2,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [43]:
# Instantiate Logistic Regression
lr = LogisticRegressionCV()

In [44]:
# Fit the model
lr.fit(X_train_cvec, y_train)

LogisticRegressionCV()

In [45]:
# Recall score
recall_score(y_test, lr.predict(X_test_cvec))

0.8612758310871519

In [46]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('toxicity', 'Logistic Regression', lr)], ignore_index=True)

### SVC

In [47]:
# Re-create the toxicity split (same arguments and seed as the first toxicity split cell)
X_train, X_test, y_train, y_test = split_data(toxicity_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the SVC gridsearch
cvec = CountVectorizer(
    max_features=4000,
    ngram_range=(1, 1),
    stop_words='english',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [48]:
# Instantiate SVC
svc = SVC()

In [49]:
# Fit the model
svc.fit(X_train_cvec, y_train)

SVC()

In [50]:
# Recall score
recall_score(y_test, svc.predict(X_test_cvec))

0.7773584905660378

In [51]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('toxicity', 'SVC', svc)], ignore_index=True)

## Attack

In [52]:
# Read in cleaned attack file
attack_df = pd.read_csv('../clean data/attack_clean.csv')
attack_df.head()

Unnamed: 0,rev_id,comment,target
0,37675,This is not creative Those are the dictionar...,0
1,44816,the term standard model is itself less NPOV...,0
2,49851,True or false the situation as of March 2002...,0
3,89320,Next maybe you could work on being less conde...,0
4,93890,This page will need disambiguation,0


In [53]:
# Apply the custom train test split function to balance the classes in the training data only
X_train, X_test, y_train, y_test = split_data(
    attack_df,
    pct_positive=0.5,
    random_state=42)

In [54]:
# Show the class proportions of the train and test targets side by side
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)

pd.DataFrame({f'Train (n={y_train.shape[0]})': train_share,
              f'Test (n={y_test.shape[0]})': test_share})

Unnamed: 0,Train (n=21872),Test (n=34668)
0,0.5,0.866563
1,0.5,0.133437


### XGBoost

In [55]:
# CountVectorizer configured with the best parameters from the XGBoost gridsearch.
# NOTE(review): in token_pattern the `[A-Z]\w+` alternative can never be chosen,
# because `\w+` is tried first and already matches any token starting with A-Z.
cvec = CountVectorizer(
    max_df=0.95,
    max_features=6000,
    min_df=3,
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='ascii',
    token_pattern='\\w+|[A-Z]\\w+',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [56]:
# Instantiate XGBoost with best parameters found from gridsearch
xg = XGBClassifier(colsample_bytree = 0.6, n_estimators = 250)

In [57]:
# Fit the model
xg.fit(X_train_cvec, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [58]:
# Recall score
recall_score(y_test, xg.predict(X_test_cvec))

0.7853437094682231

In [59]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('attack', 'XGBoost', xg)], ignore_index=True)

### Naive Bayes

In [60]:
# Re-create the attack split (same arguments and seed as the first attack split cell)
X_train, X_test, y_train, y_test = split_data(attack_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Naive Bayes gridsearch
cvec = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [61]:
# Instantiate Naive Bayes
mnb = MultinomialNB()

In [62]:
# Fit the model
mnb.fit(X_train_cvec, y_train)

MultinomialNB()

In [63]:
# Recall score
recall_score(y_test, mnb.predict(X_test_cvec))

0.7788586251621271

In [64]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('attack', 'Naive Bayes', mnb)], ignore_index=True)

### Logistic Regression

In [65]:
# Re-create the attack split (same arguments and seed as the first attack split cell)
X_train, X_test, y_train, y_test = split_data(attack_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the Logistic Regression gridsearch
cvec = CountVectorizer(
    stop_words=None,
    max_df=0.8,
    max_features=12000,
    min_df=2,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [66]:
# Instantiate Logistic Regression
lr = LogisticRegressionCV()

In [67]:
# Fit the model
lr.fit(X_train_cvec, y_train)

LogisticRegressionCV()

In [68]:
# Recall score
recall_score(y_test, lr.predict(X_test_cvec))

0.8389537397319499

In [69]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('attack', 'Logistic Regression', lr)], ignore_index=True)

### SVC

In [70]:
# Re-create the attack split (same arguments and seed as the first attack split cell)
X_train, X_test, y_train, y_test = split_data(attack_df, pct_positive=0.5, random_state=42)

# CountVectorizer configured with the best parameters from the SVC gridsearch
cvec = CountVectorizer(
    max_features=6000,
    min_df=2,
    stop_words='english',
)

# Learn the vocabulary from the training text only, then apply it to the test text
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [71]:
# Instantiate SVC
svc = SVC()

In [72]:
# Fit the model
svc.fit(X_train_cvec, y_train)

SVC()

In [73]:
# Recall score
recall_score(y_test, svc.predict(X_test_cvec))

0.6394293125810635

In [74]:
# Add score
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
recall_df = pd.concat([recall_df, add_scores('attack', 'SVC', svc)], ignore_index=True)

## View All Recall Scores

In [75]:
recall_df

Unnamed: 0,topic,model,recall_score
0,aggression,XGBoost,0.768278
1,aggression,Naive Bayes,0.763561
2,aggression,Logistic Regression,0.829403
3,aggression,SVC,0.619104
4,toxicity,XGBoost,0.796047
5,toxicity,Naive Bayes,0.836119
6,toxicity,Logistic Regression,0.861276
7,toxicity,SVC,0.777358
8,attack,XGBoost,0.785344
9,attack,Naive Bayes,0.778859
