In [1]:
## STANDARD LIBRARY
import os
import pickle

## DATA MANIPULATION
import numpy as np
import pandas as pd

## TF-IDF VECTORIZER
from sklearn.feature_extraction.text import TfidfVectorizer

## CLASSIFICATION
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

### LOAD PREPROCESSED DATA

In [2]:
# Load the (train, valid) pair written by the preprocessing step.
# NOTE(security): pickle.load can execute arbitrary code, so only open files
# produced by this project's own pipeline, never untrusted ones.
# The context manager closes the handle even if unpickling fails.
with open('../data/preprocessed.pkl', 'rb') as f:
    train, valid = pickle.load(f)

### CREATE BAG OF WORDS (TF-IDF) TRANSFORMER

In [3]:
# Word-level TF-IDF over unigrams and bigrams.
# The three flags are passed as real booleans: recent scikit-learn releases
# validate constructor parameters and reject the integer 1 for boolean options.
tfidf = TfidfVectorizer(ngram_range=(1,2),          # unigrams + bigrams
                        min_df=3,                   # drop terms seen in fewer than 3 documents
                        max_df=0.9,                 # drop terms seen in more than 90% of documents
                        strip_accents='unicode',
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True)          # use 1 + log(tf) instead of raw tf

# Fit the vocabulary / idf weights on the training split only, then reuse them
# for validation so no validation statistics leak into the features.
X_train = tfidf.fit_transform(train['comment_text'])      # vectorized train x
X_valid = tfidf.transform(valid['comment_text'])          # vectorized valid x

In [4]:
# Empty metrics table; the evaluation cells below add one row per (model, label) pair.
results = pd.DataFrame(
    columns=[
        'Label',
        'Accuracy',
        'Recall',
        'Precision',
        'F1',
        'Vectorizer',
        'model',
    ]
)

### STRONG BASELINE (LINEAR MODEL): NB-SVM
- based on Wang & Manning, 2012 (https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf)
- An SVM model (liblinear) trained on features that have been scaled by the Naive Bayes log-count ratio

In [5]:
## NB-SVM MODEL
def NB_SVM(x,y):
    """Fit a linear SVM on features scaled by a Naive Bayes log-count ratio.

    For every feature, the add-one-smoothed sum of its values over the class-1
    rows (and, separately, over the class-0 rows) is divided by the number of
    rows in that class. ``r`` is the log of the ratio of those two quantities.
    ``x`` is multiplied element-wise by ``r`` and a class-weight-balanced
    ``LinearSVC`` is fitted on the result.

    Parameters
    ----------
    x : sparse matrix
        Feature matrix with one row per sample. It must support boolean row
        indexing, ``.sum(axis=0)`` and ``.multiply``.
    y : array-like of 0/1 labels
        One label per row of ``x`` (a pandas Series or any array-like).

    Returns
    -------
    m : LinearSVC
        The fitted model.
    r : matrix of shape (1, n_features)
        The log-count ratio. New data must be transformed with
        ``X.multiply(r)`` before calling ``m.predict``.

    Raises
    ------
    ValueError
        If either class has no samples, because the ratio is undefined then.
    """
    y = np.asarray(y)                        # accept Series, lists and arrays alike
    is_pos = y == 1
    is_neg = y == 0

    n_pos = is_pos.sum()
    n_neg = is_neg.sum()
    if n_pos == 0 or n_neg == 0:
        # Without this guard the division below yields inf/nan features and the
        # failure only surfaces later, inside LinearSVC.fit.
        raise ValueError(
            'NB_SVM needs at least one sample of each class '
            '(got {0} positive, {1} negative)'.format(n_pos, n_neg))

    sum_1 = x[is_pos].sum(axis=0) + 1        # Feature sum for class 1 (add-one smoothing)
    p_1 = sum_1 / n_pos                      # Per-sample feature mass in class 1 - p(f|1)

    sum_0 = x[is_neg].sum(axis=0) + 1        # Feature sum for class 0 (add-one smoothing)
    p_0 = sum_0 / n_neg                      # Per-sample feature mass in class 0 - p(f|0)

    r = np.log(p_1 / p_0)                    # Log-count ratio (the transformation matrix)
    x_nb = x.multiply(r)                     # NB-scaled features

    m = LinearSVC(class_weight='balanced')   # Balanced class weights [n_samples / (n_classes * np.bincount(y))]
    m.fit(x_nb, y)                           # Fit model
    return m, r                              # fitted model & transformation matrix (needed for X_valid / X_test)

# Every column from the third onwards is treated as a binary target.
# NOTE(review): assumes the first two columns are non-label columns (e.g. an id
# and 'comment_text') - confirm against the preprocessing step.
labels = train.columns[2:]                  # Grab labels

## CREATE A MODEL FOR EACH CATEGORY
# Rows are collected in a list and turned into a frame once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
nb_svm_rows = []
for label in labels:
    # Get model and transformation matrix for category
    m,r = NB_SVM(X_train, train[label])
    # Get predictions (validation features need the same NB scaling as training)
    preds = m.predict(X_valid.multiply(r))
    # Evaluate predictions
    acc, prec, recall, f1 = (accuracy_score(valid[label], preds), 
                             precision_score(valid[label], preds), 
                             recall_score(valid[label], preds), 
                             f1_score(valid[label], preds))
    
    nb_svm_rows.append({'Label': label,
                        'Accuracy':acc,
                        'Recall':recall,
                        'Precision':prec,
                        'F1':f1,
                        'Vectorizer':'tfidf',
                        'model': 'NB_SVM'})
    
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                    label, 
                                    acc, 
                                    prec, 
                                    recall,
                                    f1))

# Add the new rows to the shared results table, keeping its column order.
# When the table is still empty the new frame is used directly, so no empty
# frame takes part in the concatenation.
nb_svm_results = pd.DataFrame(nb_svm_rows, columns=results.columns)
if results.empty:
    results = nb_svm_results
else:
    results = pd.concat([results, nb_svm_results], ignore_index=True)

Results for toxic comments: Accuracy - 0.96; Precision - 0.83; Recall - 0.75; F1 - 0.79
Results for severe_toxic comments: Accuracy - 0.99; Precision - 0.40; Recall - 0.34; F1 - 0.37
Results for obscene comments: Accuracy - 0.98; Precision - 0.84; Recall - 0.76; F1 - 0.80
Results for threat comments: Accuracy - 1.00; Precision - 0.68; Recall - 0.38; F1 - 0.48
Results for insult comments: Accuracy - 0.97; Precision - 0.75; Recall - 0.65; F1 - 0.70
Results for identity_hate comments: Accuracy - 0.99; Precision - 0.55; Recall - 0.31; F1 - 0.40


### Loop through Logistic Regression, SVM, XGBoost

In [6]:
## CREATE MODELS
# The scikit-learn entries are templates that get cloned for every label.
# The XGBoost entry is only a placeholder: that estimator is built inside the
# loop because scale_pos_weight depends on the class balance of each label.
models = {
    'Logistic Regression': LogisticRegression(solver='saga',class_weight='balanced'),
    'SVM': LinearSVC(class_weight='balanced'),
    'XGBoost': ['see below']}
                      
## LOOP THROUGH MODELS
# Rows are collected in a list and turned into a frame once after the loops:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
model_rows = []
for m_label, model in models.items():
    for label in labels:
        if m_label == 'XGBoost':
            # Weight positives by the negative/positive ratio of this label
            n_neg = (train[label] == 0).sum()
            n_pos = (train[label] == 1).sum()
            model = XGBClassifier(n_estimators=100,
                        scale_pos_weight= float(n_neg / n_pos),
                        n_jobs=-1)
        
        # clone base model (re-initialize weights)
        m = clone(model) 

        # Fit model
        m.fit(X_train, train[label])
        
        # Get predictions
        preds = m.predict(X_valid)

        # Evaluate predictions
        acc, prec, recall, f1 = (accuracy_score(valid[label], preds), 
                                precision_score(valid[label], preds), 
                                recall_score(valid[label], preds), 
                                f1_score(valid[label], preds))
        
        # Save results for this (model, label) pair
        model_rows.append({'Label': label,
                           'Accuracy':acc,
                           'Recall':recall,
                           'Precision':prec,
                           'F1':f1,
                           'Vectorizer':'tfidf',
                           'model': m_label})
        
        # print results
        print('{0} Results for {1} comments: Accuracy - {2:.2f}; Precision - {3:.2f}; Recall - {4:.2f}; F1 - {5:.2f}'.format(
                                        m_label,
                                        label, 
                                        acc, 
                                        prec, 
                                        recall,
                                        f1))

# Add the new rows to the shared results table, keeping its column order.
# When the table is still empty the new frame is used directly, so no empty
# frame takes part in the concatenation.
model_results = pd.DataFrame(model_rows, columns=results.columns)
if results.empty:
    results = model_results
else:
    results = pd.concat([results, model_results], ignore_index=True)

Logistic Regression Results for toxic comments: Accuracy - 0.95; Precision - 0.71; Recall - 0.83; F1 - 0.77
Logistic Regression Results for severe_toxic comments: Accuracy - 0.92; Precision - 0.11; Recall - 0.98; F1 - 0.20
Logistic Regression Results for obscene comments: Accuracy - 0.98; Precision - 0.92; Recall - 0.60; F1 - 0.72
Logistic Regression Results for threat comments: Accuracy - 1.00; Precision - 0.00; Recall - 0.00; F1 - 0.00
Logistic Regression Results for insult comments: Accuracy - 0.97; Precision - 0.83; Recall - 0.49; F1 - 0.62
Logistic Regression Results for identity_hate comments: Accuracy - 0.88; Precision - 0.07; Recall - 0.96; F1 - 0.13
SVM Results for toxic comments: Accuracy - 0.96; Precision - 0.79; Recall - 0.78; F1 - 0.79
SVM Results for severe_toxic comments: Accuracy - 0.99; Precision - 0.39; Recall - 0.52; F1 - 0.45
SVM Results for obscene comments: Accuracy - 0.98; Precision - 0.82; Recall - 0.81; F1 - 0.82
SVM Results for threat comments: Accuracy - 1.00

In [7]:
# Preview the first ten rows of the metrics table.
results.iloc[:10]

Unnamed: 0,Label,Accuracy,Recall,Precision,F1,Vectorizer,model
0,toxic,0.960804,0.751525,0.826661,0.787304,tfidf,NB_SVM
1,severe_toxic,0.988207,0.342205,0.395604,0.366972,tfidf,NB_SVM
2,obscene,0.97932,0.759096,0.840767,0.797847,tfidf,NB_SVM
3,threat,0.997683,0.375,0.678571,0.483051,tfidf,NB_SVM
4,insult,0.971515,0.653424,0.747296,0.697214,tfidf,NB_SVM
5,identity_hate,0.991606,0.314225,0.554307,0.401084,tfidf,NB_SVM
6,toxic,0.951442,0.832382,0.71277,0.767946,tfidf,Logistic Regression
7,severe_toxic,0.923546,0.977186,0.113516,0.203403,tfidf,Logistic Regression
8,obscene,0.975446,0.596609,0.917935,0.723186,tfidf,Logistic Regression
9,threat,0.997095,0.0,0.0,0.0,tfidf,Logistic Regression


In [8]:
## SAVE RESULTS
# Create the output directory first so a missing folder cannot discard the
# metrics after the long training cells above have finished.
os.makedirs('../artifacts', exist_ok=True)
# index=False keeps the positional row index out of the file.
results.to_csv('../artifacts/tfidf.csv', index=False)