In [1]:
## DATA MANIPULATION
import numpy as np
import pandas as pd
import pickle

## TF-IDF VECTORIZER
from sklearn.feature_extraction.text import TfidfVectorizer

## CLASSIFICATION
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

### Load preprocessed data

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a file produced by this project's own preprocessing step.
# The context manager guarantees the file handle is closed after loading.
with open('../data/preprocessed.pkl','rb') as f:
    train, valid = pickle.load(f)

# Every column from the third one onward is treated as a label column
# (presumably the first two hold the id and the comment text — confirm against
# the preprocessing notebook).
labels = train.columns[2:]
ys_train = train[labels]
ys_valid = valid[labels]

## COMBINE TOXIC CATEGORIES
# Sum the label columns per row and cap at 1, giving a single target that is
# 1 when any label is set and 0 otherwise (assuming 0/1 label columns).
y_train = ys_train.sum(axis=1).clip(upper=1)
y_valid = ys_valid.sum(axis=1).clip(upper=1)

### TFIDF

In [3]:
# Word-level unigram + bigram TF-IDF features.
#   min_df=3   -> ignore terms that appear in fewer than 3 documents
#   max_df=0.9 -> ignore terms that appear in more than 90% of documents
# The three flags below are documented as booleans; recent scikit-learn
# releases validate parameter types and reject the integer 1, so pass True.
tfidf = TfidfVectorizer(ngram_range=(1,2),
                        min_df=3,
                        max_df=0.9,
                        strip_accents='unicode',
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True)

# Learn the vocabulary and IDF weights on the training split only, then reuse
# them for the validation split so the two matrices share the same columns.
X_train = tfidf.fit_transform(train['comment_text'])      # vectorized train x
X_valid = tfidf.transform(valid['comment_text'])          # vectorized valid x

### NB-SVM

In [4]:
# Empty metrics table; each model evaluated below contributes one row.
result_columns = ['Label', 'Accuracy', 'Recall', 'Precision', 'F1', 'Vectorizer', 'model']
results = pd.DataFrame(columns=result_columns)

## NB-SVM MODEL
def NB_SVM(x,y):
    """Fit a classifier on features scaled by Naive-Bayes log-count ratios.

    Despite the name, the classifier fitted here is a class-balanced
    LogisticRegression (liblinear solver), not an SVM.

    Parameters
    ----------
    x : feature matrix supporting boolean row indexing, ``.sum(axis=0)`` and
        ``.multiply`` (assumed to be a scipy sparse matrix such as the TF-IDF
        output — confirm for other inputs).
    y : object exposing ``.values`` (e.g. a pandas Series) holding 0/1 targets.

    Returns
    -------
    (model, r) : the fitted LogisticRegression and the per-feature log ratio
        ``r``. The same ``r`` must be applied with ``.multiply(r)`` to any
        matrix passed to the model for prediction.
    """
    targets = y.values
    is_pos = targets == 1
    is_neg = targets == 0

    # Per-feature sums with add-one smoothing, divided by the class size:
    # an estimate of p(feature | class) for each class.
    pos_ratio = (x[is_pos].sum(axis=0) + 1) / is_pos.sum()
    neg_ratio = (x[is_neg].sum(axis=0) + 1) / is_neg.sum()

    # Log ratio of the two estimates; used to rescale every feature column.
    log_ratio = np.log(pos_ratio / neg_ratio)
    x_scaled = x.multiply(log_ratio)

    # 'balanced' weights each class by n_samples / (n_classes * np.bincount(y)).
    classifier = LogisticRegression(solver='liblinear', class_weight='balanced')
    classifier.fit(x_scaled, targets)
    return classifier, log_ratio

# Fit the NB-SVM on the training split. `r` is the log-count ratio learned
# from the training data; it must be applied to any matrix fed to `m`.
m,r = NB_SVM(X_train, y_train)
# Scale the validation features with the same ratio before predicting
preds = m.predict(X_valid.multiply(r))
# Evaluate predictions against the combined validation target
acc = accuracy_score(y_valid, preds)
prec = precision_score(y_valid, preds)
recall = recall_score(y_valid, preds)
f1 = f1_score(y_valid, preds)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the new row to the metrics table.
results = pd.concat([results,
                     pd.DataFrame([{'Label': 'Toxic_Combined',
                                    'Accuracy':acc,
                                    'Recall':recall,
                                    'Precision':prec,
                                    'F1':f1,
                                    'Vectorizer':'tfidf',
                                    'model': 'NB_SVM'}])],
                    ignore_index=True)

print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                'Toxic_combined', 
                                acc, 
                                prec, 
                                recall,
                                f1))

Results for Toxic_combined comments: Accuracy - 0.96; Precision - 0.79; Recall - 0.83; F1 - 0.81


In [5]:
# Class probabilities from the NB-SVM model on the ratio-scaled validation
# features; only the second column of predict_proba is exported.
X_valid_scaled = X_valid.multiply(r)
preds_proba = m.predict_proba(X_valid_scaled)
preds_df = pd.DataFrame({'tfidf_nbsvm': preds_proba[:,1]})
# The default index column is written too (no index=False here).
preds_df.to_csv('../artifacts/simple/preds/tfidf_nbsvm.csv')

### Loop through models

In [5]:
## CREATE MODELS
# All three estimators compensate for class imbalance: the sklearn models via
# class_weight='balanced', XGBoost via scale_pos_weight = #negatives / #positives.
models = {
    'Logistic Regression': LogisticRegression(solver='saga',class_weight='balanced'),
    'SVM': LinearSVC(class_weight='balanced'),
    'XGBoost': XGBClassifier(n_estimators=100,
                        scale_pos_weight=(y_train==0).sum() / (y_train==1).sum(),
                        n_jobs=-1)}

## LOOP THROUGH MODELS
model_rows = []     # one metrics record per model, added to `results` after the loop
for m_label, model in models.items():
    # Fit a fresh clone so the template estimators in `models` stay unfitted.
    # Named `clf` rather than `m` so the fitted NB-SVM model held in `m` is not
    # overwritten (the NB-SVM probability export cell reads `m`).
    clf = clone(model)

    # Fit model
    clf.fit(X_train, y_train)

    # Get predictions
    preds = clf.predict(X_valid)

    # Evaluate predictions
    acc = accuracy_score(y_valid, preds)
    prec = precision_score(y_valid, preds)
    recall = recall_score(y_valid, preds)
    f1 = f1_score(y_valid, preds)

    # Record the metrics for this model
    model_rows.append({'Label': 'Toxic_Combined',
                       'Accuracy':acc,
                       'Recall':recall,
                       'Precision':prec,
                       'F1':f1,
                       'Vectorizer':'tfidf',
                       'model': m_label})

    # print results
    print('{0} Results for {1} comments: Accuracy - {2:.2f}; Precision - {3:.2f}; Recall - {4:.2f}; F1 - {5:.2f}'.format(
                                        m_label,
                                        'Toxic_Combined', 
                                        acc, 
                                        prec, 
                                        recall,
                                        f1))

# Save results to dataframe. DataFrame.append was removed in pandas 2.0, so the
# collected rows are added with a single pd.concat.
results = pd.concat([results, pd.DataFrame(model_rows)], ignore_index=True)

Logistic Regression Results for Toxic_Combined comments: Accuracy - 0.95; Precision - 0.72; Recall - 0.84; F1 - 0.77
SVM Results for Toxic_Combined comments: Accuracy - 0.96; Precision - 0.82; Recall - 0.79; F1 - 0.80
XGBoost Results for Toxic_Combined comments: Accuracy - 0.94; Precision - 0.66; Recall - 0.82; F1 - 0.73


In [6]:
## SAVE RESULTS
# Write the metrics table to CSV without the DataFrame index column.
results.to_csv('../artifacts/simple/tfidf.csv', index=False)

In [7]:
# Display the accumulated metrics table.
results

Unnamed: 0,Label,Accuracy,Recall,Precision,F1,Vectorizer,model
0,Toxic_Combined,0.962533,0.771497,0.849072,0.808428,tfidf,NB_SVM
1,Toxic_Combined,0.949562,0.844144,0.715071,0.774265,tfidf,Logistic Regression
2,Toxic_Combined,0.960577,0.791698,0.817764,0.80452,tfidf,SVM
3,Toxic_Combined,0.937428,0.821905,0.655193,0.729141,tfidf,XGBoost
