In [5]:
import sys, os 
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
def load_data(data_dir="data"):
    """
    Load the train and test splits from the CSV files in `data_dir`.

    Reads ``train.csv``, ``test.csv`` and ``test_labels.csv``. The test
    comments are joined to their labels on ``id`` and a test row is kept
    only when every label column differs from -1, so no -1 placeholder can
    end up in the returned test labels.

    Parameters
    ----------
    data_dir : str, optional
        Directory that holds the three CSV files. Defaults to ``"data"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``. The X frames carry the
        ``id`` and ``comment_text`` columns, the y frames carry the six
        label columns. The test frames share a fresh 0..n-1 index.
    """
    feature_cols = ['id', 'comment_text']
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    X_train = train[feature_cols]
    y_train = train[label_cols]

    test_comments = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test_labels = pd.read_csv(os.path.join(data_dir, "test_labels.csv"))
    test = test_comments.merge(test_labels, on='id')

    # Require *all* labels to be valid. Combining the per-column checks with
    # OR would keep partially labelled rows and let -1 show up as a class
    # value in y_test.
    fully_labelled = (test[label_cols] != -1).all(axis=1)
    test = test[fully_labelled].reset_index(drop=True)

    X_test = test[feature_cols]
    y_test = test[label_cols]

    return X_train, y_train, X_test, y_test

In [7]:
# Tunable settings shared by the cells below.

# Text features: upper bound of the n-gram range given to TfidfVectorizer.
ngrams = 1

# Keyword arguments forwarded under the same names to LogisticRegressionCV.
Cs = 10
cv = 5
penalty = 'l2'
scoring = 'accuracy'
max_iter = 1000

In [8]:
# Fix NumPy's global random seed, then load the train / test frames.
np.random.seed(42)
X_train, y_train, X_test, y_test = load_data()

In [9]:
# Single TF-IDF vectorizer shared by `tfidf_transform`; it extracts n-grams
# of size 1 up to `ngrams`.
vectorizer = TfidfVectorizer(ngram_range=(1, ngrams))

In [10]:
def tfidf_transform(corpus, is_train=False):
    """
    Vectorize `corpus` with the module-level `vectorizer`.

    When `is_train` is truthy the vectorizer is fitted on `corpus` as part
    of the call (`fit_transform`); otherwise the corpus is only passed
    through `transform`. Returns whatever the chosen method returns.
    """
    if not is_train:
        return vectorizer.transform(corpus)
    return vectorizer.fit_transform(corpus)

In [11]:
def print_results(model, X, y):
    """
    Score `model` on (X, y), print the metrics, and return them.

    Predictions come from `model.predict(X)`. The precision, recall,
    F-score, support and accuracy values are printed first, followed by
    the classification report.

    Returns the tuple (precision, recall, fbeta_score, support, accuracy).
    """
    y_pred = model.predict(X)

    prec, rec, f_score, supp = precision_recall_fscore_support(y, y_pred)
    acc = accuracy_score(y, y_pred)
    metrics = (prec, rec, f_score, supp, acc)

    template = "Precision: {}\nRecall: {}\nF-Score: {}\nSupport: {}\nAccuracy {}\n"
    print(template.format(*metrics))

    print(classification_report(y, y_pred))

    return metrics

In [12]:
# The vectorizer is fed plain Python lists of comment strings.

# Training comments: fit the vectorizer and transform in one step.
train_corpus = X_train['comment_text'].values.tolist()
X_train_tfidf = tfidf_transform(train_corpus, is_train=True)

# Test comments: transform only, using the vectorizer fitted above.
test_corpus = X_test['comment_text'].values.tolist()
X_test_tfidf = tfidf_transform(test_corpus)

In [13]:
def run_model():
    """
    Fit and evaluate one LogisticRegressionCV classifier per label.

    For each of the six label columns a new model is fitted on
    `X_train_tfidf` against that column of `y_train`, and `print_results`
    then reports its metrics on the training data and on the test data.
    Reads the module-level matrices, label frames and hyper-parameter
    constants; returns nothing.
    """
    for target in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
        print("Train model on the \"{}\" label.\n".format(target))
        train_target = y_train[target].values.tolist()
        test_target = y_test[target].values.tolist()

        estimator_params = dict(
            class_weight="balanced",
            Cs=Cs,
            cv=cv,
            penalty=penalty,
            scoring=scoring,
            max_iter=max_iter,
        )
        clf = LogisticRegressionCV(**estimator_params)
        clf.fit(X_train_tfidf, train_target)

        # Same report for both splits: training data first, then test data.
        splits = (
            ("Train Results\n", X_train_tfidf, train_target),
            ("Test Results\n", X_test_tfidf, test_target),
        )
        for heading, features, expected in splits:
            print(heading)
            print_results(clf, features, expected)

In [14]:
# Fit and evaluate one classifier per label; metrics are printed below.
run_model()

Train model on the "toxic" label.





Train Results

Precision: [0.99978171 0.86943891]
Recall: [0.98411389 0.99797306]
F-Score: [0.99188594 0.92928247]
Support: [144277  15294]
Accuracy 0.9854422169441815

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    144277
           1       0.87      1.00      0.93     15294

   micro avg       0.99      0.99      0.99    159571
   macro avg       0.93      0.99      0.96    159571
weighted avg       0.99      0.99      0.99    159571

Example predictions: 
[0 0 0 ... 0 0 0]
Test Results

Precision: [0.98559356 0.48064953]
Recall: [0.90054934 0.87487685]
F-Score: [0.94115417 0.62043668]
Support: [57888  6090]
Accuracy 0.8981055987995874

              precision    recall  f1-score   support

           0       0.99      0.90      0.94     57888
           1       0.48      0.87      0.62      6090

   micro avg       0.90      0.90      0.90     63978
   macro avg       0.73      0.89      0.78     63978
weighted avg       0.94   



Train Results

Precision: [0.99999362 0.56464754]
Recall: [0.99222034 0.99937304]
F-Score: [0.99609181 0.72159348]
Support: [157976   1595]
Accuracy 0.9922918324758258

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    157976
           1       0.56      1.00      0.72      1595

   micro avg       0.99      0.99      0.99    159571
   macro avg       0.78      1.00      0.86    159571
weighted avg       1.00      0.99      0.99    159571

Example predictions: 
[0 0 0 ... 0 0 0]
Test Results

Precision: [0.99852706 0.18115942]
Recall: [0.98045935 0.7493188 ]
F-Score: [0.98941073 0.29177719]
Support: [63611   367]
Accuracy 0.9791334521241677

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     63611
           1       0.18      0.75      0.29       367

   micro avg       0.98      0.98      0.98     63978
   macro avg       0.59      0.86      0.64     63978
weighted avg       0.99   



Train Results

Precision: [0.99996    0.88297427]
Recall: [0.99259539 0.99928986]
F-Score: [0.99626408 0.93753817]
Support: [151122   8449]
Accuracy 0.99294984677667

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    151122
           1       0.88      1.00      0.94      8449

   micro avg       0.99      0.99      0.99    159571
   macro avg       0.94      1.00      0.97    159571
weighted avg       0.99      0.99      0.99    159571

Example predictions: 
[0 0 0 ... 0 0 0]
Test Results

Precision: [0.98870182 0.53663717]
Recall: [0.95657439 0.8214576 ]
F-Score: [0.9723728  0.64918103]
Support: [60287  3691]
Accuracy 0.9487792678733314

              precision    recall  f1-score   support

           0       0.99      0.96      0.97     60287
           1       0.54      0.82      0.65      3691

   micro avg       0.95      0.95      0.95     63978
   macro avg       0.76      0.89      0.81     63978
weighted avg       0.96     



Train Results

Precision: [0.99996659 0.79434914]
Recall: [0.98656506 0.99936524]
F-Score: [0.99322062 0.88514083]
Support: [151694   7877]
Accuracy 0.9871969217464326

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    151694
           1       0.79      1.00      0.89      7877

   micro avg       0.99      0.99      0.99    159571
   macro avg       0.90      0.99      0.94    159571
weighted avg       0.99      0.99      0.99    159571

Example predictions: 
[0 0 0 ... 0 0 0]
Test Results

Precision: [0.9861626  0.48097776]
Recall: [0.95336163 0.76364167]
F-Score: [0.96948475 0.590212  ]
Support: [60551  3427]
Accuracy 0.9431992247335022

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     60551
           1       0.48      0.76      0.59      3427

   micro avg       0.94      0.94      0.94     63978
   macro avg       0.73      0.86      0.78     63978
weighted avg       0.96   



Train Results

Precision: [0.9999873  0.65560748]
Recall: [0.99534034 0.99857651]
F-Score: [0.99765841 0.79153738]
Support: [158166   1405]
Accuracy 0.9953688326826303

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    158166
           1       0.66      1.00      0.79      1405

   micro avg       1.00      1.00      1.00    159571
   macro avg       0.83      1.00      0.89    159571
weighted avg       1.00      1.00      1.00    159571

Example predictions: 
[0 0 0 ... 0 0 0]
Test Results

Precision: [0.99624433 0.33926031]
Recall: [0.98531597 0.66994382]
F-Score: [0.99075001 0.45042493]
Support: [63266   712]
Accuracy 0.9818062458970271

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     63266
           1       0.34      0.67      0.45       712

   micro avg       0.98      0.98      0.98     63978
   macro avg       0.67      0.83      0.72     63978
weighted avg       0.99   

0.7999999999999999