# Loading the data

In [1]:
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
import sklearn

# Directory holding the input CSV. The trailing slash matters: the file path
# is built by plain string concatenation (data_dir + filename).
data_dir='./dataset/'

In [2]:
# Load the processed comments table and preview its first two rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index -- consider index_col=0 if it is unwanted.
df = pd.read_csv(data_dir+'df_processed.csv')
df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,label_sum,comment_lower,has_apostrophe,has_new_line,com_processed
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,1,1,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,1,0,aww matches background colour seemingly stuck ...


In [3]:
# Label columns of `df`; the feature-preparation helpers read this list to
# build the y_train / y_test target matrices.
target_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def prepare_TFIDF(df, col, test_size=0.2, random_state=8848, label_cols=None):
    """Split ``df`` into train/test parts and build TF-IDF features from ``col``.

    The vectorizer is fitted on the training split only; the test split is
    transformed with the vocabulary and IDF weights learned from training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the text column ``col`` and the label columns.
    col : str
        Name of the text column to vectorize.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to
        ``train_test_split``).
    random_state : int, default 8848
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    label_cols : list of str, optional
        Columns used as targets. Defaults to the notebook-level
        ``target_columns`` list.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, vectorizer)`` where the ``X``
        values are the vectorizer outputs for each split, the ``y`` values are
        the label columns as arrays, and ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    if label_cols is None:
        label_cols = target_columns

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit and vectorize the training text in a single pass. Previously the
    # fit_transform() result was thrown away and the training text was
    # transformed a second time, doing the same work twice.
    X_train = vectorizer.fit_transform(df_train[col].values)
    X_test = vectorizer.transform(df_test[col].values)

    y_train = df_train[label_cols].values
    y_test = df_test[label_cols].values

    return X_train, X_test, y_train, y_test, vectorizer

# Build the TF-IDF train/test matrices from the 'com_processed' text column.
X_train_tf, X_test_tf, y_train_tf, y_test_tf, vectorizer= prepare_TFIDF(df, "com_processed")

In [5]:
# Read .shape straight from the sparse matrix: calling .toarray() first would
# allocate a dense (n_samples, n_features) array just to look at its dimensions.
print("X_train Shape: ", X_train_tf.shape)
print("y_train Shape: ", y_train_tf.shape)

X_train Shape:  (127608, 147278)
y_train Shape:  (127608, 6)


In [6]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import average_precision_score, roc_auc_score

In [12]:
def LR_model(X_train, y_train, X_test, multi=False):
    """Fit a multi-label logistic-regression model and predict test labels.

    Parameters
    ----------
    X_train, y_train
        Training features and label matrix passed to ``fit``.
    X_test
        Features to predict on.
    multi : bool, default False
        If True, wrap the base estimator in ``MultiOutputClassifier``;
        otherwise wrap it in ``OneVsRestClassifier``.

    Returns
    -------
    tuple
        ``(model, y_pred_labs)``: the fitted wrapper and the output of
        ``model.predict(X_test)``.
    """
    # Only the wrapper class differs between the two modes; the base estimator
    # and the fit/predict sequence are the same.
    wrapper_cls = MultiOutputClassifier if multi else OneVsRestClassifier
    base_estimator = LogisticRegression(penalty='l2', C=1.0, max_iter=500)

    model = wrapper_cls(base_estimator)
    model.fit(X_train, y_train)

    return model, model.predict(X_test)

In [13]:
# Default mode (multi=False): OneVsRestClassifier around logistic regression.
model_LR, y_pred_labs =  LR_model(X_train_tf, y_train_tf, X_test_tf)

In [14]:
def print_evaluation_scores(y_val, predicted, scores=None):
    """Print accuracy, micro-F1, micro average precision and ROC-AUC as percentages.

    Parameters
    ----------
    y_val
        Ground-truth labels.
    predicted
        Hard label predictions; used for accuracy and F1.
    scores : optional
        Continuous per-label scores (e.g. probabilities or decision values)
        with the same layout as ``y_val``. Average precision and ROC-AUC are
        ranking metrics and are only meaningful when computed from such
        scores. When omitted, ``predicted`` is used instead, which keeps the
        previous output but evaluates a single operating point only.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Ranking metrics prefer continuous scores; fall back to the hard labels
    # so existing two-argument calls behave exactly as before.
    ranking_input = predicted if scores is None else scores

    acc        = 100*accuracy_score(y_val, predicted)
    f1_micro   = 100*f1_score(y_val, predicted, average='micro')
    avg_prec_micro = 100*average_precision_score(y_val, ranking_input, average='micro')
    roc_auc    = 100*roc_auc_score(y_val, ranking_input)

    print ("Accuracy \t= %1.2f \nF1_micro\t= %1.2f" %(acc, f1_micro))
    print ("Avg Prec micro\t= %1.2f \nROC_AUC score\t= %1.2f" % ( avg_prec_micro, roc_auc))

In [15]:
print ("Logistic Regression with TFIDF features")
# NOTE(review): y_pred_labs holds hard label predictions from model.predict, so
# the average-precision and ROC-AUC figures are computed from labels, not scores.
print_evaluation_scores(y_test_tf, y_pred_labs)

Logistic Regression with TFIDF features
Accuracy 	= 92.01 
F1_micro	= 68.00
Avg Prec micro	= 50.55 
ROC_AUC score	= 68.61


In [16]:
# Same base estimator, but wrapped in MultiOutputClassifier (multi=True).
model_LR_multi, y_pred_labs_multi =  LR_model(X_train_tf, y_train_tf, X_test_tf, multi=True)

In [17]:
print ("Logistic Regression with TFIDF features (using MultiOutputClassifier)")
# Evaluate the MultiOutputClassifier predictions; the original call passed
# y_pred_labs and so re-scored the earlier OneVsRest model instead.
print_evaluation_scores(y_test_tf, y_pred_labs_multi)

Logistic Regression with TFIDF features (using MultiOutputClassifier)
Accuracy 	= 92.01 
F1_micro	= 68.00
Avg Prec micro	= 50.55 
ROC_AUC score	= 68.61


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def prepare_BOW(df, col, test_size=0.2, random_state=8848, label_cols=None):
    """Split ``df`` into train/test parts and build bag-of-words counts from ``col``.

    The vectorizer is fitted on the training split only; the test split is
    transformed with the vocabulary learned from training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the text column ``col`` and the label columns.
    col : str
        Name of the text column to vectorize.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to
        ``train_test_split``).
    random_state : int, default 8848
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    label_cols : list of str, optional
        Columns used as targets. Defaults to the notebook-level
        ``target_columns`` list.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, vectorizer)`` where the ``X``
        values are the vectorizer outputs for each split, the ``y`` values are
        the label columns as arrays, and ``vectorizer`` is the fitted
        ``CountVectorizer``.
    """
    if label_cols is None:
        label_cols = target_columns

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    vectorizer = CountVectorizer()

    # Fit and vectorize the training text in a single pass. Previously the
    # fit_transform() result was thrown away and the training text was
    # transformed a second time, doing the same work twice.
    X_train = vectorizer.fit_transform(df_train[col].values)
    X_test = vectorizer.transform(df_test[col].values)

    y_train = df_train[label_cols].values
    y_test = df_test[label_cols].values

    return X_train, X_test, y_train, y_test, vectorizer

# Build the bag-of-words train/test matrices from the 'com_processed' text column.
X_train_bow, X_test_bow, y_train_bow, y_test_bow, count_vectorizer= prepare_BOW(df, "com_processed")


In [None]:
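# Disabled: training on the BOW features did not complete.
# (LR_model returns two values: the fitted model and the predicted labels.)
#model_LR_bow, y_pred_labs_bow =  LR_model(X_train_bow,
#                                          y_train_bow,
#                                          X_test_bow)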

Logistic regression on the BOW data was too expensive to train: it failed, and even after setting `max_iter=1000` the model did not run to completion.

So, use PCA to reduce the data to a lower-dimensional matrix.

In [None]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA


# pca_pipe = Pipeline([('scale',StandardScaler(with_mean=False)),
#                          ('pca',PCA())])

# pca_pipe.fit(X_train_tf)

# #pca = PCA(n_components = 10)
# #pca.fit(X_train_tf)

# from sklearn.decomposition import PCA
# pca = PCA(n_components = n_comp)
# pca.fit(train_tfidf)
# X_train = pca.transform(train_tfidf)
# X_test = pca.transform(test_tfidf)

# pca = PCA(n_components=200)

# pca.fit(X[X.columns[1:]])

# plt.figure(figsize=(10,8))

# plt.plot(range(1,201),
#         np.cumsum(pca.explained_variance_ratio_))

# plt.xlabel("PCA Component", fontsize=16)
# plt.ylabel("Cumulative Variance Explained", fontsize=16)

# plt.show()