In [5]:
import numpy as np
import pandas as pd
import joblib
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [6]:
# Load the labelled training comments from the project's data directory.
df = pd.read_csv('../data/train.csv')

# Only a cell's last expression is rendered, so a bare `df.shape` placed
# before `df.head()` is silently discarded; print it explicitly instead.
print(df.shape)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# Names of the target columns, in the order used for training and for the
# per-label evaluation further down.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Multi-label target frame: one column per toxicity category.
labels = df[label_columns]
labels

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [8]:
# Hold out 20% of the comments for evaluation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['comment_text'],
    labels,
    test_size=0.2,
    random_state=42,
)

In [9]:
# TF-IDF features: drop English stop words and ignore any term that appears
# in more than 70% of the training documents.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged to transform the test split.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [10]:
# Base estimator: a Passive-Aggressive linear classifier with a fixed seed.
pac = PassiveAggressiveClassifier(
    max_iter=100,
    random_state=42,
)

# MultiOutputClassifier fits one copy of the base estimator per target
# column; n_jobs=-1 lets those fits run in parallel.
multi_output_classifier = MultiOutputClassifier(
    pac,
    n_jobs=-1,
)
multi_output_classifier.fit(tfidf_train, y_train)

In [11]:
# Predict every label column for each held-out comment.
y_pred = multi_output_classifier.predict(tfidf_test)

# NOTE: with multilabel targets accuracy_score reports subset accuracy,
# i.e. a comment only counts as correct when all of its labels match.
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 90.54%


In [12]:
# Per-label confusion matrices on the held-out split.
# Rows are the true classes and columns the predicted classes, in the
# order 0 then 1.
y_test_np = y_test.values

# NOTE(review): predict() presumably already returns hard 0/1 labels here,
# which would make this threshold a no-op; it is kept as a safeguard.
y_pred_binary = (y_pred > 0.5).astype(int)

for i, label in enumerate(label_columns):
    # Pin the class order explicitly: without `labels`, a column in which
    # only one class occurs in both truth and prediction would produce a
    # 1x1 matrix instead of the expected 2x2 layout.
    cm = confusion_matrix(y_test_np[:, i], y_pred_binary[:, i], labels=[0, 1])
    print(f'Confusion Matrix for {label}:')
    print(cm)


Confusion Matrix for toxic:
[[28285   574]
 [  920  2136]]
Confusion Matrix for severe_toxic:
[[31464   130]
 [  217   104]]
Confusion Matrix for obscene:
[[29965   235]
 [  512  1203]]
Confusion Matrix for threat:
[[31810    31]
 [   52    22]]
Confusion Matrix for insult:
[[29908   393]
 [  731   883]]
Confusion Matrix for identity_hate:
[[31528    93]
 [  190   104]]


In [16]:
# Export the trained model and the fitted TF-IDF vectorizer.
# Both dumps below are currently disabled (commented out), so running this
# cell writes nothing; uncomment them to persist the artifacts to disk.
# NOTE(review): the vectorizer presumably needs to be shipped together with
# the model, since the classifier was trained on its features -- confirm.

#joblib.dump(multi_output_classifier, 'toxic_comment_model.pkl')
#joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
