In [28]:
import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import seaborn as sns
import re

In [24]:
# Load the two tab-separated tables. The comment table uses its first column
# as the index; the annotation table keeps the default integer index.
comments = pd.read_csv(
    './Dataset/attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv(
    './Dataset/attack_annotations.tsv',
    sep='\t',
)

In [25]:
# Number of distinct rev_id values present in the annotation table.
annotations['rev_id'].unique().shape[0]

115864

In [26]:
# One boolean label per rev_id: True when the mean of the 'attack' column
# over that revision's annotation rows is strictly greater than 0.5.
labels = (
    annotations
    .groupby(by='rev_id')['attack']
    .mean()
) > 0.5
labels.describe()

count     115864
unique         2
top        False
freq      102274
Name: attack, dtype: object

In [27]:
# Attach the per-revision label as a new 'attack' column; pandas aligns
# `labels` to `comments` on the index (rev_id) when adding the column.
comments = comments.assign(attack=labels)

In [37]:
def preprocess(data):
    """Return `data` with markup placeholders and separators turned into single spaces.

    The substitutions run strictly in this order, each replacing its matches
    with one space:

    1. the literal ``NEWLINE_TOKEN`` placeholder,
    2. the literal ``TAB_TOKEN`` placeholder,
    3. any run of ``=`` and/or ``_`` characters,
    4. any run of space characters (collapsing the gaps left by steps 1-3).

    Order matters: the placeholders contain ``_`` themselves, so they have to
    be removed before step 3 would split them apart.

    Parameters
    ----------
    data : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    ordered_patterns = ("NEWLINE_TOKEN", "TAB_TOKEN", "[=_]+", " +")
    cleaned = data
    for pattern in ordered_patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned

In [38]:
# Run every comment through `preprocess`, overwriting the raw text column.
comments['comment'] = comments['comment'].map(preprocess)

In [39]:
# Spot-check the cleaned text: first five rows of the comment column.
comments['comment'].head(n=5)

rev_id
37675    `- This is not ``creative``. Those are the dic...
44816    ` :: the term ``standard model`` is itself les...
49851     True or false, the situation as of March 2002...
89320     Next, maybe you could work on being less cond...
93890                 This page will need disambiguation. 
Name: comment, dtype: object

In [40]:
# Spot-check the positive class: first rows where the 'attack' column is True.
comments.query('attack').loc[:, 'comment'].head()

rev_id
801279                          Iraq is not good USA is bad 
2702703     fuck off you little asshole. If you want to t...
4632658         i have a dick, its bigger than yours! hahaha
6545332     renault you sad little bpy for driving a rena...
6545351     renault you sad little bo for driving a renau...
Name: comment, dtype: object

In [41]:
# Partition rows using the dataset's own 'split' column; rows with any other
# split value end up in neither frame.
train_comments = comments[comments['split'] == 'train']
test_comments = comments[comments['split'] == 'test']

In [42]:
# Text classification pipeline, applied in order:
#   vect  - token counts over unigrams and bigrams, vocabulary capped at 10000
#   tfidf - TF-IDF re-weighting of those counts with L2 row normalisation
#   clf   - logistic regression with scikit-learn's default settings
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', LogisticRegression()),
])

In [43]:
# Fit the whole pipeline on the training split. The result is re-bound to
# `clf`, which also keeps the cell from echoing the estimator repr.
clf = clf.fit(
    X=train_comments['comment'],
    y=train_comments['attack'],
)

In [44]:
# ROC AUC on the held-out split. Column 1 of predict_proba is the score for
# the second entry of clf.classes_ (presumably True, given the boolean labels).
auc = roc_auc_score(
    y_true=test_comments['attack'],
    y_score=clf.predict_proba(test_comments['comment'])[:, 1],
)
print('Test ROC AUC: %.3f' % auc)

Test ROC AUC: 0.957


In [52]:
#!pip3 install joblib
import joblib

In [54]:
# Persist the fitted pipeline to disk (path is relative to the working directory).
filename = 'finalized_model.pkl'
joblib.dump(value=clf, filename=filename)

['finalized_model.pkl']

In [56]:
# Round-trip check: reload the persisted pipeline and score it on the test split.
# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# files you produced yourself or otherwise trust.
loaded_model = joblib.load(filename)

# Pipeline.score delegates to the final estimator's `score`; for
# LogisticRegression that is mean accuracy, so this number is NOT comparable to
# the ROC AUC reported earlier. The labels are heavily imbalanced (see the
# labels.describe() output), so accuracy should be read against the
# majority-class baseline.
result = loaded_model.score(test_comments['comment'], test_comments['attack'])

# An assignment produces no cell output; end with the bare name so the score
# is actually displayed when the notebook is re-run from a fresh kernel.
result

0.94063335921994995