In [4]:
#import dependencies
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
#read datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
subm = pd.read_csv('sample_submission.csv')


In [6]:
#view training dataset
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
#create a list of all the labels to predict
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
#none for comment with no label
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
#length of training and testing models
len(train),len(test)

(159571, 153164)

In [10]:
#discard empty comments
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [11]:
#building the model
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

In [12]:
#using tfidf
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [13]:
trn_term_doc, test_term_doc

(<159571x426005 sparse matrix of type '<class 'numpy.float64'>'
 	with 17775104 stored elements in Compressed Sparse Row format>,
 <153164x426005 sparse matrix of type '<class 'numpy.float64'>'
 	with 14765755 stored elements in Compressed Sparse Row format>)

In [14]:
#naive bayes feature equation
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)
x = trn_term_doc
test_x = test_term_doc

In [15]:
#Fit a model for one dependent at a time
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [16]:
#predict
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [17]:
#result csv file
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission.csv', index=False)

In [7]:
train.tail(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159551,ffbc2db4225258dd,While about half the references are from BYU-I...,0,0,0,0,0,0
159552,ffbcd64a71775e04,Prague Spring \n\nI think that Prague Spring d...,0,0,0,0,0,0
159553,ffbd331a3aa269b9,I see this as having been merged; undoing one ...,0,0,0,0,0,0
159554,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0
159555,ffc2f409658571f1,"""\n\nHow come when you download that MP3 it's ...",0,0,0,0,0,0
159556,ffc671f2acdd80e1,"I'll be on IRC, too, if you have a more specif...",0,0,0,0,0,0
159557,ffc7bbb177c3c966,It is my opinion that that happens to be off-t...,0,0,0,0,0,0
159558,ffca1e81aefc48ac,Please stop removing content from Wikipedia; i...,0,0,0,0,0,0
159559,ffca8d71d71a3fae,Image:Barack-obama-mother.jpg listed for delet...,0,0,0,0,0,0
159560,ffcdcb71854f6d8a,"""Editing of article without Consensus & Remova...",0,0,0,0,0,0


In [8]:
test.tail(20)

Unnamed: 0,id,comment_text
153144,fff7159b3ee95618,"== Your name mentioned == \n Hi, I just though..."
153145,fff718ffe5f05559,I've just discovered yet another list: List of...
153146,fff7fc22a0cdccd3,==Wikiproject Video Games assessment== \n I do...
153147,fff83b80284d8440,::Consensus for ruining Wikipedia? I think tha...
153148,fff8ef316d0c6990,== DAP ? == \n\n What's point with DAP ?! Naz...
153149,fff8f521a7dbcd47,shut down the mexican border withought looking...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ..."
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...
153152,fff9fa508f400ee6,==WTF== \n It's no longer a redlink. Now what...
153153,fffa3fae1890b40a,""" \n\n ==""""Illness"""" no shit== \n Just for the..."


In [9]:
res = pd.read_csv('submission.csv')

In [10]:
res.tail(20)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
153144,fff7159b3ee95618,0.023022,0.000687,0.005124,0.000133,0.008803,0.000382
153145,fff718ffe5f05559,0.003207,0.000265,0.001317,0.000113,0.002126,0.000598
153146,fff7fc22a0cdccd3,0.00219,0.000294,0.001755,0.000188,0.001309,0.00032
153147,fff83b80284d8440,0.004067,0.003003,0.002162,6.7e-05,0.008432,0.000171
153148,fff8ef316d0c6990,0.013927,0.00066,0.00718,0.000113,0.004878,0.000443
153149,fff8f521a7dbcd47,0.246353,0.002702,0.014607,0.000542,0.040689,0.001924
153150,fff8f64043129fa2,0.004216,0.000238,0.001459,0.000124,0.000763,0.000241
153151,fff9d70fe0722906,0.38476,0.001617,0.071984,0.000148,0.030128,0.00076
153152,fff9fa508f400ee6,0.152463,0.000678,0.05508,0.000162,0.003397,0.000379
153153,fffa3fae1890b40a,0.951063,0.005803,0.355226,0.002306,0.023371,0.000638
