In [1]:
import pandas

In [2]:
train = pandas.read_csv('./dataset/train.csv')
test = pandas.read_csv('./dataset/test.csv')
test_labels = pandas.read_csv('./dataset/test_labels.csv')

print(train.shape)
print(test.shape)
print(test_labels.shape)

(159571, 8)
(153164, 2)
(153164, 7)


In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [6]:
train[train['toxic'] == 1]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
44,001956c382006abd,I'm Sorry \n\nI'm sorry I screwed around with ...,1,0,0,0,0,0
51,001dc38a83d420cf,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
56,0020fd96ed3b8c8b,=Tony Sidaway is obviously a fistfuckee. He lo...,1,0,1,0,1,0
58,0021fe88bc4da3e6,My Band Page's deletion. You thought I was gon...,1,0,1,0,0,0


In [7]:
train['comment_text'][0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

## Imports

In [8]:
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import numpy as np
import pandas as pd
import time

## Constants

In [9]:
token_regex = r'[a-zA-Z0-9]+'

In [25]:
def training(X, classifier):
    test_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    roc_aoc_scores = []
    
    for eval_col in test_cols:
#         print("FIT ", eval_col)
        y = train[eval_col]
        X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.3, random_state=101)

        clf = classifier().fit(X_train, y_train)
        predicted= clf.predict_proba(X_test)[:,1]

        roc_aoc_scores.append(metrics.roc_auc_score(y_test, predicted))
    
    print("Score :" , np.mean(np.array(roc_aoc_scores)))

In [19]:
token = RegexpTokenizer(token_regex)
cv = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
text_cv = cv.fit_transform(train['comment_text'])

In [13]:
class LemmaTokenizer(object):
    def __init__(self, token_regex):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = RegexpTokenizer(token_regex).tokenize
    def tokenize(self, articles):
#         print(articles)
#         print(token.tokenize(articles))
        return [self.wnl.lemmatize(t) for t in self.tokenizer(articles)]

lemma = LemmaTokenizer(token_regex)

In [14]:
class LemmaPOSTokenizer(object):
    def __init__(self, token_regex):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = RegexpTokenizer(token_regex).tokenize
        
        self.tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }
        
    def tokenize(self, articles):
#         print(articles)
#         print(self.tokenizer(articles))
#         print(pos_tag(self.tokenizer(articles)))
#         a = [self.wnl.lemmatize(t,  get_wordnet_pos(t)) for t in self.tokenizer(articles)]
#         print(a)
#         return a
        return [self.wnl.lemmatize(t[0], self.tag_dict.get(t[1][0], wordnet.NOUN) ) for t in pos_tag(self.tokenizer(articles))]

lemma_pos = LemmaPOSTokenizer(token_regex)

In [20]:
cv_lemma_pos = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma_pos.tokenize)

start = time.time()
text_cv_lemma_pos = cv_lemma_pos.fit_transform(train['comment_text'])
end = time.time()
print (end - start)

  'stop_words.' % sorted(inconsistent))


525.1412253379822


In [21]:
cv_lemma = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma.tokenize)

start = time.time()
text_cv_lemma =cv_lemma.fit_transform(train['comment_text'])
end = time.time()
print (end - start)

  'stop_words.' % sorted(inconsistent))


49.207603931427


In [22]:
text_cv

<159571x227490 sparse matrix of type '<class 'numpy.int64'>'
	with 4748387 stored elements in Compressed Sparse Row format>

In [23]:
text_cv_lemma

<159571x219000 sparse matrix of type '<class 'numpy.int64'>'
	with 4727202 stored elements in Compressed Sparse Row format>

In [24]:
text_cv_lemma_pos

<159571x216623 sparse matrix of type '<class 'numpy.int64'>'
	with 4551942 stored elements in Compressed Sparse Row format>

In [27]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
tf = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)

text_tf = tf.fit_transform(train['comment_text'])

In [28]:
tf_lemma_pos = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma_pos.tokenize)

start = time.time()
text_tf_lemma_pos = tf_lemma_pos.fit_transform(train['comment_text'])
end = time.time()
print (end - start)

  'stop_words.' % sorted(inconsistent))


560.5757603645325


In [29]:
tf_lemma = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma.tokenize)

start = time.time()
text_tf_lemma =tf_lemma.fit_transform(train['comment_text'])
end = time.time()
print (end - start)

  'stop_words.' % sorted(inconsistent))


46.17615747451782


In [30]:
text_tf

<159571x227490 sparse matrix of type '<class 'numpy.float64'>'
	with 4748387 stored elements in Compressed Sparse Row format>

In [32]:
text_tf_lemma

<159571x219000 sparse matrix of type '<class 'numpy.int64'>'
	with 4727202 stored elements in Compressed Sparse Row format>

In [33]:
text_tf_lemma_pos

<159571x216623 sparse matrix of type '<class 'numpy.int64'>'
	with 4551942 stored elements in Compressed Sparse Row format>

In [37]:
def classifierMNB():
    return MultinomialNB()

def classifierLSVC():
    return SVC(probability = True)

def classifierLR():
    return LogisticRegression(C=4, dual=True ,solver='liblinear')


In [36]:
print("#1")
training(text_cv, classifierMNB)
training(text_cv, classifierLR)

print("#2")
training(text_cv_lemma, classifierMNB)
training(text_cv_lemma, classifierLR)

print("#3")
training(text_cv_lemma_pos, classifierMNB)
training(text_cv_lemma_pos, classifierLR)

print("#4")
training(text_tf, classifierMNB)
training(text_tf, classifierLR)

print("#5")
training(text_tf_lemma, classifierMNB)
training(text_tf_lemma, classifierLR)

print("#6")
training(text_tf_lemma_pos, classifierMNB)
training(text_tf_lemma_pos, classifierLR)

#1
Score : 0.8624565578633603




Score : 0.939653879355482
#2
Score : 0.8616465869550839




Score : 0.9386275720347369
#3
Score : 0.8593732824253507




Score : 0.9409338925164135
#4
Score : 0.844086677142248
Score : 0.9749997779762191
#5
Score : 0.8616465869550839




Score : 0.9388777762250657
#6
Score : 0.8593732824253507




Score : 0.9409502994387643




In [None]:
print("#1")
training(text_cv, classifierLSVC)

# print("#2")
# training(text_cv_lemma, classifierLSVC)

# print("#3")
# training(text_cv_lemma_pos, classifierLSVC)

# print("#4")
# training(text_tf, classifierLSVC)

# print("#5")
# training(text_tf_lemma, classifierLSVC)

# print("#6")
# training(text_tf_lemma_pos, classifierLSVC)

#1




In [69]:
def create_submission(X, TEST ,classifier):
    test_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    roc_aoc_scores = []
    df = pd.DataFrame()
    df['id'] = test['id']
    
    for eval_col in test_cols:
        print("FIT ", eval_col)
        y = train[eval_col]

        clf = classifier().fit(X, y)
        
        predicted= clf.predict_proba(TEST)[:,1]
        df[eval_col] = predicted

    return df

def calc_score(test_submission):
    test_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    roc_aoc_scores = []
    selection = test_labels['toxic'] != -1
    
    test_dataset = test_labels[selection]
    predicted_dataset = test_submission[selection]
    
    for eval_col in test_cols:
        roc_aoc_scores.append(metrics.roc_auc_score(test_dataset[eval_col], predicted_dataset[eval_col]))
    
    print(roc_aoc_scores)
    print("Score :" , np.mean(np.array(roc_aoc_scores)))

In [45]:
df = create_submission(text_counts_tf, text_counts_tf_test, classifierLR)

FIT  toxic
FIT  severe_toxic
FIT  obscene
FIT  threat
FIT  insult
FIT  identity_hate


In [46]:
df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999436,0.155168,0.998686,0.028091,0.943338,0.243038
1,0000247867823ef7,0.00446,0.001217,0.001015,0.000466,0.004197,0.001587
2,00013b17ad220c46,0.03648,0.004995,0.016637,0.001604,0.023716,0.006046
3,00017563c3f7919a,0.001371,0.000849,0.001366,0.001147,0.001324,0.000147
4,00017695ad8997eb,0.014229,0.001501,0.00453,0.001244,0.005928,0.001523


In [48]:
df.to_csv('submission_tfidf_lr.csv', index=False)

In [70]:
calc_score(df)

[0.9590566668686306, 0.9835660481836188, 0.9736424523539231, 0.9839533173088607, 0.9651417711882404, 0.9733629579691526]
Score : 0.9731205356454043
