## Imports

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import numpy as np
import pandas as pd
import time

In [6]:
# Load the preprocessed train/test comments and the test labels file.
train = pd.read_csv('./preprocessed/train.csv')
test = pd.read_csv('./preprocessed/test.csv')
test_labels = pd.read_csv('./dataset/test_labels.csv')

# Print each frame's (rows, columns) as a quick check on the load.
for frame in (train, test, test_labels):
    print(frame.shape)

(119989, 10)
(153164, 4)
(153164, 7)


In [7]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_lemma,comment_stem
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,Explanation Why the edits make under my userna...,explan whi the edit made under my usernam hard...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww ! He match this background colour I 'm s...,d'aww ! He match thi background colour I 'm se...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"Hey man , I 'm really not try to edit war . It...","hey man , I 'm realli not tri to edit war . It..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,`` More I ca n't make any real suggestion on i...,`` more I ca n't make ani real suggest on impr...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"You , sir , be my hero . Any chance you rememb...","you , sir , are my hero . ani chanc you rememb..."


In [8]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,comment_text,comment_lemma,comment_stem
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,Yo bitch Ja Rule be more succesful then you 'l...,Yo bitch Ja rule is more succes then you 'll e...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,"== From RfC == The title be fine as it be , IMO .","== from rfc == the titl is fine as it is , imo ."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",`` == Sources == * Zawe Ashton on Lapland — / ``,`` == sourc == * zaw ashton on lapland — / ``
3,00017563c3f7919a,":If you have a look back at the source, the in...",": If you have a look back at the source , the ...",": If you have a look back at the sourc , the i..."
4,00017695ad8997eb,I don't anonymously edit articles at all.,I do n't anonymously edit article at all .,I do n't anonym edit articl at all .


In [9]:
# Show the first five rows whose `toxic` label is 1.
is_toxic = train['toxic'] == 1
train.loc[is_toxic].head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_lemma,comment_stem
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,cocksuck befor you piss around ON MY work
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,Hey ... what be it.. @ | talk . What be it ......,hey ... what is it.. @ | talk . what is it ......
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0,"Bye ! Do n't look , come or think of comming b...","bye ! Do n't look , come or think of com back ..."
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,You be gay or antisemmitian ? Archangel WHite ...,you are gay or antisemmitian ? archangel white...
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,"FUCK YOUR FILTHY MOTHER IN THE ASS , DRY !","fuck your filthi mother IN the ass , dri !"


In [10]:
# Raw text of the comment stored under index label 0.
train.loc[0, 'comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

## Constants

In [8]:
# Token pattern handed to the tokenizer classes in this notebook:
# one or more ASCII letters or digits.
token_regex = r'[a-zA-Z0-9]+'

In [32]:
def training(X, classifier, labels=None, target_cols=None, test_size=0.3, random_state=101):
    """Fit one classifier per label on a hold-out split and report the mean ROC AUC.

    Parameters
    ----------
    X : feature matrix whose rows line up with the rows of ``labels``.
    classifier : zero-argument callable returning a fresh, unfitted estimator
        that implements ``fit`` and ``predict_proba``.
    labels : optional DataFrame holding the label columns; defaults to the
        notebook-level ``train`` frame.
    target_cols : optional list of label column names to evaluate; defaults to
        the six toxicity labels.
    test_size : fraction of rows held out for scoring (default 0.3).
    random_state : seed for the split (default 101), so every label is scored
        on the same rows.

    Returns
    -------
    (clf, avg_score) : ``clf`` is the estimator fitted for the LAST label in
        ``target_cols`` only; ``avg_score`` is the mean ROC AUC over all labels.

    Raises
    ------
    ValueError : if ``target_cols`` is empty.
    """
    if labels is None:
        labels = train  # fall back to the notebook-level training frame
    if target_cols is None:
        target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    if len(target_cols) == 0:
        raise ValueError("target_cols must name at least one label column")

    roc_auc_scores = []
    clf = None

    for eval_col in target_cols:
        y = labels[eval_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        clf = classifier().fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the positive class.
        predicted = clf.predict_proba(X_test)[:, 1]

        roc_auc_scores.append(metrics.roc_auc_score(y_test, predicted))

    avg_score = np.mean(np.array(roc_auc_scores))
    print("Score :" , avg_score)
    return clf, avg_score

In [21]:
class Tokenizer(object):
    """Thin wrapper around NLTK's ``RegexpTokenizer`` for a given pattern."""

    def __init__(self, token_regex):
        # Keep only the bound ``tokenize`` method of the NLTK tokenizer.
        regex_tokenizer = RegexpTokenizer(token_regex)
        self.tokenizer = regex_tokenizer.tokenize

    def tokenize(self, articles):
        """Return the tokens the regex tokenizer produces for ``articles``."""
        tokens = self.tokenizer(articles)
        return tokens


simple = Tokenizer(token_regex)

In [20]:
class PorterStemTokenizer(object):
    """Regex tokenizer that passes every token through a Porter stemmer."""

    def __init__(self, token_regex):
        self.tokenizer = RegexpTokenizer(token_regex).tokenize
        self.stemmer = PorterStemmer()

    def tokenize(self, articles):
        """Return the stemmed tokens of ``articles`` in their original order."""
        stems = []
        for raw_token in self.tokenizer(articles):
            stems.append(self.stemmer.stem(raw_token))
        return stems


stem = PorterStemTokenizer(token_regex)

In [13]:
class LemmaTokenizer(object):
    """Regex tokenizer that lemmatizes every token with WordNet defaults."""

    def __init__(self, token_regex):
        self.tokenizer = RegexpTokenizer(token_regex).tokenize
        self.wnl = WordNetLemmatizer()

    def tokenize(self, articles):
        """Return the lemmatized tokens of ``articles`` in their original order."""
        raw_tokens = self.tokenizer(articles)
        # ``lemmatize`` is called with its default part-of-speech argument.
        return list(map(self.wnl.lemmatize, raw_tokens))


lemma = LemmaTokenizer(token_regex)

In [14]:
class LemmaPOSTokenizer(object):
    """Regex tokenizer that lemmatizes each token using its POS tag."""

    def __init__(self, token_regex):
        self.tokenizer = RegexpTokenizer(token_regex).tokenize
        self.wnl = WordNetLemmatizer()

        # First letter of the tag returned by ``pos_tag`` -> WordNet POS constant.
        self.tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

    def tokenize(self, articles):
        """Return POS-aware lemmas of ``articles`` in their original order.

        Tags whose first letter is not in ``tag_dict`` are lemmatized as nouns.
        """
        lemmas = []
        for word, tag in pos_tag(self.tokenizer(articles)):
            wordnet_pos = self.tag_dict.get(tag[0], wordnet.NOUN)
            lemmas.append(self.wnl.lemmatize(word, wordnet_pos))
        return lemmas


lemma_pos = LemmaPOSTokenizer(token_regex)

In [20]:
# Count features over POS-aware lemmas; prints the wall-clock seconds of the fit.
cv_lemma_pos = CountVectorizer(
    tokenizer=lemma_pos.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=False,
)

start = time.time()
text_cv_lemma_pos = cv_lemma_pos.fit_transform(train['comment_text'])
end = time.time()
print(end - start)

In [21]:
# Count features over default-POS lemmas; prints the wall-clock seconds of the fit.
cv_lemma = CountVectorizer(
    tokenizer=lemma.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=False,
)

start = time.time()
text_cv_lemma = cv_lemma.fit_transform(train['comment_text'])
end = time.time()
print(end - start)

In [47]:
# Text column that the vectorizer cells using `training_text` are fitted on.
training_text = train['comment_text']
# Alternative kept for quick iteration: only the first 1000 comments.
# training_text = train['comment_text'][:1000]

In [63]:
# With Regex Tokenizer
# TF-IDF over unigrams and bigrams, capped at 80000 features.
tf = TfidfVectorizer(
    tokenizer=simple.tokenize,
    strip_accents='unicode',
    ngram_range=(1, 2),
    max_features=80000,
)
text_tf = tf.fit_transform(training_text)

In [54]:
def clf():
    """Build an unfitted logistic regression (C=2, lbfgs solver)."""
    model = LogisticRegression(solver='lbfgs', C=2)
    return model


trained_clf, score = training(text_tf, clf)

Score : 0.9769499633715956


In [50]:
# Display the repr of the count feature matrix.
# NOTE(review): within this file `text_cv` is first assigned in a later cell, so this
# cell presumably relies on out-of-order execution — confirm on a fresh kernel.
text_cv

<159571x12364848 sparse matrix of type '<class 'numpy.int64'>'
	with 21162991 stored elements in Compressed Sparse Row format>

In [65]:
# Display the repr of the TF-IDF feature matrix.
text_tf

<159571x80000 sparse matrix of type '<class 'numpy.float64'>'
	with 13169732 stored elements in Compressed Sparse Row format>

In [66]:
def clf():
    """Build an unfitted logistic regression (C=4, lbfgs, up to 1000 iterations)."""
    model = LogisticRegression(
        solver='lbfgs',
        C=4,
        max_iter=1000,
    )
    return model


trained_clf, score = training(text_tf, clf)

Score : 0.9790874467858717


In [64]:
def clf():
    """Build an unfitted logistic regression (C=4, lbfgs, up to 500 iterations)."""
    settings = dict(C=4, solver='lbfgs', max_iter=500)
    return LogisticRegression(**settings)


trained_clf, score = training(text_tf, clf)

Score : 0.9790874467858717


In [59]:
def clf():
    """Return a fresh LogisticRegression with C=4, lbfgs solver and max_iter=500."""
    estimator = LogisticRegression(max_iter=500, solver='lbfgs', C=4)
    return estimator


trained_clf, score = training(text_tf, clf)

Score : 0.978680153436004


In [34]:
# With Regex Tokenizer
# Case-preserving count features, scored with the logistic-regression factory.
cv = CountVectorizer(
    tokenizer=simple.tokenize,
    stop_words='english',
    lowercase=False,
)
text_cv = cv.fit_transform(training_text)

training(text_cv, classifierLR)

<159571x227490 sparse matrix of type '<class 'numpy.int64'>'
	with 4748387 stored elements in Compressed Sparse Row format>

In [None]:
# With Porter Tokenizer
# Count features over Porter-stemmed tokens (the `stem` tokenizer), scored with
# the logistic-regression factory.
cv = CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = stem.tokenize)
text_cv = cv.fit_transform(training_text)

training(text_cv, classifierLR)

## Count and TF-IDF Vectorizers

In [22]:
# With Regex Tokenizer
# Count features built directly on an NLTK RegexpTokenizer for `token_regex`.
token = RegexpTokenizer(token_regex)

cv = CountVectorizer(
    tokenizer=token.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=False,
)
text_cv = cv.fit_transform(training_text)


In [23]:
# Unigram TF-IDF features over regex tokens (case preserved).
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
tf = TfidfVectorizer(
    tokenizer=token.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=False,
)

text_tf = tf.fit_transform(train['comment_text'])

In [24]:
# TF-IDF features over POS-aware lemmas; prints the wall-clock seconds of the fit.
# TfidfVectorizer is used so this matrix differs from the count-based
# `text_cv_lemma_pos` built with the same tokenizer settings.
tf_lemma_pos = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma_pos.tokenize)

start = time.time()
text_tf_lemma_pos = tf_lemma_pos.fit_transform(train['comment_text'])
end = time.time()
print(end - start)

In [27]:
# Unigram TF-IDF over regex tokens, fitted on the raw comment text.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
tfidf_options = dict(lowercase=False, stop_words='english', ngram_range=(1, 1))
tf = TfidfVectorizer(tokenizer=token.tokenize, **tfidf_options)

text_tf = tf.fit_transform(train['comment_text'])

In [28]:
# TF-IDF features over POS-aware lemmas; prints the wall-clock seconds of the fit.
# TfidfVectorizer is used so this matrix differs from the count-based
# `text_cv_lemma_pos` built with the same tokenizer settings.
tf_lemma_pos = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma_pos.tokenize)

start = time.time()
text_tf_lemma_pos = tf_lemma_pos.fit_transform(train['comment_text'])
end = time.time()
print(end - start)

In [29]:
# TF-IDF features over default-POS lemmas; prints the wall-clock seconds of the fit.
# TfidfVectorizer is used so this matrix differs from the count-based
# `text_cv_lemma` built with the same tokenizer settings.
tf_lemma = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = lemma.tokenize)

start = time.time()
text_tf_lemma = tf_lemma.fit_transform(train['comment_text'])
end = time.time()
print(end - start)

In [30]:
# Display the repr of the regex-token TF-IDF matrix.
text_tf

In [32]:
# Display the repr of the lemma-based feature matrix.
text_tf_lemma

In [33]:
# Display the repr of the POS-aware lemma feature matrix.
text_tf_lemma_pos

In [25]:
def classifierMNB():
    """Factory: return a fresh multinomial naive Bayes model with default settings."""
    model = MultinomialNB()
    return model

def classifierLSVC():
    """Factory: return a fresh ``SVC`` with probability estimates enabled.

    NOTE(review): the name suggests LinearSVC, but this builds ``SVC`` with its
    default kernel — confirm which one is intended.
    """
    model = SVC(probability=True)
    return model

def classifierLR():
    """Factory: return a fresh liblinear logistic regression (C=4, dual form)."""
    model = LogisticRegression(solver='liblinear', dual=True, C=4)
    return model


In [36]:
# Score naive Bayes and logistic regression on every feature matrix.
# Run numbers follow the list order: #1-#3 are the text_cv* matrices,
# #4-#6 the text_tf* matrices. `training` prints the mean ROC AUC for each fit.
feature_sets = [
    text_cv,
    text_cv_lemma,
    text_cv_lemma_pos,
    text_tf,
    text_tf_lemma,
    text_tf_lemma_pos,
]

for run_no, features in enumerate(feature_sets, start=1):
    print("#%d" % run_no)
    for make_classifier in (classifierMNB, classifierLR):
        training(features, make_classifier)

In [None]:
# Score the SVC factory on the `text_cv` features only.
# The runs for the other feature matrices are left commented out below.
print("#1")
training(text_cv, classifierLSVC)

# print("#2")
# training(text_cv_lemma, classifierLSVC)

# print("#3")
# training(text_cv_lemma_pos, classifierLSVC)

# print("#4")
# training(text_tf, classifierLSVC)

# print("#5")
# training(text_tf_lemma, classifierLSVC)

# print("#6")
# training(text_tf_lemma_pos, classifierLSVC)

In [69]:
def create_submission(X, TEST, classifier, labels=None, ids=None):
    """Fit one model per label on all of ``X`` and predict probabilities for ``TEST``.

    Parameters
    ----------
    X : feature matrix for the training rows (same row order as ``labels``).
    TEST : feature matrix for the rows to score (same row order as ``ids``).
    classifier : zero-argument callable returning a fresh, unfitted estimator
        that implements ``fit`` and ``predict_proba``.
    labels : optional DataFrame holding the six label columns; defaults to the
        notebook-level ``train`` frame.
    ids : optional row ids for the output; defaults to ``test['id']``.

    Returns
    -------
    pandas.DataFrame with an ``id`` column followed by one column per label,
    each holding column 1 of that label's ``predict_proba`` output.
    """
    test_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    if labels is None:
        labels = train  # fall back to the notebook-level training frame
    if ids is None:
        ids = test['id']  # fall back to the notebook-level test ids

    df = pd.DataFrame()
    df['id'] = ids

    for eval_col in test_cols:
        print("FIT ", eval_col)
        fitted = classifier().fit(X, labels[eval_col])
        df[eval_col] = fitted.predict_proba(TEST)[:, 1]

    return df

def calc_score(test_submission):
    """Print the per-label ROC AUC list and its mean for a submission frame.

    The submission is compared against the notebook-level ``test_labels``;
    rows whose ``toxic`` label equals -1 are dropped from both frames first.
    """
    test_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    selection = test_labels['toxic'] != -1

    truth = test_labels[selection]
    preds = test_submission[selection]

    roc_aoc_scores = [
        metrics.roc_auc_score(truth[eval_col], preds[eval_col])
        for eval_col in test_cols
    ]

    print(roc_aoc_scores)
    print("Score :" , np.mean(np.array(roc_aoc_scores)))

In [45]:
# Vectorize the test comments with the fitted TF-IDF vectorizer `tf`, then fit
# logistic regression on the matching training matrix `text_tf` and predict.
text_tf_test = tf.transform(test['comment_text'])
df = create_submission(text_tf, text_tf_test, classifierLR)

In [46]:
# Preview the first five submission rows.
df.head(n=5)

In [48]:
# Write the submission to CSV without the DataFrame index column.
df.to_csv('submission_tfidf_lr.csv', index=False)

In [70]:
# Print per-label and mean ROC AUC of the submission against the test labels.
calc_score(df)

In [None]:
# Same scoring call as the previous cell; this copy has no execution count.
calc_score(df)