In [118]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

import contractions
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import re
import string
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/affan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/affan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [119]:
def fix_contractions(text):
    fix = text
    try:
        fix = contractions.fix(text)
    except:
        pass
    return fix

def rm_link(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
    # return re.sub(r"https?://\S+|www\.\S+", "", text)

def rm_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# handle case like "shut up okay?Im only 10 years old"
# become "shut up okay Im only 10 years old"
def rm_punct2(text):
    return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

def rm_number(text):
    return re.sub(r'\d+', '', text)

def rm_whitespaces(text):
    return re.sub(r' +', ' ', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

def spell_correction(text):
    return re.sub(r'(.)\1+', r'\1\1', text)

def clean_pipeline(text):
    text_lower = text.lower()
    fix_contr = fix_contractions(text_lower)
    no_link = rm_link(fix_contr)
    no_punct = rm_punct2(no_link)
    # no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_punct)
    no_nonasci = rm_nonascii(no_whitespaces)
    no_emoji = rm_emoji(no_nonasci)
    spell_corrected = spell_correction(no_emoji)
    return spell_corrected

In [120]:
def tokenize(text):
    return word_tokenize(text)

def rm_stopwords(text):
    return [i for i in text if i not in stopwords]

def postag(text):
    # not implemented yet
    return

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()    
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    # make sure lemmas does not contains sotpwords
    return rm_stopwords(lemmas)

def preprocess_pipeline(text):
    tokens = tokenize(text)
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [4]:
train = pd.read_csv('../data/raw/jigsaw/train.csv')
test = pd.read_csv('../data/raw/jigsaw/test.csv')

len(train), len(test)

(159571, 153164)

In [5]:
def is_toxic(row):
    return 1 if row.sum() > 1 else -1 if row.sum() < 0 else 0

train['is_toxic'] = train.iloc[:, 2:].apply(is_toxic, axis=1)
test['is_toxic'] = test.iloc[:, 2:].apply(is_toxic, axis=1)

In [6]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [43]:
train['clean'] = train.comment_text.apply(clean_pipeline)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic,clean,processed
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,Explanation Why edits made username Hardcore M...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,d aww he matches this background colour i am s...,D aww He match background colour I seemingly s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,Hey man I really trying edit war It guy consta...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on...,More I make real suggestion improvement I wond...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,You sir hero Any chance remember page


In [44]:
train['processed'] = train.clean.apply(preprocess_pipeline)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic,clean,processed
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,d aww he matches this background colour i am s...,aww match background colour seemingly stuck th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on...,make real suggestion improvement wondered sect...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,sir hero chance remember page


## Feature Extraction

In [21]:
toxvocab = pd.read_csv('../data/processed/toxvocab_jigsaw_keep_number.csv')
toxvocab.sort_values(by='weight', ascending=False).head()

Unnamed: 0,token,weight,occurence
8780,oxymoron83,6.388438,176.0
1924,bunksteve,6.337069,278.0
3244,derka,5.599605,141.0
8006,motherfucker,5.060153,619.0
4796,fuck,4.676083,16619.0


In [37]:
# prepare vocab for easier query
toxvocab_dict = toxvocab.to_dict(orient='records')

In [122]:
def toxic_occurrence(text, vocab):
    stext = text.split()
    occur = 0
    for t in stext:
        if t in vocab.token.tolist():        
            occur += 1

    return occur

def toxic_weight_rate(text, vocab):
    stext = text.split()    
    toxic_rate = []
    for t in stext:        
        found = toxvocab[toxvocab['token'] == t]
        if len(found) > 0:            
            toxic_rate = np.append(toxic_rate, found['weight'])

    return np.mean(toxic_rate) if len(toxic_rate) > 0 else 0

In [104]:
tqdm.pandas()
train['tox_occur'] = train.processed.progress_apply(toxic_occurrence, args=(toxvocab, ))
train['tox_rate'] = train.processed.progress_apply(toxic_weight_rate, args=(toxvocab, ))
train.head()

100%|██████████| 159571/159571 [24:39<00:00, 107.83it/s]
100%|██████████| 159571/159571 [1:38:53<00:00, 26.89it/s]  


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic,clean,processed,tox_occur,tox_rate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...,1,0.68558
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,d aww he matches this background colour i am s...,aww match background colour seemingly stuck th...,3,0.498091
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,hey man i am really not trying to edit war it ...,hey man really trying edit war guy constantly ...,2,0.255665
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,\nmore\ni cannot make any real suggestions on...,make real suggestion improvement wondered sect...,2,0.245365
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,you sir are my hero any chance you remember wh...,sir hero chance remember page,1,0.393805


In [112]:
train.tox_rate.fillna(0, inplace=True)

In [113]:
train.tox_rate.isna().sum()

0

In [114]:
train.sort_values(by=['tox_rate'], ascending=False)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic,clean,processed,tox_occur,tox_rate
53408,8ecc71e2dd4e2fbe,BunkSteve is gay! BunkSteve is gay! BunkSteve ...,1,1,0,0,1,1,1,bunksteve is gay bunksteve is gay bunksteve is...,bunksteve gay bunksteve gay bunksteve gay bunk...,278,6.337069
42090,704cd990f266ef15,bad motherfuckers \n\nThere are a lot of them ...,1,0,1,0,1,0,1,bad motherfuckers \n\nthere are a lot of them ...,bad motherfucker lot keep calm carry,1,5.060153
116477,6ea1aefc9a6cc945,JUSTIFY YOUR CHANGES MOTHERFUCKER,1,0,1,0,1,0,1,justify your changes motherfucker,justify change motherfucker,1,5.060153
86638,e7b7a6998191e05d,February 2015 \n\nRemzoy16 Motherfucker !,1,0,1,0,1,0,1,february 2015 \n\nremzoy16 motherfucker,february 2015 remzoy16 motherfucker,1,5.060153
88545,ecde7c101c615c3d,hey motherfucker hurry up!,1,0,1,0,1,0,1,hey motherfucker hurry up,hey motherfucker hurry,1,5.060153
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87081,e8f131751f92bf6f,"Calton, remember, you were ordered to stay awa...",0,0,0,0,0,0,0,calton remember you were ordered to stay away ...,calton remember ordered stay away ordered stay...,0,0.000000
130691,bb33671537b21507,Please respond quickly.,0,0,0,0,0,0,0,please respond quickly,please respond quickly,0,0.000000
130692,bb34ba41a37eff7f,Are you sure it's Kauffner who's making up a s...,1,0,0,0,0,0,0,are you sure it is kauffner who is making up a...,sure kauffner making story sock puppet,0,0.000000
9586,19640fadb0673e60,"""\n In the Donald Trump article, the contro...",0,0,0,0,0,0,0,\n in the donald trump article the controvers...,donald trump article controversy subsection an...,0,0.000000


In [115]:
train.to_csv('../data/processed/trainset-feature-extracted.csv', index=None)

In [121]:
tqdm.pandas()
test['clean'] = test.comment_text.progress_apply(clean_pipeline)
test['processed'] = test.clean.progress_apply(preprocess_pipeline)
test['tox_occur'] = test.processed.progress_apply(toxic_occurrence, args=(toxvocab, ))
test['tox_rate'] = test.processed.progress_apply(toxic_weight_rate, args=(toxvocab, ))
test.head()

100%|██████████| 153164/153164 [00:17<00:00, 8520.11it/s]
100%|██████████| 153164/153164 [01:06<00:00, 2300.61it/s]
100%|██████████| 153164/153164 [20:26<00:00, 124.92it/s]
100%|██████████| 153164/153164 [1:20:24<00:00, 31.74it/s]


Unnamed: 0,id,comment_text,is_toxic,clean,processed,tox_occur,tox_rate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0,yo bitch ja rule is more succesful then you wi...,yo bitch ja rule succesful ever hating sad mof...,19,1.067896
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,from rfc \n\n the title is fine as it is i am...,rfc title fine going,1,0.387126
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,\n\n sources \n\n zawe ashton on lapland,source zawe ashton lapland,0,
3,00017563c3f7919a,":If you have a look back at the source, the in...",0,if you have a look back at the source the inf...,look back source information updated correct f...,1,0.268448
4,00017695ad8997eb,I don't anonymously edit articles at all.,0,i do not anonymously edit articles at all,anonymously edit article,0,


In [126]:
test.head()

Unnamed: 0,id,comment_text,is_toxic,clean,processed,tox_occur,tox_rate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0,yo bitch ja rule is more succesful then you wi...,yo bitch ja rule succesful ever hating sad mof...,19,1.067896
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,from rfc \n\n the title is fine as it is i am...,rfc title fine going,1,0.387126
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,\n\n sources \n\n zawe ashton on lapland,source zawe ashton lapland,0,0.0
3,00017563c3f7919a,":If you have a look back at the source, the in...",0,if you have a look back at the source the inf...,look back source information updated correct f...,1,0.268448
4,00017695ad8997eb,I don't anonymously edit articles at all.,0,i do not anonymously edit articles at all,anonymously edit article,0,0.0


In [128]:
test.to_csv('../data/processed/testset-feature-extracted.csv', index=None, header=True)