### Toxic Comment Classification Challenge

https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [13]:
import numpy as np 
import pandas as pd
import string
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer 
from matplotlib import pyplot as plt
from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer 
import gensim

In [6]:
df = pd.read_csv('/Users/Mushroom/Downloads/train.csv')

### Data Overview

In [7]:
df.head(5)
df.describe()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


### Add Variable

In [8]:
rowsums=df.iloc[:,2:].sum(axis=1)
df['clean']=(rowsums==0) *1

In [9]:
#Create indirect features to help compensate for the loss of information when cleaning the dataset:

#Sentense count in each comment:
    #  '\n' can be used to count the number of sentences in each comment
df['count_sent']=df["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
#Word count in each comment:
df['count_word']=df["comment_text"].apply(lambda x: len(str(x).split()))
#Unique word count
df['count_unique_word']=df["comment_text"].apply(lambda x: len(set(str(x).split())))
#Letter count
df['count_letters']=df["comment_text"].apply(lambda x: len(str(x)))
#upper case words count
df["count_words_upper"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#Average length of the words
df["mean_word_len"] = df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [10]:
#Add 3 "emotional" puncs
#count of !
df['count_!']=df["comment_text"].apply(lambda x: len([w for w in x if w == "!"]))
#count of ?
df['count_?']=df["comment_text"].apply(lambda x: len([w for w in x if w == "?"]))
#count of ^
df['count_^']=df["comment_text"].apply(lambda x: len([w for w in x if w == "^"]))

In [11]:
#derived features
#Word count percent in each comment:
df['word_unique_percent']=df['count_unique_word']*100/df['count_word']
#derived features
# Cap word percent in each comment
df['cap_percent']=df["count_words_upper"]*100/df['count_word']
#remove cap count after calculate the percengtage
df.drop('count_words_upper', axis=1, inplace=True)

In [14]:
df['sentiment'] = df['comment_text'].apply(lambda x : TextBlob(x).sentiment)

In [15]:
df['polarity'] = df['sentiment'].apply(lambda x : x[0])
df['subjective'] = df['sentiment'].apply(lambda x : x[1])

In [16]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,count_sent,...,count_letters,mean_word_len,count_!,count_?,count_^,word_unique_percent,cap_percent,sentiment,polarity,subjective
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,2,...,264,5.162791,0,1,0,95.348837,4.651163,"(0.13636363636363635, 0.45454545454545453)",0.136364,0.454545
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,1,...,112,5.588235,1,0,0,100.0,5.882353,"(0.2875, 0.55)",0.2875,0.55
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,1,...,233,4.571429,0,0,0,92.857143,0.0,"(0.16, 0.4066666666666666)",0.16,0.406667
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,5,...,622,4.486726,0,0,0,72.566372,4.424779,"(0.2, 0.30104166666666665)",0.2,0.301042
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,1,...,67,4.230769,0,1,0,100.0,0.0,"(0.0, 0.0)",0.0,0.0


### Data Cleaning<br>


In [17]:
corpus = df.comment_text

In [18]:
corpus.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [19]:
tokenizer=TweetTokenizer()
lem = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))
pun = set(string.punctuation)

In [20]:
def clean(comment):
    """
    This function receives comments and returns clean word-list
    """
    # Convert to lower case , so that Hi and hi are the same
    comment=comment.lower()
    # remove \n
    comment=re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    # remove usernames
    comment=re.sub("\[\[.*\]","",comment)
 
    
    # Split the sentences into words
    words=tokenizer.tokenize(comment)
    
    # (')aphostophe  replacement (ie)   you're --> you are  
    # ( basic dictionary lookup : master dictionary present in a hidden block of code)
    # remove stopwords and punctuation
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    words = [w for w in words if not w in pun]
    
    clean_sent=" ".join(words)
    
    return(clean_sent)

In [21]:
clean_corpus = corpus.apply(lambda x :clean(x))

In [22]:
df['comment'] = clean_corpus

In [23]:
df.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,count_sent,...,mean_word_len,count_!,count_?,count_^,word_unique_percent,cap_percent,sentiment,polarity,subjective,comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,2,...,5.162791,0,1,0,95.348837,4.651163,"(0.13636363636363635, 0.45454545454545453)",0.136364,0.454545,explanationwhy edit make username hardcore met...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,1,...,5.588235,1,0,0,100.0,5.882353,"(0.2875, 0.55)",0.2875,0.55,d'aww match background colour i'm seemingly st...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,1,...,4.571429,0,0,0,92.857143,0.0,"(0.16, 0.4066666666666666)",0.16,0.406667,hey man i'm really try edit war guy constantly...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,5,...,4.486726,0,0,0,72.566372,4.424779,"(0.2, 0.30104166666666665)",0.2,0.301042,morei can't make real suggestions improvement ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,1,...,4.230769,0,1,0,100.0,0.0,"(0.0, 0.0)",0.0,0.0,sir hero chance remember page that's
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,1,3,...,3.923077,0,0,0,92.307692,0.0,"(0.0, 0.0)",0.0,0.0,congratulations well use tool well · talk
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,0,1,...,4.625,0,0,0,100.0,100.0,"(0.0, 0.0)",0.0,0.0,cocksucker piss around work
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,1,1,...,4.75,0,0,0,100.0,0.0,"(0.0, 0.0)",0.0,0.0,vandalism matt shirvington article revert plea...
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,1,1,...,4.698795,0,1,0,84.337349,1.204819,"(-0.13999999999999999, 0.5399999999999999)",-0.14,0.54,sorry word nonsense offensive anyway i'm inten...
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,1,1,...,4.916667,0,0,0,100.0,0.0,"(-0.16666666666666666, 0.3333333333333333)",-0.166667,0.333333,alignment subject contrary dulithgow


In [24]:
all_sentences = filter(None, df['comment'])
word2vec_model = gensim.models.Word2Vec(all_sentences, 
                                        size=100, 
                                        window=5, 
                                        min_count=5, 
                                        workers=4)

In [25]:
word2vec_model.init_sims(replace=True)
model_name = 'word2vec_model'
word2vec_model.save(model_name)

In [26]:
def sentence_to_avg(words, embedding):
    vector_size = word2vec_model.vector_size
    mapping = word2vec_model
    avg = np.zeros((vector_size,))
    count = 0
    for w in words:
        try:
            avg += mapping[w]
            count += 1
        except:
            pass
    if count > 0: avg = avg / count
    return avg

In [27]:
df_new = df[["comment", "toxic","severe_toxic","obscene","threat","insult","identity_hate"]].copy()

In [28]:
df_new["comment"] = df_new["comment"].apply([lambda x : sentence_to_avg(x, "word2vec")])

  


In [29]:
df['vector'] = df_new['comment']

In [None]:
df_new = DataFrame.to_csv("/Users/Mushroom/Downloads/train_clean.csv")

In [29]:
df_new.head()

Unnamed: 0,comment,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"[0.030326198018155993, -0.036622067977441476, ...",0,0,0,0,0,0
1,"[0.010061211557788125, -0.024253740422102776, ...",0,0,0,0,0,0
2,"[0.02421304808915344, -0.03475592498230314, 0....",0,0,0,0,0,0
3,"[0.034024511345989, -0.036588193794975354, 0.0...",0,0,0,0,0,0
4,"[0.04967472029642926, -0.04506905984857844, 0....",0,0,0,0,0,0


In [30]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,count_sent,...,mean_word_len,count_!,count_?,count_^,word_unique_percent,cap_percent,sentiment,polarity,subjective,comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,2,...,5.162791,0,1,0,95.348837,4.651163,"(0.13636363636363635, 0.45454545454545453)",0.136364,0.454545,explanationwhy edit make username hardcore met...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,1,...,5.588235,1,0,0,100.0,5.882353,"(0.2875, 0.55)",0.2875,0.55,d'aww match background colour i'm seemingly st...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,1,...,4.571429,0,0,0,92.857143,0.0,"(0.16, 0.4066666666666666)",0.16,0.406667,hey man i'm really try edit war guy constantly...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,5,...,4.486726,0,0,0,72.566372,4.424779,"(0.2, 0.30104166666666665)",0.2,0.301042,morei can't make real suggestions improvement ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,1,...,4.230769,0,1,0,100.0,0.0,"(0.0, 0.0)",0.0,0.0,sir hero chance remember page that's


In [31]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'clean', 'count_sent', 'count_word',
       'count_unique_word', 'count_letters', 'mean_word_len', 'count_!',
       'count_?', 'count_^', 'word_unique_percent', 'cap_percent', 'sentiment',
       'polarity', 'subjective', 'comment'],
      dtype='object')