## Toxic: Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.pipeline import Pipeline

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from wordcloud import WordCloud, STOPWORDS
from sklearn.manifold import TSNE

from time import time
import math

import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import re

from pymongo import MongoClient

from nltk.corpus import stopwords
stop = stopwords.words('english')

### Preprocessing Functions

In [2]:
def sentence_tokenizer(text):
    sentences = sent_tokenize(text)
    return sentences

def polarity_sentence(sentences):
    listy = []
    for i in list(range(0,len(sentences))):
        pol = TextBlob(sentences[i]).polarity
        listy.append(pol)
    return np.min(listy), np.max(listy), np.mean(listy),listy

# TODO appears to not be working on comment_text_s
def polarity_comment(text):
    txt = " ".join(text)
    return TextBlob(txt).polarity

def token_clean(text):
    text = text.replace('\n',' ')
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    text = text.lower().split()
    return text

def comment_text_short(text):
    return ''.join(text)[:1000]

def filtered(text):
    filter = ['PRP','CC','IN','DT','PRP$']
    matches = []

    words=pos_tag(word_tokenize(text))
    for i in range(len(words)):
        if words[i][1] not in filter:
            matches.append(words[i][0])

    filtered = ' '.join(matches)
    return filtered

def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [3]:
df = pd.read_csv('../data/train.csv') # train data
df['idx'] = df['id']
df = df.set_index('idx')
print(df.shape)

(159571, 8)


In [4]:
df['category'] = ['To'*r.toxic + 'ST'*r.severe_toxic + 'Ob'*r.obscene+ 'Th'*r.threat+ 'In'*r.insult+'IH'*r.identity_hate for _,r in df.iterrows()]

In [5]:
df.sum(axis=0,numeric_only=True)

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [6]:
df['rating'] = df['toxic'] + df['severe_toxic'] + df['obscene'] + df['threat'] + df['insult'] + df['identity_hate']
df['clean'] = [1 if r == 0 else 0 for r in df.rating]

In [7]:
df = df.sort_values(['rating'],ascending=[False])
df.groupby('rating').nunique()['id'] # class imbalance issue

rating
0    143346
1      6360
2      3480
3      4209
4      1760
5       385
6        31
Name: id, dtype: int64

In [8]:
df.groupby('category').nunique()['id'].sort_values(ascending=False)

category
                143346
To                5666
ToObIn            3800
ToOb              1758
ToIn              1215
ToSTObIn           989
ToObInIH           618
Ob                 317
In                 301
ToSTObInIH         265
ObIn               181
ToSTOb             158
ToIH               136
ToInIH             134
ToObThIn           131
ToTh               113
ToSTObThIn          64
ToObThInIH          56
IH                  54
ToST                41
ToObIH              35
ToSTObThInIH        31
InIH                28
Th                  22
ObInIH              18
ToThIn              16
ToSTIn              14
ToObTh              11
ToSTTh              11
ToSTInIH             7
ToThIH               7
ToSTObIH             6
ToSTObTh             4
ToSTIH               3
ThIn                 3
ObIH                 3
ToThInIH             3
ObThIn               2
ObTh                 2
ToSTThIH             1
ToSTThIn             1
Name: id, dtype: int64

In [9]:
df['comment_text_s'] = df['comment_text'].apply(comment_text_short)
df['comment_text_f'] = df['comment_text_s'].apply(filtered)

In [10]:
# note that classes are not mutually exclusive, any comment to belong to any of 6 classes
# as such, may need to test each classification separately, unless there is a way to test all together?
df['token_clean'] = df['comment_text'].apply(token_clean)
df['sent_token'] = df['comment_text'].apply(sentence_tokenizer)
df['polarity_sentence'] = df['sent_token'].apply(polarity_sentence)
df['polarity_comment'] = df['comment_text'].apply(lambda x: TextBlob(x).sentiment[0] )
df['polarity_comment_s'] = df['comment_text_s'].apply(lambda x: TextBlob(x).sentiment[0] )
df['word_count'] = df['token_clean'].apply(len)
df['char_count'] = df['comment_text'].apply(len)
# df['char_count_ts'] = df['comment_text_s'].apply(len)
df['char_count_s'] = df['comment_text_s'].apply(len)

In [11]:
df['polarity_min'] = [x[0] for x in df['polarity_sentence']]
df['polarity_max'] = [x[1] for x in df['polarity_sentence']]
df['polarity_mean'] = [x[2] for x in df['polarity_sentence']]
print(df.shape)

(159571, 24)


In [12]:
df['avg_word_length'] = df['comment_text'].apply(lambda x: avg_word(x))
df['stop_word_count'] = df['comment_text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df['hashtag_count'] = df['comment_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['numeric_count'] = df['comment_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df['upper_count'] = df['comment_text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

In [13]:
df['comment_text_clean'] = df['comment_text'].apply(lambda x: " ".join(x.lower() for x in x.split())).str.replace('[^\w\s]','').apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [14]:
freq = pd.Series(' '.join(df['comment_text_clean']).split()).value_counts()[:10]
print(freq)
freq = list(freq.index)
df['comment_text_clean'] = df['comment_text_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

article      55403
page         45611
wikipedia    35557
talk         31288
please       29607
would        29212
one          28057
like         27705
dont         26102
see          21486
dtype: int64


In [15]:
freq = pd.Series(' '.join(df['comment_text_clean']).split()).value_counts()[-10:]
print(freq)
freq = list(freq.index)
df['comment_text_clean'] = df['comment_text_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

rasjid            1
inexhaustible     1
33113             1
judaïca           1
raucous           1
vochen            1
httpwwwncaacom    1
cyn               1
sali              1
acceptranges      1
dtype: int64


In [16]:
from textblob import Word
df['comment_text_clean'] = df['comment_text_clean'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [17]:
TextBlob(df['comment_text_clean'][0]).ngrams(2)

[WordList(['hope', 'retarded']),
 WordList(['retarded', 'kid']),
 WordList(['kid', 'get']),
 WordList(['get', 'anal']),
 WordList(['anal', 'raped']),
 WordList(['raped', 'murdered']),
 WordList(['murdered', 'fag']),
 WordList(['fag', 'father']),
 WordList(['father', 'im']),
 WordList(['im', 'gon']),
 WordList(['gon', 'na']),
 WordList(['na', 'fuck']),
 WordList(['fuck', 'fat']),
 WordList(['fat', 'wife']),
 WordList(['wife', 'trow']),
 WordList(['trow', 'bridge']),
 WordList(['bridge', 'consider']),
 WordList(['consider', 'happy']),
 WordList(['happy', 'another']),
 WordList(['another', 'useful']),
 WordList(['useful', 'editor']),
 WordList(['editor', 'wikiepia']),
 WordList(['wikiepia', 'retired']),
 WordList(['retired', 'user']),
 WordList(['user', 'retired']),
 WordList(['retired', 'everyday']),
 WordList(['everyday', 'even']),
 WordList(['even', 'dare']),
 WordList(['dare', 'removing']),
 WordList(['removing', 'peace']),
 WordList(['peace', 'shit']),
 WordList(['shit', 'ever']),
 W

In [18]:
tf1 = (df['comment_text_clean'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,jew,1
1,evil,1
2,zimzalabim,1
3,murder,1
4,homosexual,1
5,st47,1
6,going,1


In [22]:
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(df.shape[0]/(len(df[df['comment_text_clean'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,jew,1,4.657734
1,evil,1,5.459623
2,zimzalabim,1,10.59395
3,murder,1,5.545698
4,homosexual,1,6.074882
5,st47,1,9.78302
6,going,1,3.171427


In [23]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,jew,1,4.657734,4.657734
1,evil,1,5.459623,5.459623
2,zimzalabim,1,10.59395,10.59395
3,murder,1,5.545698,5.545698
4,homosexual,1,6.074882,6.074882
5,st47,1,9.78302,9.78302
6,going,1,3.171427,3.171427


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(df['comment_text_clean'])
train_vect

<159571x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2025373 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(df['comment_text_clean'])
train_bow

<159571x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2382577 stored elements in Compressed Sparse Row format>

In [27]:
print(df.shape)
df.head()

(159571, 30)


Unnamed: 0_level_0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,category,rating,...,char_count_s,polarity_min,polarity_max,polarity_mean,avg_word_length,stop_word_count,hashtag_count,numeric_count,upper_count,comment_text_clean
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3a4c7758fad18de3,3a4c7758fad18de3,", I hope your retarded kids get anal raped and...",1,1,1,1,1,1,ToSTObThInIH,6,...,494,-0.6,1.0,-0.01875,4.210526,34,0,0,6,hope retarded kid get anal raped murdered fag ...
24d2b50726b67167,24d2b50726b67167,I am going to murder ZimZalaBim ST47 for being...,1,1,1,1,1,1,ToSTObThInIH,6,...,68,-1.0,-1.0,-1.0,4.75,4,0,0,2,going murder zimzalabim st47 evil homosexual jew
c586b7a2fd575b13,c586b7a2fd575b13,"Shut up you asswipe, we don't care. I'll decap...",1,1,1,1,1,1,ToSTObThInIH,6,...,1000,-0.2,0.0,-0.075556,161.0,9,0,0,0,shut asswipe care ill decapitate mother shit h...
77d84b1321c22d9a,77d84b1321c22d9a,"LGBT \n\nyou little fuck , are you a fag , tha...",1,1,1,1,1,1,ToSTObThInIH,6,...,280,-0.1775,0.0,-0.08875,3.5,32,0,0,1,lgbt little fuck fag piece shit making fucked ...
1368c10281978876,1368c10281978876,"You're a stupid cunt \n\nFuck you dumb arse, y...",1,1,1,1,1,1,ToSTObThInIH,6,...,278,-0.4125,-0.4125,-0.4125,3.694915,25,0,0,3,youre stupid cunt fuck dumb arse mum hairy cun...


In [28]:
df.to_pickle('../data/toxictrain.pkl')