## Toxic: Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.pipeline import Pipeline

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from wordcloud import WordCloud, STOPWORDS
from sklearn.manifold import TSNE

from time import time
import math

import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import re

from pymongo import MongoClient

### Preprocessing Functions

In [2]:
def sentence_tokenizer(text):
    """Split ``text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize`` so it can be passed
    directly to ``Series.apply``; returns whatever ``sent_tokenize`` returns.
    """
    return sent_tokenize(text)

def polarity_sentence(sentences):
    """Summarise the per-sentence TextBlob polarity of one comment.

    Parameters
    ----------
    sentences : sequence of str
        The sentences of a single comment (e.g. the output of
        ``sentence_tokenizer``).

    Returns
    -------
    tuple
        ``(minimum, maximum, mean, polarities)`` where ``polarities`` is the
        list of polarity scores, one per sentence, in input order.
        When ``sentences`` is empty the result is ``(0.0, 0.0, 0.0, [])``.
    """
    # An empty or whitespace-only comment can yield no sentences; np.min/np.max
    # raise ValueError on an empty sequence, so report a neutral score instead.
    if len(sentences) == 0:
        return 0.0, 0.0, 0.0, []

    polarities = [TextBlob(sentence).polarity for sentence in sentences]
    return np.min(polarities), np.max(polarities), np.mean(polarities), polarities

def polarity_comment(text):
    """Return the TextBlob polarity of a whole comment.

    Parameters
    ----------
    text : str or iterable of str
        Either the raw comment string, or a sequence of tokens/sentences
        that should be joined with single spaces before scoring.

    Returns
    -------
    The ``polarity`` attribute of ``TextBlob`` built from the comment text.
    """
    # Joining a plain string with " " would insert a space between every
    # character ("bad" -> "b a d"), so TextBlob would score single letters
    # rather than words. Only join when given a sequence of pieces.
    if not isinstance(text, str):
        text = " ".join(text)
    return TextBlob(text).polarity

def token_clean(text):
    """Normalise a comment into a list of lowercase alphanumeric tokens.

    Newlines become spaces, every character other than ASCII letters,
    digits and spaces is removed (not replaced, so ``"a,b"`` becomes
    ``"ab"``), and the lowercased result is split on whitespace.
    """
    single_line = ' '.join(text.split('\n'))
    alnum_only = re.sub('[^A-Za-z0-9 ]+', '', single_line)
    return alnum_only.lower().split()

def comment_text_short(text, max_chars=1000):
    """Return at most the first ``max_chars`` characters of a comment.

    Parameters
    ----------
    text : str or iterable of str
        The comment; an iterable of strings is concatenated without a
        separator before truncation.
    max_chars : int, optional
        Maximum number of characters to keep (default 1000).

    Returns
    -------
    str
        The (possibly truncated) comment text.
    """
    return ''.join(text)[:max_chars]

# def comment_char_short(text):
#     return ''.join(text)[:1000]

def filtered(text):
    """Drop words carrying selected part-of-speech tags from ``text``.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``;
    every token whose tag is one of ``PRP``, ``CC``, ``IN``, ``DT`` or ``PRP$``
    is discarded and the remaining tokens are joined back with single spaces.
    """
    # presumably Penn Treebank tags (pronouns, conjunctions, prepositions,
    # determiners) -- confirm against the tagger NLTK is configured with
    excluded_tags = ['PRP', 'CC', 'IN', 'DT', 'PRP$']

    tagged_tokens = pos_tag(word_tokenize(text))
    kept_words = [word for word, tag in tagged_tokens if tag not in excluded_tags]
    return ' '.join(kept_words)

In [3]:
# Load the training data and index it by comment id (index name 'idx'),
# keeping `id` available as a regular column as well.
df = pd.read_csv('../data/train.csv')
df = df.set_index(df['id'].rename('idx'))
print(df.shape)
# df.head()

(159571, 8)


In [4]:
# Two-letter code for each label column, in the order the codes are concatenated.
LABEL_CODES = {
    'toxic': 'To',
    'severe_toxic': 'ST',
    'obscene': 'Ob',
    'threat': 'Th',
    'insult': 'In',
    'identity_hate': 'IH',
}

# Combined category string per comment (e.g. 'ToObIn'); a comment with no flag
# set gets ''. Iterating plain tuples of just the label columns avoids the
# per-row Series that iterrows() builds, which is slow on ~160k rows.
df['category'] = [
    ''.join(code * flag for code, flag in zip(LABEL_CODES.values(), flags))
    for flags in df[list(LABEL_CODES)].itertuples(index=False, name=None)
]

In [5]:
# Number of flagged comments per label: column-wise sum over numeric columns only.
label_totals = df.sum(numeric_only=True)
label_totals

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [6]:
# rating = how many of the six labels are set on the comment (0-6);
# clean = 1 when none of them is set, otherwise 0.
rating_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['rating'] = df[rating_columns].sum(axis=1, skipna=False)
df['clean'] = (df['rating'] == 0).astype('int64')

In [7]:
# Put the most heavily flagged comments first, then count distinct comment ids
# per rating -- this shows the class imbalance issue.
df = df.sort_values('rating', ascending=False)
df.groupby('rating')['id'].nunique()

rating
0    143346
1      6360
2      3480
3      4209
4      1760
5       385
6        31
Name: id, dtype: int64

In [8]:
# Distinct comment ids per label combination, most common combination first.
ids_per_category = df.groupby('category')['id'].nunique()
ids_per_category.sort_values(ascending=False)

category
                143346
To                5666
ToObIn            3800
ToOb              1758
ToIn              1215
ToSTObIn           989
ToObInIH           618
Ob                 317
In                 301
ToSTObInIH         265
ObIn               181
ToSTOb             158
ToIH               136
ToInIH             134
ToObThIn           131
ToTh               113
ToSTObThIn          64
ToObThInIH          56
IH                  54
ToST                41
ToObIH              35
ToSTObThInIH        31
InIH                28
Th                  22
ObInIH              18
ToThIn              16
ToSTIn              14
ToObTh              11
ToSTTh              11
ToSTInIH             7
ToThIH               7
ToSTObIH             6
ToSTObTh             4
ToSTIH               3
ThIn                 3
ObIH                 3
ToThInIH             3
ObThIn               2
ObTh                 2
ToSTThIH             1
ToSTThIn             1
Name: id, dtype: int64

In [None]:
# comment_text_s: comment truncated by comment_text_short;
# comment_text_f: the truncated text with the POS-filtered words removed.
shortened_text = df['comment_text'].apply(comment_text_short)
df['comment_text_s'] = shortened_text
df['comment_text_f'] = shortened_text.apply(filtered)

In [None]:
# Note: the classes are not mutually exclusive -- any comment may belong to any
# of the 6 classes. As such, each classification may need to be tested
# separately, unless there is a way to test them all together.

# (new column, source column, transform), applied in order: later steps read
# columns created by earlier ones (e.g. word_count needs token_clean).
feature_steps = [
    ('token_clean', 'comment_text', token_clean),
    ('sent_token', 'comment_text', sentence_tokenizer),
    ('polarity_sentence', 'sent_token', polarity_sentence),
    ('polarity_comment', 'comment_text', polarity_comment),
    ('polarity_comment_s', 'comment_text_s', polarity_comment),
    ('word_count', 'token_clean', len),
    ('char_count', 'comment_text', len),
    ('char_count_s', 'comment_text_s', len),
]
for new_column, source_column, transform in feature_steps:
    df[new_column] = df[source_column].apply(transform)

In [None]:
# polarity_sentence holds (min, max, mean, per-sentence list) for each comment;
# spread the first three entries into their own columns.
for position, column in enumerate(['polarity_min', 'polarity_max', 'polarity_mean']):
    df[column] = [stats[position] for stats in df['polarity_sentence']]
print(df.shape)
df.head()

In [None]:
# Persist the preprocessed training frame for later use.
output_path = '../data/toxictrain.pkl'
df.to_pickle(output_path)