In [1]:
import pandas as pd
import pickle as p
import numpy as np
import operator

from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import stopwords

In [2]:
train_data = pd.read_csv('../dataset/train.csv').fillna('')
test_data = pd.read_csv('../dataset/test.csv').fillna('')

## Get actual lengths of toxic etc

In [3]:
toxic_comments = train_data.comment_text.iloc[train_data.toxic.nonzero()]
severe_toxic_comments = train_data.comment_text.loc[train_data.severe_toxic.nonzero()]
obscene_comments = train_data.comment_text.loc[train_data.obscene.nonzero()]
threat_comments = train_data.comment_text.loc[train_data.threat.nonzero()]
insult_comments = train_data.comment_text.loc[train_data.insult.nonzero()]
identity_hate_comments = train_data.comment_text.loc[train_data.identity_hate.nonzero()]

In [None]:
train_data.comment_text.str.split().str.len().describe()

## Toxic

In [None]:
toxic_comment_lengths = toxic_comments.str.split().str.len()

In [None]:
toxic_comment_lengths.describe()

In [None]:
%matplotlib inline
toxic_comment_lengths.plot(kind='hist')

## Severe

In [None]:
severe_comment_lengths = severe_toxic_comments.str.split().str.len()

In [None]:
severe_comment_lengths.describe()

## Obscene

In [None]:
obscene_comment_lengths = obscene_comments.str.split().str.len()

In [None]:
obscene_comment_lengths.describe()

## Threat

In [None]:
threat_comment_lengths = threat_comments.str.split().str.len()

In [None]:
threat_comment_lengths.describe()

## Insult

In [None]:
insult_comment_lengths = insult_comments.str.split().str.len()

In [None]:
insult_comment_lengths.describe()

## Identity 

In [None]:
identity_comment_lengths = identity_hate_comments.str.split().str.len()

In [None]:
insult_comment_lengths.describe()


The mean for all the targets are less than 100 so maybe turn down the sequence length...

In [4]:
import textacy
list(textacy.extract.ngrams(
...     train_data.comment_text[0], 2, filter_stops=True, filter_punct=True, filter_nums=False))

SyntaxError: invalid syntax (<ipython-input-4-8b6ce49a86f0>, line 3)

In [None]:
from textacy.doc import Doc

comments = list(map(lambda doc: Doc(content=doc, lang='en'), train_data.comment_text))

### Remove non english chars and special chars

In [None]:
stop_words = set(stopwords.words('english')).union(STOP_WORDS)
custom_stopwords = set(['wikipedia', 'article', 'page', 'talk', 'like', 'know', 'edit', 'use', 
                        'think', 'wiki', 'people', 'b', 'hi', 'hey', ])
stop_words = stop_words.union(custom_stopwords)

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer(filters='')

In [None]:
tokenizer.fit_on_texts(train_data.comment_text.astype(str))

In [None]:
tokenizer.word_index['!!!!']

In [None]:
import re

remove = re.compile('[^a-zA-Z ]')

def filter_non_english_chars(documents):
    return pd.Series(list(map(lambda doc: remove.sub(string=doc.lower(), repl=''), documents)))

In [None]:
a

In [None]:
clean_train = filter_non_english_chars(train_data.comment_text)
clean_test = filter_non_english_chars(test_data.comment_text)

In [None]:
def remove_stop_words(texts):
    return list(map(lambda text: ' '.join(map(lambda word: word, text)), 
                map(lambda words: list(filter(lambda word: word not in stop_words, words)), texts.str.split())))

In [None]:
clean_train = remove_stop_words(clean_train)
clean_test = remove_stop_words(clean_test)

In [None]:
train_data.comment_text = clean_train
test_data.comment_text = clean_test

In [None]:
train_data.to_csv('../dataset/preprocessed_train.csv', index=False)
test_data.to_csv('../dataset/preprocessed_test.csv', index=False)

## Length Distribution

In [None]:
clean_lengths = train_data.comment_text.str.split().str.len()

In [None]:
clean_lengths.plot()

In [None]:
clean_lengths.hist()

In [None]:
clean_lengths.describe()

In [None]:
clean_lengths_value_counts = clean_lengths.value_counts()

In [None]:
clean_lengths_value_counts.plot()

In [None]:
clean_lengths.plot.density()

In [None]:
clean_lengths.plot.box()

## Word Distributions

In [None]:
cv = CountVectorizer(stop_words=stop_words)

In [None]:
words = cv.fit_transform(clean_train)

In [None]:
names = cv.get_feature_names()

In [None]:
def get_word_count_dict(term_document_matrix, feature_names):
    """
    """
    counts = np.sum(term_document_matrix, axis=0).A1
    
    return {name: count for name, count in zip(feature_names, counts)}

In [None]:
def get_sorted_word_counts(count_dict):
    return sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True)


In [None]:
count_dict = get_word_count_dict(words, names)
sorted_d = get_sorted_word_counts(count_dict)

In [None]:
%matplotlib inline
counts = pd.DataFrame({'counts': list(map(lambda x: x[1], sorted_d))}, index=list(map(lambda x: x[0], sorted_d)))

In [None]:
len(counts)

In [None]:
counts[:20].plot(kind='bar')

In [None]:
`def clean_output_data(data, name):
    labels = pd.concat([data.toxic, data.severe_toxic, data.obscene, 
           data.threat, data.insult, data.identity_hate], axis=1)
    
    data = train_data[labels.sum(axis=1) > 0].reset_index(drop=True)
    
    
    data.to_csv('data/{}.csv'.format(name), index=False)

In [None]:
clean_output_data(test_data, 'preprocessed_test')