In [3]:
import pandas as pd 
import numpy as np
import h5py
import spacy

from tqdm import tqdm_notebook as tqdm

from data import setup_fit_tokenizer
from keras.preprocessing.sequence import pad_sequences
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
def parse_sentences_words(documents, max_sentences=None):
    """Split each document into sentences and encode every sentence as token ids.

    Relies on the notebook-level ``nlp`` (spaCy pipeline) and ``tokenizer``
    objects, which must exist before this function is called.

    Args:
        documents: iterable of raw text strings.
        max_sentences: optional cap on the number of sentences kept per
            document. A falsy value (``None`` or ``0``) keeps every sentence.

    Returns:
        A list with one entry per document. Each entry is a list holding, for
        every kept sentence, the first sequence returned by
        ``tokenizer.texts_to_sequences`` for the sentence text with spaCy
        stop words removed.
    """
    # Wrapping ``enumerate`` in tqdm hides the input length, so the bar never
    # shows a total; pass it explicitly whenever the input is sized.
    total = len(documents) if hasattr(documents, '__len__') else None

    parsed_documents = []

    # ``nlp.pipe`` streams the texts through the pipeline in batches, which is
    # considerably faster than one ``nlp(text)`` call per document and yields
    # the parsed documents in input order.
    for tokens in tqdm(nlp.pipe(documents), total=total):
        parsed_document = []

        for sentence_i, sentence in enumerate(tokens.sents):
            if max_sentences and sentence_i == max_sentences:
                break

            # Drop stop words, then rebuild the sentence text for the tokenizer.
            filtered_sentence = ' '.join(word.text for word in sentence
                                         if not word.is_stop)

            tokenized_sentence = tokenizer.texts_to_sequences([filtered_sentence])[0]
            parsed_document.append(tokenized_sentence)

        parsed_documents.append(parsed_document)

    return parsed_documents

In [5]:
def pad_parsed_sequences(documents, max_sentences, max_words):
    """Pack parsed documents into a fixed-size integer array.

    Each document occupies one slice of the result. Its first
    ``max_sentences`` sentences are written to consecutive rows starting at
    row 0; any remaining rows stay zero. Every sentence is padded or
    truncated to ``max_words`` entries by ``pad_sequences``.

    The previous implementation derived the first destination row from
    ``len(document) - max_sentences``. Documents holding between
    ``max_sentences + 1`` and ``2 * max_sentences`` sentences were therefore
    written with leading empty rows and lost sentences, and a document with
    exactly ``2 * max_sentences`` sentences came out entirely zero. Filling
    always starts at row 0 now, which matches the old result for every other
    document length.

    Args:
        documents: list of documents, each a list of token-id sequences.
        max_sentences: number of sentence rows per document in the output.
        max_words: number of token slots per sentence in the output.

    Returns:
        ``np.int32`` array of shape ``(len(documents), max_sentences, max_words)``.

    Raises:
        ValueError: if ``documents`` is not a list.
    """
    if not isinstance(documents, list):
        raise ValueError('The documents must be of type list')

    padded_documents = np.zeros((len(documents), max_sentences, max_words), dtype=np.int32)

    for parsed_document_i, parsed_document in enumerate(documents):
        # Slicing caps the number of rows written; shorter documents simply
        # leave their trailing rows as zeros.
        for sentence_i, parsed_sentence in enumerate(parsed_document[:max_sentences]):
            padded_documents[parsed_document_i, sentence_i, :] = pad_sequences([parsed_sentence],
                                                                               max_words)

    return padded_documents

In [6]:
def get_labels(df):
    """Assemble the label matrix from the six toxicity columns of ``df``.

    The columns are read as attributes of ``df`` in a fixed order: toxic,
    severe_toxic, obscene, threat, insult, identity_hate.

    Returns:
        Array with one row per record of ``df`` and one column per label.
    """
    label_names = ('toxic', 'severe_toxic', 'obscene',
                   'threat', 'insult', 'identity_hate')

    # One value vector per label, stacked label-major and then flipped so
    # that records run along the first axis.
    per_label_values = [getattr(df, label_name).values for label_name in label_names]

    return np.array(per_label_values).T

In [7]:
def save_hdf5(padded_documents, labels, name):
    """Write the padded documents and their labels to ``dataset/<name>.hdf5``.

    The file is opened in ``'w'`` mode and receives two integer datasets:
    ``'x'`` holding ``padded_documents`` and ``'y'`` holding ``labels``.

    Args:
        padded_documents: array stored as dataset ``'x'``.
        labels: array stored as dataset ``'y'``.
        name: file stem used to build the output path.
    """
    # The context manager closes the file even when a dataset write raises,
    # so a failed save does not leave an open handle behind.
    with h5py.File('dataset/{}.hdf5'.format(name), 'w') as hdf5_file:
        hdf5_file.create_dataset('x', padded_documents.shape, dtype='i', data=padded_documents)
        hdf5_file.create_dataset('y', labels.shape, dtype='i', data=labels)

In [9]:
# Load the training split and discard every row that has a missing value.
train_data = pd.read_csv('dataset/train.csv').dropna()
# Comment text of the remaining rows, forced to str and lower-cased.
train_documents = train_data['comment_text'].astype(str).str.lower()

In [10]:
# Load the test split; no rows are dropped here.
test_data = pd.read_csv('dataset/test.csv')
# Comment text forced to str and lower-cased.
test_documents = test_data['comment_text'].astype(str).str.lower()

In [13]:
# Train and test comments stacked into one Series with a fresh 0..n-1 index.
all_documents = pd.concat([train_documents, test_documents], ignore_index=True)

In [15]:
# spaCy English pipeline; parse_sentences_words uses it for sentence
# boundaries and per-token stop-word flags.
nlp = spacy.load('en')
# Tokenizer built by the project's data.setup_fit_tokenizer helper from the
# combined train + test comments; parse_sentences_words calls its
# texts_to_sequences method.
tokenizer = setup_fit_tokenizer(all_documents)

Remove all NaNs

In [16]:
# Sentence-split, stop-word-filter and tokenize every training comment.
# max_sentences is left at its default (None), so no sentence cap applies here.
parsed_documents = parse_sentences_words(train_documents)

KeyboardInterrupt: 

In [None]:
# Pack the parsed training comments into an int32 array of shape
# (n_documents, 8, 15): 8 sentence rows per document, 15 token slots per sentence.
padded_train_documents = pad_parsed_sequences(parsed_documents, 8, 15)

In [None]:
# Label matrix for the training rows: one row per comment, six label columns.
train_labels = get_labels(train_data)

In [None]:
# Writes dataset/padded_train.hdf5 with datasets 'x' (documents) and 'y' (labels).
save_hdf5(padded_train_documents, train_labels, 'padded_train')

In [None]:
# Same parsing as for the training comments, again without a sentence cap.
parsed_test_documents = parse_sentences_words(test_documents)

In [None]:
# Same (n_documents, 8, 15) layout as the padded training array.
padded_test_documents = pad_parsed_sequences(parsed_test_documents, 8, 15)

In [None]:
# NOTE(review): get_labels reads the six label columns from the frame it is
# given, so this requires dataset/test.csv to contain them — confirm.
test_labels = get_labels(test_data)

In [None]:
# Writes dataset/padded_test.hdf5 with datasets 'x' (documents) and 'y' (labels).
save_hdf5(padded_test_documents, test_labels, 'padded_test')

In [None]:
def get_lengths(documents):
    """Measure how long the parsed documents and their sentences are.

    Args:
        documents: iterable of documents, each an iterable of sentences
            that support ``len``.

    Returns:
        A pair of ``pd.Series``: the number of sentences in each document,
        and the length of every sentence across all documents in
        encounter order.
    """
    sentences_per_document = []
    tokens_per_sentence = []

    # Single pass so both measurements come from the same traversal.
    for parsed_document in documents:
        sentences_per_document.append(len(parsed_document))
        tokens_per_sentence.extend(len(parsed_sentence)
                                   for parsed_sentence in parsed_document)

    return pd.Series(sentences_per_document), pd.Series(tokens_per_sentence)

In [None]:
# Sentences per training document and tokens per training sentence, taken
# from the unpadded parse.
document_lengths, sentence_lengths = get_lengths(parsed_documents)

In [None]:
%matplotlib inline

In [None]:
# NOTE(review): value_counts() returns how often each document length occurs,
# so this histogram shows the distribution of those frequencies rather than of
# the lengths themselves — confirm that is intended (document_lengths.hist()
# would plot the lengths directly).
document_lengths.value_counts().hist()

In [None]:
# Largest number of sentences found in a single training document.
document_lengths.max()

In [None]:
# Average number of sentences per training document.
document_lengths.mean()

In [None]:
# Average number of token ids per training sentence.
sentence_lengths.mean()

In [None]:
import pickle as p

In [None]:
# The assignment was left without a right-hand side, which is a syntax error.
# NOTE(review): assumes the tokenizer exposes a Keras-style ``word_index``
# mapping with ids starting at 1 (0 reserved for padding) — confirm against
# data.setup_fit_tokenizer before relying on this value.
tokenizer.vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Persist the tokenizer; the with-block guarantees the file is flushed and
# closed even if pickling fails.
with open('tokenizer.p', 'wb') as tokenizer_file:
    p.dump(tokenizer, tokenizer_file)