In [21]:
import pandas as pd 
import numpy as np
import h5py

from data import get_labels

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences

In [22]:
# Word unigram + bigram vocabulary, capped at 25,000 features.
count_vectorizer = CountVectorizer(
    max_features=25000,
    ngram_range=(1, 2),
)

In [23]:
# Load the preprocessed training set and cast every column to str.
# NOTE(review): the cast applies to the whole frame, so non-text columns are
# stringified too and missing values presumably become the literal 'nan' --
# confirm that get_labels and the vectorizer are fine with that.
train_data = (
    pd.read_csv('dataset/preprocessed_train.csv')
    .astype(str)
)

In [24]:
# Load the preprocessed test set; unlike the training frame it is NOT cast to
# str here, so missing comments stay as NaN at this point.
test_data = pd.read_csv('dataset/preprocessed_test.csv')

In [25]:
# Pool the train and test comments into one Series, drop missing entries and
# reset the index, so the vectorizer can be fitted on both splits at once.
all_documents = (
    pd.concat([train_data['comment_text'], test_data['comment_text']])
    .dropna()
    .reset_index(drop=True)
)

In [26]:
# Learn the n-gram vocabulary from the pooled train + test comments.
# The call is the cell's last expression, so the fitted vectorizer's repr is displayed.
count_vectorizer.fit(all_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=25000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
# Document-term count matrix for the training comments (one row per comment).
train_counts = count_vectorizer.transform(train_data['comment_text'])

In [31]:
def get_tokenized_text(document_term_matrix, max_seq_length):
    """Turn a document-term matrix into padded per-document term-id sequences.

    For each row, the column indices of its non-zero entries are collected in
    the order ``np.nonzero`` reports them and shifted up by one, so id 0 never
    refers to a real column (presumably it is reserved as the padding value --
    confirm against ``pad_sequences``). The per-row lists are then passed to
    ``pad_sequences``.

    Args:
        document_term_matrix: 2-D matrix with one row per document; it must
            expose ``shape`` and be accepted by ``np.nonzero``.
        max_seq_length: forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the per-document id lists.
    """
    doc_ids, term_ids = np.nonzero(document_term_matrix)
    n_docs = document_term_matrix.shape[0]

    # One (initially empty) sequence per document, including all-zero rows.
    sequences = [[] for _ in range(n_docs)]
    for k in range(len(doc_ids)):
        # +1 keeps id 0 free of real terms.
        sequences[doc_ids[k]].append(term_ids[k] + 1)

    return pad_sequences(sequences, maxlen=max_seq_length)

In [33]:
# Build both arrays before opening the output file, so a failure while
# encoding or fetching labels does not leave a partially written file behind.
x_train = get_tokenized_text(train_counts, 200)
y_train = get_labels(df=train_data)

# Open explicitly in 'w' mode: without a mode, h5py >= 3.0 opens read-only
# (so create_dataset fails), and the older append default fails on a re-run
# because 'x' already exists. Note that 'w' overwrites any existing file.
# The context manager closes the file even if a write raises.
with h5py.File('dataset/fasttext_train.hdf5', 'w') as train:
    train.create_dataset('x', data=x_train, dtype=np.int32)
    train.create_dataset('y', data=y_train, dtype=np.int32)

In [34]:
# Encode the test comments with the already-fitted vectorizer. The str cast
# happens here because the test frame was loaded without one.
test_counts = count_vectorizer.transform(test_data.comment_text.astype(str))
x_test = get_tokenized_text(test_counts, 200)

# Open explicitly in 'w' mode: without a mode, h5py >= 3.0 opens read-only
# (so create_dataset fails), and the older append default fails on a re-run
# because 'x' already exists. Note that 'w' overwrites any existing file.
# The context manager closes the file even if the write raises.
with h5py.File('dataset/fasttext_test.hdf5', 'w') as test:
    test.create_dataset('x', data=x_test, dtype=np.int32)