In [1]:
import pandas as pd

from evaluate import TARGET_NAMES
from data import get_labels
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Using TensorFlow backend.


## Setup documents

In [2]:
train_data = pd.read_csv('dataset/preprocessed_train.csv').dropna().astype(str)
test_data = pd.read_csv('dataset/preprocessed_test.csv').dropna().astype(str)

In [3]:
labels = train_data[TARGET_NAMES]

In [4]:
tags = []

for i in range(labels.shape[0]):
    x = labels.iloc[i, :]
    tag = '_'.join([index for name, index in zip(x, x.index) if name])
    
    if tag == '':
        tag = 'none'
        
    tags.append(tag)

In [5]:
documents = []

for document, tag in zip(train_data.comment_text, tags):
    doc = TaggedDocument(document.split(), [tag])
    documents.append(doc)

In [6]:
dm_model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4, iter=20, dm=1)

In [7]:
dbow_model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4, iter=20, dm=0)

In [8]:
dm_model.save('doc2vec/dm_model.d2v')
dbow_model.save('doc2vec/dbow_model.d2v')

In [9]:
import h5py
import numpy as np

In [10]:
train_df = h5py.File('dataset/d2v_train.hdf5')

In [11]:
def create_concat_matrix(documents, dm_model, dbow_model):
    d2v_matrix = np.zeros((len(documents), dm_model.vector_size + dbow_model.vector_size))
    documents = documents.str.split()
    
    for i, document in enumerate(documents):
        dbow_vector = dbow_model.infer_vector(document)
        dm_vector = dm_model.infer_vector(document)
        
        d2v_matrix[i, :dbow_model.vector_size] = dbow_vector
        d2v_matrix[i, dbow_model.vector_size:] = dm_vector
        
    return d2v_matrix

In [12]:
train_matrix = create_concat_matrix(train_data.comment_text, dm_model, dbow_model)

In [13]:
train_df.create_dataset(name='x', data=train_matrix, dtype=np.float32)

RuntimeError: Unable to create link (name already exists)

In [None]:
labels = get_labels(train_data)
train_df.create_dataset(name='y', data=labels, dtype=np.int32)

In [None]:
train_df.close()

In [None]:
test_matrix = create_concat_matrix(test_data.comment_text, dm_model, dbow_model)

In [None]:
test_df = h5py.File('dataset/d2v_test.hdf5')

In [None]:
test_df.create_dataset(data=test_matrix, dtype=np.float32, name='x')

In [None]:
test_df.close()