In [1]:
import pandas as pd

from evaluate import TARGET_NAMES
from data import get_labels
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Using TensorFlow backend.


## Setup documents

In [73]:
train_data = pd.read_csv('dataset/preprocessed_train.csv').dropna().astype(str)
test_data = pd.read_csv('dataset/preprocessed_test.csv').dropna().astype(str)

In [3]:
labels = train_data[TARGET_NAMES]

In [4]:
tags = []

for i in range(labels.shape[0]):
    x = labels.iloc[i, :]
    tag = '_'.join([index for name, index in zip(x, x.index) if name])
    
    if tag == '':
        tag = 'none'
        
    tags.append(tag)

In [5]:
documents = []

for document, tag in zip(train_data.comment_text, tags):
    doc = TaggedDocument(document.split(), [tag])
    documents.append(doc)

In [6]:
dm_model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4, iter=20, dm=1)

In [7]:
dbow_model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4, iter=20, dm=0)

In [8]:
dm_model.save('doc2vec/dm_model.d2v')
dbow_model.save('doc2vec/dbow_model.d2v')

In [9]:
import h5py
import numpy as np

In [78]:
train_df = h5py.File('dataset/d2v_train.hdf5')

In [79]:
def create_concat_matrix(documents, dm_model, dbow_model):
    d2v_matrix = np.zeros((len(documents), dm_model.vector_size + dbow_model.vector_size))
    documents = documents.str.split()
    
    for i, document in enumerate(documents):
        dbow_vector = dbow_model.infer_vector(document)
        dm_vector = dm_model.infer_vector(document)
        
        d2v_matrix[i, :dbow_model.vector_size] = dbow_vector
        d2v_matrix[i, dbow_model.vector_size:] = dm_vector
        
    return d2v_matrix

In [80]:
test_data = pd.read_csv('dataset/preprocessed_test.csv').astype(str)

In [81]:
train_matrix = create_concat_matrix(train_data.comment_text, dm_model, dbow_model)

In [82]:
train_df.create_dataset(name='x', data=train_matrix, dtype=np.float32)

<HDF5 dataset "x": shape (95848, 200), type "<f4">

In [83]:
labels = get_labels(train_data)
train_df.create_dataset(name='y', data=labels, dtype=np.int32)

<HDF5 dataset "y": shape (95848, 6), type "<i4">

In [84]:
train_df.close()

In [85]:
test_matrix = create_concat_matrix(test_data.comment_text, dm_model, dbow_model)

In [86]:
test_df = h5py.File('dataset/d2v_test.hdf5')

In [87]:
test_df.create_dataset(data=test_matrix, dtype=np.float32, name='x')

<HDF5 dataset "x": shape (226998, 200), type "<f4">

In [88]:
test_df.close()

## Train Logistic Regression

In [89]:
from sklearn.linear_model import LogisticRegression

In [90]:
preds = []
for i in range(labels.shape[1]):
    y = labels[:, i]
    logistic_model = LogisticRegression()
    logistic_model.fit(train_matrix, y)
    probs = logistic_model.predict_proba(test_matrix)
    preds.append(pd.Series(probs[:, 1], dtype=np.float64))

In [91]:
test_data = pd.read_csv('dataset/preprocessed_test.csv')

In [92]:
concat_df = [test_data.id]

In [93]:
concat_df.extend(preds)

In [94]:
sub_df = pd.concat(concat_df, axis=1)

In [95]:
sub_df.columns

Index(['id', 0, 1, 2, 3, 4, 5], dtype='object')

In [96]:
sub_df_columns = ['id']
sub_df_columns.extend(TARGET_NAMES)

In [97]:
sub_df.columns = sub_df_columns

In [98]:
sub_df.to_csv('d2v_submission.csv', index=False)

In [99]:
sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,4.794003e-02,6.460441e-03,1.474216e-02,8.530962e-03,2.419857e-02,5.693080e-03
1,6102620,4.262466e-02,1.734371e-04,1.268481e-02,4.643677e-04,1.063094e-02,1.594593e-04
2,14563293,3.609079e-03,6.669227e-04,9.924517e-04,1.960230e-04,9.286990e-04,2.292052e-05
3,21086297,6.130682e-02,5.020155e-03,2.643462e-02,3.525478e-03,2.492630e-02,7.713211e-03
4,22982444,9.810989e-02,2.083104e-02,5.284168e-02,8.163487e-03,7.423603e-02,2.787389e-02
5,24388733,9.505095e-06,2.914271e-05,1.281905e-07,9.615618e-14,6.192457e-07,2.278088e-11
6,26195914,1.721552e-02,3.367874e-03,4.021124e-03,3.669006e-04,5.595502e-03,2.986386e-04
7,31769073,1.136353e-01,1.221471e-02,4.771210e-02,3.973580e-03,5.296033e-02,4.958405e-03
8,35289443,1.362125e-01,1.948548e-02,8.565362e-02,6.837186e-03,8.923001e-02,2.454800e-02
9,38393350,2.791931e-05,1.982605e-05,2.969131e-06,9.281217e-07,1.450536e-05,3.005735e-07
