## Convert Documents to Vectors

In [1]:
import sys, argparse
sys.path.append('..')
import helper
import numpy as np

import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

  return f(*args, **kwds)


### 1. Data preprocess

In [2]:
# load data from disk
pos_examples = [s.decode("utf-8", "ignore").strip() 
                for s in list(open(helper.mr_pos_data, mode="rb").readlines())]
neg_examples = [s.decode("utf-8", "ignore").strip() 
                for s in list(open(helper.mr_neg_data, mode="rb").readlines())]
pos_nums, neg_nums = len(pos_examples), len(neg_examples)

documents = pos_examples + neg_examples
documents = [gensim.utils.simple_preprocess(doc) for doc in documents]

pos_labels = [1 for _ in range(pos_nums)]
neg_labels = [0 for _ in range(neg_nums)]

labels = np.array(pos_labels + neg_labels)

In [3]:
def preprocess_corpus(documents, labels, split=0.1):
    n_samples = len(documents)
    split_ = int(n_samples * split)
    
    shuffle_indices = np.random.permutation(n_samples)
    corpus = [documents[i] for i in shuffle_indices]
    labels  = [labels[i] for i in shuffle_indices]
    
    train_corpus, train_labels = corpus[split_:], labels[split_:]
    test_corpus, test_labels = corpus[split_:], labels[split_:]
    train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_corpus)]
    return train_corpus, train_labels, test_corpus, test_labels

train_corpus, train_labels, test_corpus, test_labels = preprocess_corpus(documents, labels)

### 2. Training a doc2vec on movie review dataset

The API provided by Gensim trains a doc2vec model by using the method illustrated in paper Le and Mikolov et al.

In [4]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 38 s, sys: 6.68 s, total: 44.7 s
Wall time: 26.3 s


### 3. Training a linear classifier

In [5]:
from sklearn.svm import LinearSVC
from sklearn import metrics

# train a linear classify
X_train, y_train = model.docvecs.vectors_docs, train_labels

svc = LinearSVC()
svc.fit(X_train, y_train)

X_test = [model.infer_vector(doc) for doc in test_corpus]
y_test = test_labels

y_predicted = svc.predict(X_test)
print(metrics.classification_report(y_predicted, y_test))

             precision    recall  f1-score   support

          0       0.69      0.68      0.68      4823
          1       0.68      0.69      0.68      4773

avg / total       0.68      0.68      0.68      9596

