The data was downloaded from http://ai.stanford.edu/~amaas/data/sentiment/ (the `aclImdb` archive).

In [50]:
import os
import glob
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import pandas as pd

In [60]:
def files_to_dataframe(file_list):
    """Read text files into a single-column DataFrame of stripped lines.

    Every line of every file becomes one row; leading and trailing
    whitespace is removed from each line.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the text files to read. Rows appear in the order the
        files are given, and within a file in line order.

    Returns
    -------
    pandas.DataFrame
        A frame with one column named ``'review'``. It is empty when
        ``file_list`` is empty.
    """
    lines = []
    for file_name in file_list:
        # Decode explicitly so the result does not depend on the platform's
        # default encoding (assumes the input files are UTF-8 -- TODO confirm).
        with open(file_name, 'r', encoding='utf-8') as f:
            lines.extend(line.strip() for line in f)

    return pd.DataFrame(lines, columns=['review'])

In [19]:
def clean_review(review, remove_stopwords=False):
    """Normalize one raw review into lowercase, space-separated words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lowercase, split on whitespace, and optionally drop
    English stopwords.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When true, words found in NLTK's English stopword list are removed.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Use a space as the separator between text fragments so that words on
    # either side of a tag stay apart: 'good<br />movie' becomes 'good movie'
    # instead of being fused into 'goodmovie'.
    review_text = BeautifulSoup(review, 'html.parser').get_text(separator=' ')
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        # A set gives constant-time membership checks in the filter below.
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]

    return ' '.join(words)

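Organize the data into the format used in http://linanqiu.github.io/2015/10/07/word2vec-sentiment/: one cleaned review per line, in a separate text file for each split (train/test, positive/negative, and unlabeled).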

In [51]:
# Root directory of the extracted aclImdb dataset. Set the ACLIMDB_DIR
# environment variable to point at a different location; the original
# hard-coded path is kept as the fallback.
data_prefix = os.environ.get('ACLIMDB_DIR', '/home/gclenden/Documents/umap_testing/data/aclImdb/')

# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the order of the file lists reproducible between runs and machines.

# Get labeled training files
train_pos_files = sorted(glob.glob(os.path.join(data_prefix, 'train/pos/*.txt')))
train_neg_files = sorted(glob.glob(os.path.join(data_prefix, 'train/neg/*.txt')))
# Get labeled test files
test_pos_files = sorted(glob.glob(os.path.join(data_prefix, 'test/pos/*.txt')))
test_neg_files = sorted(glob.glob(os.path.join(data_prefix, 'test/neg/*.txt')))

# Get unlabeled dataset
unsup_files = sorted(glob.glob(os.path.join(data_prefix, 'train/unsup/*.txt')))

In [63]:
# Load every split into its own single-column ('review') DataFrame.
train_pos, train_neg, test_pos, test_neg, unlabeled = [
    files_to_dataframe(paths)
    for paths in (
        train_pos_files,
        train_neg_files,
        test_pos_files,
        test_neg_files,
        unsup_files,
    )
]

In [65]:
# Replace the raw text of each split with its cleaned form, in place.
for frame in (train_pos, train_neg, test_pos, test_neg, unlabeled):
    frame['review'] = frame['review'].apply(clean_review)



In [66]:
# Write one cleaned review per line into a text file per split.
# header=False is passed explicitly: depending on the pandas version,
# Series.to_csv may otherwise emit the column name 'review' as the first
# line, which any line-by-line reader would then treat as a review.
for frame, out_path in (
    (train_pos, 'train-pos.txt'),
    (train_neg, 'train-neg.txt'),
    (test_pos, 'test-pos.txt'),
    (test_neg, 'test-neg.txt'),
    (unlabeled, 'train-unsup.txt'),
):
    frame['review'].to_csv(out_path, index=False, header=False)

## Setting up doc2vec

In [67]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

import numpy as np
from random import shuffle

In [35]:
class LabeledLineSentence(object):
    """Turn line-per-document text files into tagged documents.

    Parameters
    ----------
    sources : dict
        Maps a file path to a tag prefix. Line ``i`` (0-based) of a file
        gets the single tag ``'<prefix>_<i>'``.

    Attributes
    ----------
    sources : dict
        The mapping passed to the constructor.
    sentences : list or None
        Documents materialized by ``to_array``; ``None`` until then.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled lazily by to_array(); None means "not materialized yet".
        self.sentences = None

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    # Lines are decoded from bytes (UTF-8 by default) and
                    # split on whitespace to form the word list.
                    yield LabeledSentence(line.decode().split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read all sources into a list, store it on ``self.sentences`` and return it."""
        # Reuse __iter__ so both code paths always build documents the same way.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the materialized documents in place and return that list.

        The documents are loaded first if ``to_array`` has not been called,
        instead of failing with ``AttributeError``.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [68]:
# Each cleaned-review file paired with the prefix passed along with it.
sources = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'train-unsup.txt': 'TRAIN_UNS',
}

sentences = LabeledLineSentence(sources)

In [69]:
%%time
# Keyword arguments handed to the Doc2Vec constructor.
doc2vec_params = dict(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=8,
)
model = Doc2Vec(**doc2vec_params)

# Materialize every document once, then build the vocabulary from that list.
tagged_docs = sentences.to_array()
model.build_vocab(tagged_docs)

CPU times: user 13.9 s, sys: 348 ms, total: 14.2 s
Wall time: 14.3 s


## Let's actually train

In [73]:
%%time
# Train on a shuffled copy of the document order for 20 epochs.
shuffled_docs = sentences.sentences_perm()
model.train(shuffled_docs, total_examples=model.corpus_count, epochs=20)

CPU times: user 45min 21s, sys: 20.6 s, total: 45min 42s
Wall time: 12min 46s


234701200

In [74]:
# Persist the trained model next to the notebook.
model_path = './imdb.d2v'
model.save(model_path)