In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [3]:
class LabeledLineSentence(object):
    """Labelled documents read line-by-line from a set of text files.

    Each line of each source file becomes one ``LabeledSentence`` whose words
    are the whitespace-split line and whose single tag is
    ``'<prefix>_<line number>'``.

    Args:
        sources: mapping of file path -> tag prefix. Every prefix must be
            unique, otherwise tags from different files would collide.

    Raises:
        ValueError: if two files share the same prefix (``ValueError`` is an
            ``Exception`` subclass, so existing ``except Exception`` handlers
            keep working).
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled in by to_array(); initialised here so sentences_perm() can
        # tell "not loaded yet" apart from a missing attribute.
        self.sentences = None

        # make sure that prefixes are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _labeled_lines(self):
        """Yield one LabeledSentence per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the corpus from disk; every call starts a fresh pass."""
        return self._labeled_lines()

    def to_array(self):
        """Read the whole corpus into memory, cache it on ``self.sentences`` and return it."""
        self.sentences = list(self._labeled_lines())
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached corpus.

        The corpus is loaded on demand if ``to_array()`` has not been called
        yet. The cached list itself is never reordered.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [4]:
# Corpus files mapped to the tag prefix given to every line read from them.
sources = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'train-unsup.txt': 'TRAIN_UNS',
}

sentences = LabeledLineSentence(sources)

In [7]:
# Doc2Vec hyper-parameters; size=128 must match the embedding width the
# feature tensors and the Conv1D input_shape use further down.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=128,
    sample=1e-4,
    negative=5,
    workers=7,
)

# to_array() loads every source file into memory and caches the result on `sentences`.
model.build_vocab(sentences.to_array())

In [8]:
# Sum len() over every labelled sentence streamed from the source files.
# NOTE(review): each item is indexed elsewhere as a (words, tags) pair, so
# len(sentence) is presumably the number of fields (2) rather than the number
# of words -- confirm before treating this as a real token count.
token_count = 0
for sentence in sentences:
    token_count += len(sentence)
print(token_count)

100006


In [9]:
# Train on a shuffled copy of the corpus for 15 epochs.
# total_examples has to be the number of documents in the corpus; the value
# recorded by build_vocab() (model.corpus_count) is used instead of the
# hand-computed `token_count`, which is not a document count.
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=15)

87261865

In [10]:
# Sanity check of the learned word vectors: list the words closest to 'good'.
model.most_similar('good')

[('great', 0.6629217863082886),
 ('decent', 0.6190322637557983),
 ('nice', 0.5878570079803467),
 ('fine', 0.5873998403549194),
 ('bad', 0.5638591051101685),
 ('solid', 0.5198214650154114),
 ('excellent', 0.49024176597595215),
 ('workable', 0.4608621895313263),
 ('alright', 0.45717692375183105),
 ('terrific', 0.4457021653652191)]

In [11]:
# Inspect the raw embedding vector learned for a single word.
model['nice']

array([ -2.37049055e+00,  -3.71870637e-01,   1.71233439e+00,
        -3.49776626e-01,   1.46382880e+00,   5.57693601e-01,
         1.70993924e+00,   1.20419049e+00,  -1.01734400e+00,
         1.73735666e+00,   1.28781945e-01,  -8.40478241e-01,
        -4.64325696e-01,  -6.96763396e-02,   1.57788157e+00,
        -3.61901671e-01,   6.56533360e-01,  -1.29465508e+00,
         4.51561004e-01,   3.26383471e-01,  -1.20201170e+00,
        -3.75328362e-01,   1.68469989e+00,  -1.55048206e-01,
         8.96417975e-01,   6.68684781e-01,   1.80388466e-01,
        -5.56801930e-02,   3.49873334e-01,   4.28469814e-02,
         2.35215783e+00,  -8.74150872e-01,   7.81498432e-01,
         3.36278200e-01,  -1.74268186e-02,  -3.44305158e-01,
         6.71895742e-02,  -6.04222119e-01,   7.18841553e-01,
        -1.32454491e+00,  -1.50396180e+00,  -2.20189977e+00,
         4.82800715e-02,   7.98003748e-02,  -7.61350334e-01,
         2.38305733e-01,   2.84754246e-01,  -2.27010295e-01,
        -4.15499091e-01,

In [12]:
# Persist the trained Doc2Vec model next to the notebook.
model.save('./imdb.d2v')

In [13]:
# Reload the saved Doc2Vec model from disk; this rebinds `model`.
model = Doc2Vec.load('./imdb.d2v')

In [14]:
# Index every document's word list by its tag (e.g. 'TEST_NEG_0').
# Each entry of sentences_array is addressed as (words, tags); the first tag is the key.
sentences_array = sentences.to_array()
sentences_dict = {doc[1][0]: doc[0] for doc in sentences_array}

print(sentences_dict['TEST_NEG_0'])

['once', 'again', 'mr', 'costner', 'has', 'dragged', 'out', 'a', 'movie', 'for', 'far', 'longer', 'than', 'necessary', 'aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', 'of', 'which', 'there', 'are', 'very', 'few', 'i', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', 'most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', 'and', 'costner', 's', 'character', 'are', 'realized', 'early', 'on', 'and', 'then', 'forgotten', 'until', 'much', 'later', 'by', 'which', 'time', 'i', 'did', 'not', 'care', 'the', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'a', 'very', 'cocky', 'overconfident', 'ashton', 'kutcher', 'the', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'he', 's', 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'a', 'cluttered', 'closet', 'his', 'only', 'obstacle', 'appears', 'to', 'be', 'winning', 'over', 'costner', 'finally', 'when', 'we', 'are', 'well', 'p

In [15]:
import numpy as np  # duplicate of the first cell's import; kept so this cell still runs on its own

# Length (in words) of the longest labelled training review.
# The word list's own length is used directly: looking up an embedding matrix
# for every review just to read its row count does far more work for the same
# number, and fails on an empty review.
count = max(
    len(sentences_dict[prefix + str(i)])
    for i in range(12500)  # 12500 reviews per class
    for prefix in ('TRAIN_POS_', 'TRAIN_NEG_')
)

print(count)

2494


In [16]:
# Build the training tensors: one (1000, 128) matrix of word vectors per
# review, truncated / zero-padded to 1000 words, plus a 0/1 sentiment label.
# Rows 0..12499 hold the positive reviews, rows 12500..24999 the negative ones.
N_PER_CLASS = 12500  # reviews per sentiment class
MAX_WORDS = 1000     # words kept per review
EMBED_DIM = 128      # width of one word vector

# Allocate directly as float32: np.zeros(...).astype(np.float32) would first
# build a float64 array twice this size and then copy it.
X_train = np.zeros(shape=(2 * N_PER_CLASS, MAX_WORDS, EMBED_DIM), dtype=np.float32)
Y_train = np.zeros(shape=(2 * N_PER_CLASS, 1), dtype=np.float32)

for i in range(N_PER_CLASS):
    for row, prefix, label in ((i, 'TRAIN_POS_', 1), (N_PER_CLASS + i, 'TRAIN_NEG_', 0)):
        words = sentences_dict[prefix + str(i)][:MAX_WORDS]
        if words:
            # Positions past the end of the review keep their initial zeros (padding).
            X_train[row, :len(words), :] = np.asarray([model[word] for word in words])
        Y_train[row, :] = label

In [17]:
# Build the test tensors: one (1000, 128) matrix of word vectors per review,
# truncated / zero-padded to 1000 words, plus a 0/1 sentiment label.
# Rows 0..12499 hold the positive reviews, rows 12500..24999 the negative ones.
N_PER_CLASS = 12500  # reviews per sentiment class
MAX_WORDS = 1000     # words kept per review
EMBED_DIM = 128      # width of one word vector

# Allocate directly as float32: np.zeros(...).astype(np.float32) would first
# build a float64 array twice this size and then copy it.
X_test = np.zeros(shape=(2 * N_PER_CLASS, MAX_WORDS, EMBED_DIM), dtype=np.float32)
Y_test = np.zeros(shape=(2 * N_PER_CLASS, 1), dtype=np.float32)

for i in range(N_PER_CLASS):
    for row, prefix, label in ((i, 'TEST_POS_', 1), (N_PER_CLASS + i, 'TEST_NEG_', 0)):
        words = sentences_dict[prefix + str(i)][:MAX_WORDS]
        if words:
            # Positions past the end of the review keep their initial zeros (padding).
            X_test[row, :len(words), :] = np.asarray([model[word] for word in words])
        Y_test[row, :] = label

In [19]:
# CNN + LSTM sentiment classifier over the (1000, 128) word-vector sequences.
# NOTE: this rebinds `model`, which held the Doc2Vec model until now; the
# feature-building cells above must be run before this one.
model = Sequential()

# Four identical Conv1D -> Dropout -> MaxPooling1D -> Dropout stages; only the
# first layer of the network declares the input shape.
for stage in range(4):
    conv_kwargs = {'input_shape': (1000, 128)} if stage == 0 else {}
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu', **conv_kwargs))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=3))
    model.add(Dropout(0.2))

model.add(LSTM(300, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# NOTE(review): the test set doubles as validation data here, so the accuracy
# reported by the later evaluate() call is measured on data already monitored
# during training.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 1000, 32)          12320     
_________________________________________________________________
dropout_9 (Dropout)          (None, 1000, 32)          0         
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 333, 32)           0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 333, 32)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 333, 32)           3104      
_________________________________________________________________
dropout_11 (Dropout)         (None, 333, 32)           0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 111, 32)           0         
__________

<keras.callbacks.History at 0x14bd15fd0>

In [20]:
# Final evaluation of the model
# Score the trained network on the test tensors.
# scores is presumably [loss, accuracy], following the metrics passed to compile().
scores = model.evaluate(X_test, Y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 90.09%
