# Sentiment Analysis using Doc2Vec

In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

In [2]:
class LabeledLineSentence(object):
    """Corpus of tagged documents read line by line from several text files.

    Every line of every source file becomes one ``LabeledSentence`` whose
    words are the whitespace-split tokens of the line and whose single tag is
    ``'<prefix>_<line number>'``.

    Args:
        sources: Mapping of file path -> tag prefix. Prefixes must be unique,
            otherwise tags generated for different files would collide.

    Raises:
        ValueError: If two sources share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means the corpus has not been loaded yet.
        self.sentences = None

        # make sure that prefixes are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _read_source(self, source, prefix):
        """Yield one LabeledSentence per line of `source`, tagged `<prefix>_<n>`."""
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the tagged sentences of every source without caching them."""
        for source, prefix in self.sources.items():
            for sentence in self._read_source(source, prefix):
                yield sentence

    def to_array(self):
        """Read every source into memory, cache the list on `self.sentences` and return it."""
        self.sentences = [
            sentence
            for source, prefix in self.sources.items()
            for sentence in self._read_source(source, prefix)
        ]
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the cached sentences.

        The cached list itself is left in its original order. If the corpus
        has not been loaded yet it is loaded first, so this method no longer
        fails when called before `to_array()`.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [3]:
# Full source mapping (file path -> tag prefix), kept for reference; it lists every
# prefix that later cells look up (TRAIN_POS/TRAIN_NEG/TEST_POS/TEST_NEG/TEST_TITLE/TEST_NEWS/TEST_MS):
# sources = {'bignews.txt':'TEST_NEWS','mstweet.txt':"TEST_MS",'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS','titletest.txt':'TEST_TITLE', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}
# NOTE(review): only 'clean.txt' is loaded here, so a model trained from this cell
# only knows TEST_MS_* tags; cells that index TRAIN_*/TEST_POS/... tags need the
# full mapping above (or a previously saved model) -- confirm which is intended.
sources = {'clean.txt':"TEST_MS"}
sentences = LabeledLineSentence(sources)

In [4]:
# Configure an untrained Doc2Vec model producing 200-dimensional vectors
# (the later numpy.zeros((..., 200)) arrays rely on this size).
# NOTE(review): workers=7 is hard-coded and no seed is set, so results are
# presumably not reproducible run to run -- confirm whether that matters.
model = Doc2Vec(min_count=1, window=10, vector_size=200, sample=1e-4, negative=5, workers=7)

# to_array() reads the whole corpus into memory and caches it on `sentences`,
# which sentences_perm() in the training cell depends on.
model.build_vocab(sentences.to_array())



In [5]:
max_epochs = 5

# Each outer pass trains on a freshly shuffled copy of the corpus.
# `model.epochs` replaces the deprecated `model.iter` alias (same value, no
# deprecation warning). Total passes over the data = max_epochs * model.epochs.
for epoch in range(max_epochs):
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.epochs)

  after removing the cwd from sys.path.


In [99]:
# 50 nearest words to 'xbox' as (word, similarity) pairs. Queried through the
# word-vector container `model.wv`; calling most_similar directly on the model
# is deprecated and emits a warning.
a = model.wv.most_similar('xbox', topn=50)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


In [100]:
# Sanity check: the similarity query result is a plain Python list.
type(a)

list

In [101]:
# Peek at the second hit: a (word, similarity score) tuple.
a[1]

('to', 0.9999680519104004)

In [102]:
# Keep only the matched words longer than three characters; scores are dropped.
res = [hit[0] for hit in a if len(hit[0]) > 3]
print(res)

['Microsoft', 'Google', 'Microsofts', 'with', 'Bing', 'Windows', 'kauft', 'Office', 'that', 'Corp', 'photo', 'Security', 'their', 'Phone', 'search', 'Maps', 'COFEE', 'July', 'computing', 'Silverlight', 'Black', 'Wien', 'screen', 'online', 'Dynamics', 'users', 'xboxer', 'Stadt']


In [103]:
# Second word that survived the length filter.
res[1]

'Google'

In [104]:
import csv

# NOTE(review): `headers` is never written to the file -- confirm whether a
# header row was intended before relying on microsoft.csv downstream.
headers = ['words']

# Map every selected word to the constant weight 2 (duplicates collapse).
k = dict.fromkeys(res, 2)
print(k)

# newline='' is required by the csv module; without it rows get an extra
# blank line on platforms that translate '\n' to '\r\n'.
with open('microsoft.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    # Iterating a dict yields its keys, so this writes the words only, as a
    # single row; the constant values are not written. Made explicit here.
    f_csv.writerow(list(k))

{'Microsoft': 2, 'Google': 2, 'Microsofts': 2, 'with': 2, 'Bing': 2, 'Windows': 2, 'kauft': 2, 'Office': 2, 'that': 2, 'Corp': 2, 'photo': 2, 'Security': 2, 'their': 2, 'Phone': 2, 'search': 2, 'Maps': 2, 'COFEE': 2, 'July': 2, 'computing': 2, 'Silverlight': 2, 'Black': 2, 'Wien': 2, 'screen': 2, 'online': 2, 'Dynamics': 2, 'users': 2, 'xboxer': 2, 'Stadt': 2}


In [105]:
# Dump the word -> weight mapping as "word,weight" lines, one per entry.
with open('file.csv', 'w') as f:
    f.writelines('{0},{1}\n'.format(key, value) for key, value in k.items())

We can also prop the hood open and see what the model actually contains: the vectors for each of the words and sentences in the model. We can access all of them using `model.syn0` (for the geekier ones among you, `syn0` is the weight matrix of the hidden/projection layer of the shallow neural network, i.e. the learned embeddings, rather than its output layer). However, we don't want to use the entire `syn0`, since it contains the vectors for the words as well, and we are only interested in the ones for sentences.

Here's a sample vector for the first sentence in the training set for negative reviews:

In [7]:
# Look up the vector stored under the tag 'TRAIN_NEG_0'.
# NOTE(review): that tag only exists if a source with prefix 'TRAIN_NEG' was
# part of the training corpus -- confirm against the `sources` mapping in use.
model['TRAIN_NEG_0']

array([-0.00346888, -0.1065565 , -0.04915281, -0.20224075,  0.1978923 ,
       -0.04282485, -0.43660057, -0.30959257, -0.4112031 , -0.19187859,
       -0.14489022,  0.01105775, -0.21810488,  0.5086233 , -0.16626711,
        0.02607804,  0.5183548 ,  0.24627733,  0.02160478,  0.38943416,
        0.18184488, -0.27163163, -0.06424608,  0.40741512, -0.00364409,
        0.05184645, -0.25310647,  0.19860412, -0.8848717 , -0.04368122,
       -0.65877753,  0.4816111 , -0.01869016,  0.08200007, -0.24889433,
        0.41303572,  0.02705549,  0.07660832,  0.01951412,  0.13831238,
        0.02055542, -0.01649992,  0.90899366, -0.07222486, -0.19460824,
        0.04820708, -0.57059723, -0.11099932, -0.3158354 , -0.19011644,
        0.27192736, -0.08092405, -0.20322415,  0.34278238,  0.25435397,
       -0.43429902, -0.56149405, -0.20335905,  0.14805357, -0.20239778,
       -0.01001655,  0.29398334, -0.07481223, -0.16852838, -0.26088372,
        0.2910888 , -0.29018223, -0.34832418, -0.28275296, -0.05

In [8]:
# Persist the trained model to ./imdb.d2v so it can be reloaded without retraining.
model.save('./imdb.d2v')

And load it.

In [9]:
# Rebind `model` to the copy read back from ./imdb.d2v (replaces the in-memory model).
model = Doc2Vec.load('./imdb.d2v')

In [10]:
# Feature matrix: one 200-dimensional row per training review.
train_arrays = numpy.zeros((25000, 200))
# Labels: rows 0..12499 are positive (1), rows 12500..24999 are negative (0).
train_labels = numpy.concatenate((numpy.ones(12500), numpy.zeros(12500)))

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    # Positive vector goes in the first half, the matching negative in the second.
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[12500 + i] = model[prefix_train_neg]

The training array looks like this: rows and rows of vectors representing each sentence.

In [11]:
# Show the assembled training matrix (numpy abbreviates large arrays with "...").
print (train_arrays)

[[ 0.0823211   0.10839429  0.16711673 ... -0.51364815 -0.0026078
  -0.18761201]
 [-0.21725507  0.30024245  0.41205418 ... -0.75862747 -0.24413089
   0.14139348]
 [-0.17174432  0.38831535 -0.3027555  ...  0.0689256   0.24885336
  -0.04213567]
 ...
 [-0.27201086 -0.06924483  0.39688212 ...  0.46984157 -0.36155355
   0.13365436]
 [ 0.19686988 -0.34591311 -0.08938753 ... -0.26868853 -0.51145691
   0.10289862]
 [-0.7026062  -0.08596174 -0.11938912 ... -0.07093173  0.00833835
  -0.14559741]]


The labels are simply category labels for the sentence vectors -- 1 representing positive and 0 for negative.

In [12]:
# `np` is a second alias for the numpy module already imported as `numpy` in the first cell.
import numpy as np

# Raise the element-count threshold above which numpy summarises printed
# arrays, so the 25000-element label/prediction vectors print in full.
np.set_printoptions(threshold = 1e6)

# print (train_labels)

In [13]:
# Feature matrix: one 200-dimensional row per test review.
test_arrays = numpy.zeros((25000, 200))
# Labels: rows 0..12499 are positive (1), rows 12500..24999 are negative (0).
test_labels = numpy.concatenate((numpy.ones(12500), numpy.zeros(12500)))

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    # Positive vector goes in the first half, the matching negative in the second.
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[12500 + i] = model[prefix_test_neg]

In [14]:
# Loop variable left over from the test-array cell: the last negative test tag built.
print(prefix_test_neg)

TEST_NEG_12499


In [15]:
# Print the vector stored under the last negative test tag.
print((model['TEST_NEG_12499']))

[-0.59569895 -0.8965415   0.07935757 -0.00659682 -0.95527875 -0.15965545
 -0.22024347 -0.71725464 -0.00278209  0.6489274   0.02783076  0.41254148
 -0.4857486   1.0076189   0.05151547  1.4852124   0.39688328  0.75160784
  0.9116853  -0.7418756  -0.33589527 -0.24818738  0.28594697  0.14713639
 -0.378412    0.17280717 -0.01326428  0.9041615  -0.88265     0.6148145
  0.14030224 -0.23786952  0.46584538  0.03714736 -1.8591859   0.57720435
  0.7210388   1.2823783   0.25294775 -0.63693976  0.11891571 -0.47325477
  0.58119774  0.32715112  0.01301159 -0.01432207 -0.45809492  0.22264095
  0.27674446 -0.06028048  0.34679613  0.12709422  0.35866034  0.721831
 -0.03781616 -0.5956087  -0.39582577 -0.18351023  0.8806755  -0.4831504
 -0.5015606   0.727674    0.13093314 -0.29791042  0.00671306  0.07178222
  0.6617844   0.63425034 -1.0808659  -0.39216593 -0.17100728  0.4626034
  0.05254643 -0.00289082 -0.0305197   2.2216089  -0.42940402 -0.41609
  0.51682174  1.1094373   0.5027408  -1.2769161  -0.6250945

### Classification

Now we train a logistic regression classifier using the training data.

In [16]:
# Fit a logistic-regression classifier with scikit-learn's default settings,
# using the document vectors as features and the 0/1 sentiment labels as targets.
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

And find that we have achieved about 86% accuracy for sentiment analysis. This is rather incredible, given that we are only using a logistic regression classifier and a very shallow neural network.

In [17]:
# Mean accuracy of the fitted classifier on the held-out test vectors.
classifier.score(test_arrays, test_labels)

0.8608

In [18]:
# Predicted labels for every test row (1 = positive, 0 = negative, matching test_labels).
predictions=classifier.predict(test_arrays)
print(predictions)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.

Isn't this fantastic? Hope I saved you some time!

## References

- Doc2vec: https://radimrehurek.com/gensim/models/doc2vec.html
- Paper that inspired this: http://arxiv.org/abs/1405.4053

In [19]:
# One 200-dimensional row per TEST_TITLE_<i> document vector.
predict_title_arrays = numpy.zeros((13671, 200))

for i in range(len(predict_title_arrays)):
    prefix_test_title = 'TEST_TITLE_' + str(i)
    predict_title_arrays[i] = model[prefix_test_title]

In [20]:
# Classify every title vector with the sentiment classifier and show the 0/1 predictions.
titlepredictions=classifier.predict(predict_title_arrays)
print(titlepredictions)


[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

In [22]:
# One 200-dimensional row per TEST_NEWS_<i> document vector.
predict_bignews_arrays = numpy.zeros((6, 200))

for i in range(len(predict_bignews_arrays)):
    prefix_test_bignews = 'TEST_NEWS_' + str(i)
    predict_bignews_arrays[i] = model[prefix_test_bignews]
    

In [23]:
# Classify every news vector with the sentiment classifier and show the 0/1 predictions.
bignewspredictions=classifier.predict(predict_bignews_arrays)
print(bignewspredictions)


[0. 1. 0. 1. 0. 0.]


In [24]:
# One 200-dimensional row per TEST_MS_<i> document vector.
predict_ms_arrays = numpy.zeros((88, 200))

for i in range(len(predict_ms_arrays)):
    prefix_test_ms = 'TEST_MS_' + str(i)
    predict_ms_arrays[i] = model[prefix_test_ms]

In [25]:
# Classify every TEST_MS vector with the sentiment classifier and show the 0/1 predictions.
mspredictions=classifier.predict(predict_ms_arrays)
print(mspredictions)

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
