# Model

In [4]:
# make model
# standard library
import os
from random import shuffle
# gensim
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
# numpy
import numpy
# classifier
from sklearn.linear_model import LogisticRegression

In [16]:
class LabeledLineSentence(object):
    """Turn the lines of several text files into tagged sentences.

    Every line of every source file becomes one ``LabeledSentence``: its
    words are the whitespace-separated tokens of the line and its single tag
    is ``'<prefix>_<line number>'``, with line numbers starting at 0 for
    each file.

    Parameters
    ----------
    sources : dict
        Maps a file name (resolved against ``base_dir``) to the tag prefix
        used for that file's lines. Prefixes must be unique across files.
    base_dir : str, optional
        Directory the file names are resolved against. Defaults to
        ``DEFAULT_BASE_DIR``, the location that used to be hard-coded.

    Raises
    ------
    Exception
        If two files are given the same prefix.
    """

    DEFAULT_BASE_DIR = '/home/ydw/capston/python/data/doc2vec/'

    def __init__(self, sources, base_dir=None):
        self.sources = sources
        self.base_dir = self.DEFAULT_BASE_DIR if base_dir is None else base_dir
        # Filled by to_array(); None means "not loaded yet".
        self.sentences = None

        # Tags are built from the prefix, so a repeated prefix would produce
        # colliding tags for different files.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line, file by file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(os.path.join(self.base_dir, source)) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source file and return (and cache) the sentences as a list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentences in place and return them.

        Loads the sentences first when ``to_array`` has not been called yet,
        instead of failing on a missing attribute.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [17]:
# Corpus file name -> tag prefix given to every line of that file.
sources = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'test-neu.txt': 'TEST_NEU',
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'train-neu.txt': 'TRAIN_NEU',
    "expert_100.txt": "TEST_UNSUP",
}
sentences = LabeledLineSentence(sources)

In [18]:
# Doc2Vec over all source files: keep every word (min_count=1), 10-word
# window, 300-dimensional vectors, 8 workers.
# NOTE(review): alpha == min_alpha keeps the learning rate constant for the
# whole run (no decay) -- confirm this is intended.
model = Doc2Vec(min_count=1, window=10, vector_size=300, workers=8, alpha=0.045, min_alpha=0.045)
model.build_vocab(sentences.to_array())
# Use the document count recorded by build_vocab instead of a hand-typed
# figure; the previous 2770*3 did not match the number of lines in the corpus.
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=30)



In [19]:
# Build the classifier's training matrix from the learned document vectors.
# Rows are laid out class by class: all POS rows, then NEG, then NEU.
N_TRAIN_PER_CLASS = 2270
# The class label is the position in this tuple: POS -> 0, NEG -> 1, NEU -> 2.
TRAIN_TAG_PREFIXES = ('TRAIN_POS_', 'TRAIN_NEG_', 'TRAIN_NEU_')

# Sizes are derived rather than typed out so they cannot drift apart.
train_arrays = numpy.zeros((N_TRAIN_PER_CLASS * len(TRAIN_TAG_PREFIXES), model.vector_size))
train_labels = numpy.zeros(len(train_arrays))

for label, tag_prefix in enumerate(TRAIN_TAG_PREFIXES):
    offset = label * N_TRAIN_PER_CLASS
    for i in range(N_TRAIN_PER_CLASS):
        train_arrays[offset + i] = model[tag_prefix + str(i)]
    train_labels[offset:offset + N_TRAIN_PER_CLASS] = label

In [20]:
# Build the held-out evaluation matrix from the learned document vectors.
# Rows are laid out class by class: all POS rows, then NEG, then NEU.
N_TEST_PER_CLASS = 581
# The class label is the position in this tuple: POS -> 0, NEG -> 1, NEU -> 2.
TEST_TAG_PREFIXES = ('TEST_POS_', 'TEST_NEG_', 'TEST_NEU_')

# Sizes are derived rather than typed out so they cannot drift apart.
test_arrays = numpy.zeros((N_TEST_PER_CLASS * len(TEST_TAG_PREFIXES), model.vector_size))
test_labels = numpy.zeros(len(test_arrays))

for label, tag_prefix in enumerate(TEST_TAG_PREFIXES):
    offset = label * N_TEST_PER_CLASS
    for i in range(N_TEST_PER_CLASS):
        test_arrays[offset + i] = model[tag_prefix + str(i)]
    test_labels[offset:offset + N_TEST_PER_CLASS] = label

In [21]:
# Fit a logistic-regression classifier on the training vectors, then show its
# score on the held-out test vectors as the cell output.
classifier = LogisticRegression().fit(train_arrays, train_labels)
classifier.score(test_arrays, test_labels)

0.61847389558232935

# Test (expert 100)

In [22]:
#expert 100
# Read the expert sentences, one list entry per line. The context manager
# closes the file even if reading fails.
with open('/home/ydw/capston/python/data/test/expert/expert_100.txt', 'r', encoding='utf-8') as f:
    text = f.read().splitlines()

In [23]:
#expert 100
# Numeric code for each single-letter annotation found in y.txt.
# NOTE(review): the training cells encode POS=0, NEG=1, NEU=2. If 'p'/'n'
# stand for positive/negative, this mapping is swapped relative to training
# and would depress the expert score -- confirm the meaning of the letters.
EXPERT_LABEL_CODES = {'l': 2.0, 'p': 1.0, 'n': 0.0}
# Unrecognised annotations get a numeric sentinel so that y stays all-float
# (the previous string '-1' mixed str and float labels in one list).
UNKNOWN_LABEL_CODE = -1.0

# The context manager closes the file even if reading fails.
with open('/home/ydw/capston/python/data/test/expert/y.txt', 'r', encoding='utf-8') as f:
    ry = f.read().splitlines()

y = [EXPERT_LABEL_CODES.get(raw_label, UNKNOWN_LABEL_CODE) for raw_label in ry]

In [24]:
#expert 100
# Zero-filled containers for the expert set: one row per sentence, one column
# per vector dimension. These rebind the names used for the earlier test split.
n_expert_rows, n_vector_dims = 99, 300
test_arrays = numpy.zeros((n_expert_rows, n_vector_dims))
test_labels = numpy.zeros(n_expert_rows)

In [25]:
#expert 100
# Copy the learned vector of every expert sentence into the feature matrix.
# The row count comes from the matrix itself rather than a repeated literal.
for i in range(test_arrays.shape[0]):
    test_arrays[i] = model['TEST_UNSUP_' + str(i)]

# Predict all rows in one call instead of one predict() call per row.
pre_y = list(classifier.predict(test_arrays))

In [26]:
# Score the classifier on the expert vectors against the codes built from y.txt.
classifier.score(X=test_arrays, y=y)

0.31313131313131315