<h1>Logistic Regression with distributed representation word embedding</h1>

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import find
import gensim
import tensorflow as tf

In [None]:
# Read the tab-separated rating files and take each frame's `label` column
# as a NumPy array to use as the classification target.
train = pd.read_csv('ratings_train.txt', sep='\t')
test = pd.read_csv('ratings_test.txt', sep='\t')
y_train, y_test = train['label'].values, test['label'].values

In [None]:
# Load a saved gensim Word2Vec model from ./vectors.bin.
model = gensim.models.Word2Vec.load('./vectors.bin')

In [None]:
# Keep a handle on the model's word-vector table; later cells look words up in it.
word_vectors = model.wv

In [None]:
# Load the pre-segmented sentences for both splits.
# NOTE(review): these are presumably ragged (object-dtype) arrays of token lists;
# NumPy >= 1.16.3 refuses to load such arrays unless allow_pickle=True is passed.
# Unpickling can execute arbitrary code, so only load .npy files you produced yourself.
train_segs = np.load('train_segs.npy', allow_pickle=True)
test_segs = np.load('test_segs.npy', allow_pickle=True)

In [None]:
# Sanity check: display the first entry of train_segs.
train_segs[0]

In [None]:
# Flatten train_segs into a single list containing every segment of every
# sentence, in corpus order (duplicates included).
word = [seg for sent in train_segs for seg in sent]

In [None]:
# Give every distinct training segment an integer id starting at 1 (id 0 is left
# free; a later cell assigns it to '<UNK>').
# dict.fromkeys de-duplicates while keeping first-occurrence order, so ids are
# reproducible across kernel restarts (set() order of strings is not).
word2num = {w: i for i, w in enumerate(dict.fromkeys(word), start=1)}
# Derive the reverse map from word2num itself so the two can never disagree.
num2word = {i: w for w, i in word2num.items()}

In [None]:
# Number of entries in the word -> id table.
len(word2num)

In [None]:
# Build the id -> vector lookup: embedding_vector[i] is the vector of the word whose
# id is i. Row 0 (the id used for '<UNK>') and every word absent from the pretrained
# vectors stay all-zero (a small random-normal init would be an alternative).
# Rows are placed by id rather than by iteration position, so re-running this cell
# after '<UNK>' has been added to word2num still yields the same table.
embed_dim = getattr(word_vectors, 'vector_size', 300)
n_rows = max(list(word2num.values()) + [0]) + 1
embedding_vector = [np.zeros(shape=embed_dim) for _ in range(n_rows)]
for w, idx in word2num.items():
    # `w in word_vectors` works on both gensim 3.x and 4.x; `.vocab` was removed in 4.0.
    if idx != 0 and w in word_vectors:
        embedding_vector[idx] = word_vectors[w]

In [None]:
# Number of rows in the embedding table (one per word id plus the zero row at index 0).
len(embedding_vector)

In [None]:
# Size of the pretrained vocabulary, for comparison with the table size above.
# NOTE(review): `.vocab` exists on gensim < 4.0 only; gensim 4 replaced it with
# `key_to_index` — confirm which gensim version this notebook targets.
len(word_vectors.vocab)

In [None]:
# Reserve id 0 for the out-of-vocabulary marker in both lookup directions.
word2num.update({'<UNK>': 0})
num2word.update({0: '<UNK>'})

In [None]:
def sent2index(segs, vocab=None, unk_token='<UNK>'):
    """Convert segmented sentences into lists of integer word ids.

    Args:
        segs: iterable of sentences, each an iterable of hashable tokens.
        vocab: token -> id mapping. When omitted, the notebook-level
            ``word2num`` is used, as before.
        unk_token: key in ``vocab`` whose id stands in for any token that is
            not in ``vocab``.

    Returns:
        A list with one list of ids per sentence, in input order.

    Raises:
        KeyError: if an out-of-vocabulary token is met and ``unk_token`` is
            not a key of ``vocab``.
    """
    if vocab is None:
        vocab = word2num  # fall back to the notebook-level vocabulary
    # The unknown-token id is looked up lazily, only when a token is missing.
    return [
        [vocab[seg] if seg in vocab else vocab[unk_token] for seg in sent]
        for sent in segs
    ]

In [None]:
# Map both splits from token sequences to id sequences (training split first).
train_idx, test_idx = map(sent2index, (train_segs, test_segs))

In [None]:
def idx2EmbedSum(idxSet, embedding_vector):
    """Represent each sentence as the element-wise sum of its word vectors.

    Args:
        idxSet: iterable of sentences, each an iterable of integer word ids.
        embedding_vector: sequence indexed by word id, each entry a 1-D vector.
            All entries are expected to share one length.

    Returns:
        A list with one float64 NumPy vector per sentence. A sentence with no
        ids yields an all-zero vector.

    Raises:
        IndexError: if an id is outside ``embedding_vector``.
    """
    # Take the width from the table instead of hard-coding 300, so tables of any
    # dimensionality work; an empty table keeps the historical 300-d zeros.
    dim = len(embedding_vector[0]) if len(embedding_vector) else 300
    sum_w2v = []
    for idxes in idxSet:
        total = np.zeros(shape=dim)
        for idx in idxes:
            total += embedding_vector[idx]
        sum_w2v.append(total)
    return sum_w2v

In [None]:
# Turn each split's id sequences into one summed-embedding feature vector per sentence.
X_train_w2v, X_test_w2v = (
    idx2EmbedSum(ids, embedding_vector) for ids in (train_idx, test_idx)
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a default-configured logistic regression on the summed-embedding features.
# fit() returns the estimator itself, so construction and fitting can be chained.
logReg = LogisticRegression().fit(X_train_w2v, y_train)
logReg

In [None]:
# Report the classifier's score on the training split, then on the test split.
for features, labels in ((X_train_w2v, y_train), (X_test_w2v, y_test)):
    print(logReg.score(features, labels))

In [None]:
# Hyperparameters for the TensorFlow cells that follow.
n_output = 2  # NOTE(review): presumably the number of label classes — confirm
n_embedding = 300  # same width as the 300-d vectors in embedding_vector
n_hidden = 128  # NOTE(review): not referenced in the visible cells — confirm it is needed
learning_rate = 1e-3
n_epoch = 10
batch_size = 64  # also used by the test cell to slice evaluation batches

In [None]:
# Clear the default TF graph so re-running the cells below starts from an empty graph.
tf.reset_default_graph()
# placeholder
# TODO: define the input placeholders here; the test cell feeds them as `X` and `Y`.



In [None]:
# weights & bias





In [None]:
# logits / hypothesis / cost / optimizer / prediction / correct / accuracy










In [None]:
# Open a session that allocates GPU memory on demand (allow_growth) rather than
# reserving it all up front, then initialise every variable in the graph.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
# train












In [None]:
# test
# Evaluate in mini-batches of at most batch_size rows. Each batch accuracy is
# weighted by the batch's row count, so an uneven final batch no longer skews the
# result, and a test set smaller than batch_size no longer yields zero batches
# (which previously ended in a division by zero).
# NOTE(review): assumes `accuracy` evaluates to the mean accuracy over the fed
# batch and that `X` / `Y` accept a variable batch dimension — confirm.
n_test = len(X_test_w2v)
weighted_acc = 0.0
for start in range(0, n_test, batch_size):
    batch_xs = X_test_w2v[start:start + batch_size]
    batch_ys = y_test[start:start + batch_size]
    acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys})
    weighted_acc += acc * len(batch_xs)
# test_acc now holds the overall accuracy (NaN when there is nothing to evaluate).
test_acc = weighted_acc / n_test if n_test else float('nan')
print('Accuracy: ', '{:.3f}'.format(test_acc))