<h1>Logistic Regression with distributed representation word embedding</h1>

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import find
import gensim
import tensorflow as tf



In [2]:
train = pd.read_csv('ratings_train.txt', delimiter='\t')
test = pd.read_csv('ratings_test.txt', delimiter='\t')
y_train = train.label.values
y_test = test.label.values

In [3]:
model = gensim.models.Word2Vec.load('./vectors.bin')

In [4]:
word_vectors = model.wv

In [5]:
train_segs = np.load('train_segs.npy')
test_segs = np.load('test_segs.npy')

In [6]:
train_segs[0]

['아', '더빙', '..', '진짜', '짜증', '나네', '요', '목소리']

In [7]:
word = []
for sent in train_segs:
    for seg in sent:
        word.append(seg)

In [42]:
word2num = {w:(i+1) for i,w in enumerate(set(word))}
num2word = {(i+1):w for i,w in enumerate(set(word))}

In [43]:
len(word2num)

61589

In [44]:
embedding_vector = [np.zeros(shape=300)]
for i, w in enumerate(word2num.keys()):
    if w not in word_vectors.vocab:
        embedding_vector += [np.zeros(shape=300)] #[np.random.normal(scale=1e-2, size=300)]
    else:
        embedding_vector += [word_vectors[w]]

In [45]:
len(embedding_vector)

61590

In [12]:
len(word_vectors.vocab)

34371

In [13]:
word2num['<UNK>'] = 0
num2word[0] = '<UNK>'

In [14]:
def sent2index(segs):
    idx = []
    for sent in segs:
        temp = []
        for seg in sent:
            if seg not in word2num.keys():
                seg = '<UNK>'
            temp.append(word2num[seg])
        idx.append(temp)
    return idx

In [15]:
train_idx = sent2index(train_segs)
test_idx = sent2index(test_segs)

In [16]:
def idx2EmbedSum(idxSet, embedding_vector):
    sum_w2v = []
    for idxes in idxSet:
        temp = np.zeros(shape=300)
        for idx in idxes:
            temp += embedding_vector[idx]
        sum_w2v.append(temp)
    return sum_w2v

In [46]:
X_train_w2v = idx2EmbedSum(train_idx, embedding_vector)
X_test_w2v = idx2EmbedSum(test_idx, embedding_vector)

In [18]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_train_w2v, y_train)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-9941680aa384>", line 3, in <module>
    logReg.fit(X_train_w2v, y_train)
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\site-packages\sklearn\linear_model\logistic.py", line 1217, in fit
    check_classification_targets(y)
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
    y_type = type_of_target(y)
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\site-packages\sklearn\utils\multiclass.py", line 239, in type_of_target
    valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__'))
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\abc.py", line 182, in __instancecheck__
    if subclass in cls._abc_cache:
  File "C:\Users\RYU\Anaconda3\envs\tensor\lib\_weakrefset.p

KeyboardInterrupt: 

In [19]:
print(logReg.score(X_train_w2v, y_train))
print(logReg.score(X_test_w2v, y_test))

NotFittedError: This LogisticRegression instance is not fitted yet

In [50]:
n_output = 2
n_embedding = 300
n_hidden = 128
learning_rate = 1e-3
n_epoch = 10
batch_size = 64
n_vocab = len(word2num)

In [47]:
tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=[None, n_embedding])
Y = tf.placeholder(tf.int32, shape=[None])

In [48]:
W1 = tf.Variable(tf.truncated_normal([n_embedding, n_hidden]))
b1 = tf.Variable(tf.truncated_normal([n_hidden]))
W2 = tf.Variable(tf.truncated_normal([n_hidden, n_output]))
b2 = tf.Variable(tf.truncated_normal([n_output]))

In [49]:
h = tf.nn.relu(tf.matmul(X, W1) + b1)
logits = tf.matmul(h, W2) + b2
hypothesis = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
prediction = tf.cast(tf.argmax(hypothesis, 1), tf.int32)
correct_prediction = tf.equal(prediction, Y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [51]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(n_epoch):
    total_batch = int(len(X_train_w2v) / batch_size)
    cost_avg = 0
    print('< epoch :', (epoch+1), '>')
    for i in range(total_batch):
        if i == (total_batch-1):
            batch_xs = X_train_w2v[(i*batch_size):len(X_train_w2v)]
            batch_ys = y_train[(i*batch_size):len(y_train)]
        else:
            batch_xs = X_train_w2v[i*batch_size:(i+1)*batch_size]
            batch_ys = y_train[i*batch_size:(i+1)*batch_size]       
        cost_val, _ = sess.run([cost, optimizer], feed_dict={X: batch_xs, Y: batch_ys})
        cost_avg += cost_val
        if i % 1000 == 999:
            print('%04d' % (i+1), 'Cost: ', '{:.3f}'.format(cost_avg/500))
            cost_avg = 0

< epoch : 1 >
1000 Cost:  84.852
2000 Cost:  34.646
< epoch : 2 >
1000 Cost:  21.392
2000 Cost:  16.123
< epoch : 3 >
1000 Cost:  11.810
2000 Cost:  9.613
< epoch : 4 >
1000 Cost:  7.689
2000 Cost:  6.775
< epoch : 5 >
1000 Cost:  5.626
2000 Cost:  5.094
< epoch : 6 >
1000 Cost:  4.596
2000 Cost:  4.253
< epoch : 7 >
1000 Cost:  4.016
2000 Cost:  3.799
< epoch : 8 >
1000 Cost:  3.520
2000 Cost:  3.436
< epoch : 9 >
1000 Cost:  3.177
2000 Cost:  3.191
< epoch : 10 >
1000 Cost:  3.176
2000 Cost:  2.950


In [52]:
test_batch = int(len(X_test_w2v) / batch_size)
test_acc = 0
for i in range(test_batch):
    if i == (test_batch-1):
        batch_xs = X_test_w2v[(i*batch_size):len(X_test_w2v)]
        batch_ys = y_test[(i*batch_size):y_test.shape[0]]
    else:
        batch_xs = X_test_w2v[i*batch_size:(i+1)*batch_size]
        batch_ys = y_test[i*batch_size:(i+1)*batch_size]       
    acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys})
    test_acc += acc
print('Accuracy: ', '{:.3f}'.format(test_acc/test_batch))

Accuracy:  0.767
