In [1]:
import pickle


def main():
    """Index the vocabulary file and pickle the token -> line-number mapping.

    Reads ``vocab_cut.txt`` line by line, maps every whitespace-stripped line
    to its zero-based line number and writes the resulting dict to
    ``vocab.pkl``. When the same token occurs on several lines, the number of
    the last such line is the one that is stored.
    """
    with open('vocab_cut.txt') as source:
        word_to_index = {
            raw.strip(): position for position, raw in enumerate(source)
        }

    with open('vocab.pkl', 'wb') as sink:
        pickle.dump(word_to_index, sink, protocol=pickle.HIGHEST_PROTOCOL)


# Run the vocabulary build only when this code is executed as the top-level
# module (not when it is imported from elsewhere).
if __name__ == '__main__':
    main()

In [2]:
#!/usr/bin/env python3
from scipy.sparse import *
import numpy as np
import pickle


def main2():
    """Count within-tweet word co-occurrences and pickle them as a sparse matrix.

    Loads the token -> index dict from ``vocab.pkl``, then reads
    ``train_pos.txt`` and ``train_neg.txt`` one tweet per line. For every
    tweet, each ordered pair of in-vocabulary tokens (including a token with
    itself) contributes 1 to the corresponding matrix cell. The summed counts
    are written to ``cooc.pkl`` as a ``scipy.sparse.coo_matrix``.

    The matrix is always square with one row/column per vocabulary index, even
    for words that never occur in the training files, so that every index in
    the vocabulary is a valid row of the matrix.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # vocab.pkl that was produced locally.
    with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    # Size by the largest index rather than len(vocab): the two differ when
    # several vocabulary lines collapsed onto the same dict key.
    vocab_size = max(vocab.values(), default=-1) + 1

    row, col = [], []
    counter = 1
    for fn in ['train_pos.txt', 'train_neg.txt']:
        with open(fn) as f:
            for line in f:
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                tokens = [t for t in tokens if t >= 0]
                n_tokens = len(tokens)
                # Emit all ordered pairs (t, t2) of this tweet in bulk.
                for t in tokens:
                    row.extend([t] * n_tokens)
                    col.extend(tokens)

                if counter % 10000 == 0:
                    print(counter)
                counter += 1

    # Every pair counts once; duplicates are merged by sum_duplicates() below.
    data = np.ones(len(row), dtype=np.int64)
    # An explicit shape keeps rows for words that never occurred in the
    # training files and also makes an empty input produce a valid matrix.
    cooc = coo_matrix(
        (data, (np.asarray(row, dtype=np.int64), np.asarray(col, dtype=np.int64))),
        shape=(vocab_size, vocab_size),
    )
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    print("ferdig med summing")
    with open('cooc.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)



# Build the co-occurrence matrix and pickle it to cooc.pkl.
main2()

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
summing duplicates (this can take a while)
ferdig med summing


In [3]:
#!/usr/bin/env python3
from scipy.sparse import *
import numpy as np
import pickle
import random


def main3(nmax=100, embedding_dim=20, eta=0.001, alpha=3 / 4, epochs=10, seed=None):
    """Train word embeddings from the pickled co-occurrence matrix with SGD.

    Loads the sparse matrix from ``cooc.pkl`` and, for every stored entry
    ``(i, j, n)``, takes a gradient step that pulls ``dot(xs[i], ys[j])``
    towards ``log(n)``, weighted by ``min(1, (n / nmax) ** alpha)``. After the
    last epoch the ``xs`` matrix is saved with ``np.save('embeddings', xs)``.

    Args:
        nmax: Count at which the per-entry weight saturates at 1.
        embedding_dim: Number of columns of the embedding matrices.
        eta: Step size of each update.
        alpha: Exponent of the weighting function.
        epochs: Number of full passes over the stored entries.
        seed: If given, passed to ``np.random.seed`` before the embeddings are
            initialised so that a run can be reproduced. ``None`` leaves the
            global NumPy random state untouched.
    """
    print("loading cooccurrence matrix")
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cooc.pkl that was produced locally.
    with open('cooc.pkl', 'rb') as f:
        cooc = pickle.load(f)
    # The loop below needs the .row/.col/.data triplets of the COO format.
    cooc = cooc.tocoo()
    print("{} nonzero entries".format(cooc.nnz))

    print("using nmax =", nmax, ", cooc.max() =", cooc.max())

    print("initializing embeddings")
    if seed is not None:
        np.random.seed(seed)
    xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
    ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

    # Targets and weights depend only on the counts, so compute them once
    # instead of once per entry per epoch.
    counts = cooc.data.astype(np.float64)
    log_counts = np.log(counts)
    weights = np.minimum(1.0, (counts / nmax) ** alpha)

    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, logn, fn in zip(cooc.row, cooc.col, log_counts, weights):
            x, y = xs[ix, :], ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            # x and y are views into xs / ys. Compute both steps before
            # applying either, otherwise the ys update would see the already
            # modified x instead of the value the gradient was taken at.
            step_x = scale * y
            step_y = scale * x
            xs[ix, :] += step_x
            ys[jy, :] += step_y
    print("skal bare lagre")
    np.save('embeddings', xs)
    print("ferdig")



# Train the embeddings; the result is stored via np.save('embeddings', ...).
main3()

loading cooccurrence matrix
5263751 nonzero entries
using nmax = 100 , cooc.max() = 207302
initializing embeddings
epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
skal bare lagre
ferdig


In [None]:
import numpy as np
import csv
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the embedding matrix (one row per vocabulary index).
embedding = np.load("embeddings.npy")
print(np.shape(embedding))

# Load the token -> index dictionary.
# NOTE(security): pickle.load can execute arbitrary code; only load a
# vocab.pkl that was produced locally.
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# np.shape() of a dict is always (), so report the number of entries instead.
print(len(vocab))

def _mean_embedding(text, vocab, embedding):
    """Return the mean embedding vector of the known tokens in ``text``.

    ``text`` is split on whitespace and every token is looked up in ``vocab``.
    Tokens that are not in the vocabulary, or whose index has no row in
    ``embedding``, are ignored. If no token remains, a zero vector with
    ``embedding.shape[1]`` entries is returned.
    """
    ids = [vocab.get(token, -1) for token in text.strip().split()]
    ids = [i for i in ids if 0 <= i < embedding.shape[0]]
    if not ids:
        return np.zeros(embedding.shape[1])
    # Sum once and divide once: a true average over the tweet's tokens.
    return embedding[ids].mean(axis=0)


# Build the training set: one averaged embedding per tweet,
# label 1 for positive tweets and 0 for negative tweets.
X = []
y = []
for fn, label in [('train_pos.txt', 1), ('train_neg.txt', 0)]:
    with open(fn) as f:
        for line in f:
            X.append(_mean_embedding(line, vocab, embedding))
            y.append(label)

# Fit a logistic regression classifier on the averaged tweet embeddings.
print(np.shape(X))
print(np.shape(y))
lr = LogisticRegression(C=0.25)
lr.fit(X, y)

# Accuracies noted by the author for different regularisation strengths:
#     Accuracy for C=0.01: 0.87472
#     Accuracy for C=0.05: 0.88368
#     Accuracy for C=0.25: 0.88016
#     Accuracy for C=0.5: 0.87808
#     Accuracy for C=1: 0.87648

# Featurise the test tweets exactly like the training tweets and predict.
# Each line of test_data.txt is "<id>,<tweet text>"; only the first comma
# separates the id from the text.
field = []
test_X = []
with open('test_data.txt', 'r') as f:
    for line in f:
        tweet_id, _sep, text = line.partition(",")
        field.append([tweet_id, text])
        test_X.append(_mean_embedding(text, vocab, embedding))
test_y = lr.predict(test_X)

# Write the predictions as an "Id,Prediction" CSV file.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, extra blank rows appear on platforms that translate
# '\n' on write.
with open('test_result.csv', 'w', newline='') as csvfile:
    tempwriter = csv.writer(csvfile)
    tempwriter.writerow(["Id", "Prediction"])
    for entry, label in zip(field, test_y):
        # The output file encodes the negative class as -1 instead of 0.
        prediction = -1 if label == 0 else label
        tempwriter.writerow([entry[0], str(prediction)])