In [19]:
import os
import glob
import re
import string
import numpy as np

from nltk.tokenize import word_tokenize
import pymorphy2

from keras.models import Sequential
from keras.layers import Activation
from keras.optimizers import SGD
from keras.layers import Dense

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier as gb

Word2Vec implementation

In [5]:
corpora = ""
dump = r'data'
for subdir in ['neg', 'pos']:
    for file in os.listdir(dump + subdir):
        with open(dump + subdir + '\\' + file, encoding='utf8') as f:
            corpora += f.read()

In [6]:
def preprocess(corpora):
    corpora = corpora.lower()
    exclude = set(string.punctuation)
    exclude = list(exclude) + [str(k) for k in range(10)] + ['«', '»'] + ['\n', '\r', '—']
    corpora = ''.join(ch for ch in corpora if ch not in exclude)
    words = word_tokenize(corpora)
    return words

words = preprocess(corpora)

In [9]:
unique_lemmas = []
morph = pymorphy2.MorphAnalyzer()
for word in list(set(words)):
    p = morph.parse(word)[0]
    unique_lemmas.append(p.normal_form)

In [11]:
X = np.zeros((len(words), len(unique_lemmas)))

for i in range(X.shape[0]):
    p = morph.parse(words[i])[0]
    X[i][unique_lemmas.index(p.normal_form)] = 1

In [31]:
train_X = np.zeros((X.shape[0] - 4, 1, X.shape[1]))

for i in range(2, X.shape[0] - 2):
    for j in range(1):
        if j < 2:
            train_X[i - 2][j] = X[i - j - 1]
        else:
            train_X[i - 2][j] = X[i + j - 1]
train_y =  X[2 : X.shape[0] - 2]
train_X.shape, train_y.shape

((64762, 1, 17201), (64762, 17201))

In [14]:
#Neural Network

model = Sequential()
model.add(Dense(300, input_shape=(len(unique_lemmas),), activation="relu"))
model.add(Dense(len(unique_lemmas), input_shape=(300,)))
sgd = SGD(lr=0.01)
model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=["accuracy"])

In [16]:
model.fit(train_X.reshape(X.shape[0] - 4, -1), train_y, epochs = 1)

Epoch 1/1


<keras.callbacks.History at 0x1ba18d31400>

In [17]:
model_weights = model.get_weights()
weights = model_weights[0]

In [18]:
def word2vec(word):
    try:
        morph1 = pymorphy2.MorphAnalyzer()
        p = morph.parse(word)[0]
        ind = unique_lemmas.index(p.normal_form)
        return weights[ind]
    except:
        return np.random.normal(0, 1, 300)

Testing on reviews

In [20]:
path_neg = 'data\\neg' 
docs_neg = []
os.chdir(path_neg)
for fn in glob.glob("*.txt"):
    with open(fn, encoding="utf8") as f: 
        docs_neg.append(f.read())

In [21]:
path_pos = 'data\\pos' 
docs_pos = []
os.chdir(path_pos)
for fn in glob.glob("*.txt"):
    with open(fn, encoding="utf8") as f:
        docs_pos.append(f.read())

In [22]:
num_neg = len(docs_neg)

num_train_neg = int(0.9*num_neg)

train_set_neg = []
for i in range(0, num_train_neg):
    train_set_neg.append(docs_neg[i])

test_set_neg = []
for i in range(num_train_neg, num_neg):
    test_set_neg.append(docs_neg[i])

In [23]:
print ("Lenght of the negative train set", len(train_set_neg))
print ("Lenght of the negative test set", len(test_set_neg))

Lenght of the negative train set 90
Lenght of the negative test set 10


In [24]:
num_pos = len(docs_pos)

num_train_pos = int(0.9*num_pos)

train_set_pos = []
for i in range(0, num_train_pos):
    train_set_pos.append(docs_pos[i])

test_set_pos = []
for i in range(num_train_pos, num_pos):
    test_set_pos.append(docs_pos[i])

In [25]:
print ("Lenght of the positive train set", len(train_set_pos))
print ("Lenght of the positive train set", len(test_set_pos))

Lenght of the positive train set 90
Lenght of the positive train set 10


In [30]:
train_set = train_set_pos + train_set_neg
test_set = test_set_pos + test_set_neg
y_train = [1 for k in range(len(train_set_pos))] + [0 for k in range(len(train_set_neg))]
y_test = [1 for k in range(len(test_set_pos))] + [0 for k in range(len(test_set_neg))]

In [27]:
def get_feat(data):
    feat = []
    for i in range(len(data)):
        dat = (data[i]).split(" ")
        temp = []
        for w in dat:
            temp.append(word2vec(w))
        temp = (np.array(temp)).mean(0)
        feat.append(temp)
    return feat
    
train_len = len(train_set)
test_len = len(test_set)
sets = get_feat(train_set + test_set)
train_set = sets[ : train_len]
test_set = sets[train_len : ]

In [28]:
def model_tr(train_X, train_Y):
    model = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    model.fit(train_X, train_Y)
    return model

log_model = model_tr(train_set, y_train)
predict = log_model.predict(test_set)
test_acc = np.mean(predict == y_test)
print("Test set accuracy: %.4f" % test_acc)

Test set accuracy: 0.7000
