In [1]:
from collections import deque
from sklearn.feature_extraction import DictVectorizer
import tensorflow as tf
import numpy as np
import xml.etree.ElementTree as et

vowel_table = {"a" : ["á"], "e" : ["é"], "i" : ["í"], "o" : ["ó", "ö", "ő"], "u" : ["ú", "ü", "ű"]}
vectorizer = DictVectorizer()

In [2]:
def ispunct(c):
    punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    for char in punctuations:
        if (c == char):
            return True
    return False


def isalpha(c):
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    for char in alphabet:
        if (c == char):
            return True
    return False

# reduces the number of different characters to 30
def normalize_character(c):
    if (c.isspace()):
        return ' '
    if (c.isdigit()):
        return '0'
    if (ispunct(c)):
        return '_'
    if (isalpha(c)):
        return c
    return '*'


def normalize_text(text):
    normalized = ""
    for c in text:
        normalized += normalize_character(c)
    return normalized

In [3]:
def deaccentize(text):
    text = text.replace("á", "a")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ó", "o")
    text = text.replace("ö", "o")
    text = text.replace("ő", "o")
    text = text.replace("ú", "u")
    text = text.replace("ü", "u")
    text = text.replace("ű", "u")

    return text

def create_row(window, window_size):
    row = {}

    for i in range(-window_size, window_size + 1):
        row[i] = normalize_character(deaccentize(window.popleft()))

    del row[0]

    return row

# returns
# x_e: list of windows with the given window_size
# y_e: one-hot encoded values
def prepare_text(text, window_size, vowel):
    x_e = []
    y_e = []
    lower_text = text.lower()

    window = deque((), window_size * 2 + 1)
    for i in range(window.maxlen):
        window.append("_")
        lower_text += "_"

    for character in lower_text:
        window.append(character)
        if window[window_size] == vowel:
            x_e.append(create_row(window.copy(), window_size))
            y_e.append([1, 0])
        if window[window_size] in vowel_table[vowel]:
            x_e.append(create_row(window.copy(), window_size))
            y_e.append([0, 1])

    return x_e, y_e

def prepare_words(words, window_size, vowel):
    x_e = []
    y_e = []
    
    for word in words:
        if vowel not in word:
            continue
            
        x, y = prepare_text(word, window_size, vowel)

        if x_e == []:
            x_e = vectorizer.transform(x).toarray()
        else:
            tmp = vectorizer.transform(x)
            for tx in tmp:
                #x_e += tx.toarray()
                x_e = np.concatenate((x_e, tx.toarray()))
        y_e += y
        
    return x_e, y_e

In [4]:
# generates template windows for the alphabet
def generate_windows(window_size):
    windows = []
    alphabet = "abcdefghijklmnopqrstuvwxyz 0_*"
    alphabet_size = len(alphabet)

    for i in range(alphabet_size):
        new_window = {}

        end_of_slice = i + window_size * 2
        if end_of_slice <= alphabet_size:
            alphabet_slice = alphabet[i:end_of_slice]
        else:
            alphabet_slice = alphabet[i:alphabet_size]
            alphabet_slice += alphabet[0:end_of_slice - alphabet_size]

        for j in range(window_size):
            new_window[-1 * (j + 1)] = alphabet_slice[window_size - 1 - j]
            new_window[j + 1] = alphabet_slice[window_size + j]

        windows.append(new_window)

    return windows

In [65]:
def read_corpus(count):
    corpus = open("corpus")
    words = []
    
    for i in range(4):
        next(corpus)
        
    for line in corpus:
        splits = line.split()
        if splits != []:
            words.append(splits[0])
            if count < 0:
                break
            count -= 1
        
    return words

In [127]:
count = 20000
words = read_corpus(count)

train_x, train_y = prepare_words(words[:15000], 4, "e")
np.savez("train_e", x = train_x, y = train_y)

test_x, test_y = prepare_words(words[15000:], 4, "e")
np.savez("test_e", x = test_x, y = test_y)



In [128]:
vectorizer.fit(generate_windows(4))

# test_data = "Zsarnoki törvény és erkölcsi zsarnokság egyaránt nyomasztó."
# tmp_x, test_y = prepare_text(test_data, 4, "e")
# test_x = vectorizer.transform(tmp_x).toarray()

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [129]:
input_size = 240
output_size = 2

n_input = tf.placeholder(tf.float32, [None, input_size])
n_output = tf.placeholder(tf.float32, [None, output_size])

hidden_neurons = 10

b_hidden = tf.Variable(tf.random_normal([hidden_neurons]))
W_hidden = tf.Variable(tf.random_normal([input_size, hidden_neurons]))
hidden = tf.sigmoid(tf.matmul(n_input, W_hidden) + b_hidden)

W_output = tf.Variable(tf.random_normal([hidden_neurons, output_size]))
#output = tf.sigmoid(tf.matmul(hidden, W_output))
output = tf.nn.softmax(tf.matmul(hidden, W_output))

#cost = tf.reduce_mean(tf.square(n_output - output))
cost = tf.reduce_mean(-tf.reduce_sum(n_output * tf.log(output), reduction_indices=[1]))

optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(cost)

init = tf.initialize_all_variables()

In [146]:
sess = tf.Session()
sess.run(init)
print("training started")

# train_x, train_y = read_prepared()
data_train = np.load("train_e.npz")
data_test = np.load("test_e.npz")

# train_x = data_train["x"]
# train_y = data_train["y"]
# test = np.load("test_e.npz")
# test_x = test["x"]

for i in range(2001):
    cvalues = sess.run([train, cost, W_hidden, b_hidden, W_output], feed_dict={n_input: data_train["x"], n_output: data_train["y"]})

    if i < 101 and i % 100 == 0:
        print("")
        print("step: {:>3}".format(i))
        print("loss: {}".format(cvalues[1]))
    elif i % 500 == 0:
        print("")
        print("step: {:>3}".format(i))
        print("loss: {}".format(cvalues[1]))

print("")
result = sess.run(output, feed_dict={n_input: data_test["x"]})

training started

step:   0
loss: 1.170082688331604

step: 100
loss: 0.4717368483543396

step: 500
loss: 0.4014345407485962

step: 1000
loss: 0.3559802770614624

step: 1500
loss: 0.32574450969696045

step: 2000
loss: 0.30220603942871094



In [147]:
def decide(y):
    if y[0] > y[1]:
        return np.array([1, 0])
    else:
        return np.array([0, 1])
    
success = 0

for i in range(len(result)):
    if np.array_equal(decide(result[i]), data_test["y"][i]):
        success += 1
        
print("accuracy:")
print(success / len(result))

accuracy:
0.861904761904762
