In [4]:
from collections import deque
from sklearn.feature_extraction import DictVectorizer
import tensorflow as tf
import xml.etree.ElementTree as et

# Maps each unaccented vowel to the list of its accented variants.
# prepare_text() looks the requested vowel up here to recognise accented
# occurrences (which it labels [0, 1]).
vowel_table = {"a" : ["á"], "e" : ["é"], "i" : ["í"], "o" : ["ó", "ö", "ő"], "u" : ["ú", "ü", "ű"]}
# Shared DictVectorizer instance; it is fitted on the template windows and
# then used to transform the training/test windows in a later cell.
vectorizer = DictVectorizer()

In [5]:
def ispunct(c):
    """Tell whether ``c`` is one of the ASCII punctuation characters.

    ``c`` is compared for equality against every single character of the
    punctuation list, so the empty string and multi-character strings are
    never considered punctuation.

    Returns:
        bool: True if ``c`` equals one of the listed characters, else False.
    """
    punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    return any(c == symbol for symbol in punctuations)


def isalpha(c):
    """Tell whether ``c`` is a single lowercase ASCII letter (a-z).

    Unlike ``str.isalpha`` this rejects uppercase and accented letters, as
    well as the empty string and multi-character strings, because ``c`` is
    matched by equality against each individual letter.

    Returns:
        bool: True if ``c`` equals one of the 26 lowercase letters.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    # Element-wise membership: a list (unlike a str) never matches substrings
    # or the empty string.
    return c in list(alphabet)

# Collapses any character onto a 30-symbol alphabet:
# 26 lowercase letters, ' ' (whitespace), '0' (digit), '_' (punctuation), '*' (other).
def normalize_character(c):
    """Map ``c`` to its representative in the reduced alphabet.

    The checks are applied in this order:
      * whitespace            -> ' '
      * digit                 -> '0'
      * ASCII punctuation     -> '_'   (as decided by ispunct)
      * lowercase ASCII a-z   -> the character itself (as decided by isalpha)
      * anything else         -> '*'

    Uppercase and accented letters therefore end up as '*' unless the caller
    lowercases / de-accents the text first.
    """
    if c.isspace():
        replacement = ' '
    elif c.isdigit():
        replacement = '0'
    elif ispunct(c):
        replacement = '_'
    elif isalpha(c):
        replacement = c
    else:
        replacement = '*'
    return replacement


def normalize_text(text):
    """Normalize every character of ``text`` with normalize_character.

    Returns:
        str: the concatenation of the normalized characters, in order; the
        empty string when ``text`` is empty.
    """
    return "".join(normalize_character(symbol) for symbol in text)

In [6]:
def deaccentize(text):
    """Replace lowercase accented vowels in ``text`` with their base vowel.

    Only the nine lowercase accented vowels listed below are handled;
    uppercase accented letters and all other characters are left untouched.

    Returns:
        str: ``text`` with the accented vowels replaced.
    """
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ö", "o"),
        ("ő", "o"),
        ("ú", "u"),
        ("ü", "u"),
        ("ű", "u"),
    )

    for accented, plain in replacements:
        text = text.replace(accented, plain)

    return text

def create_row(window, window_size):
    """Turn a character window into a feature dict keyed by relative offset.

    Args:
        window: deque holding at least ``2 * window_size + 1`` characters.
            It is consumed from the left, so callers that still need the
            window must pass a copy.
        window_size: number of context characters on each side of the centre.

    Returns:
        dict: maps every offset in ``-window_size..window_size`` except 0 to
        the de-accented, normalized character found at that offset. The
        centre character (offset 0) is dropped because it is the one being
        predicted.
    """
    offsets = range(-window_size, window_size + 1)
    row = {
        offset: normalize_character(deaccentize(window.popleft()))
        for offset in offsets
    }

    # The centre position is the prediction target, not a feature.
    row.pop(0)

    return row


def prepare_text(text, window_size, vowel):
    """Extract labelled context windows for one vowel from ``text``.

    The text is lowercased and padded with '_' on both sides, then scanned
    with a sliding window of ``2 * window_size + 1`` characters. Whenever the
    centre of the window is ``vowel`` or one of its accented variants listed
    in ``vowel_table[vowel]``, a feature row is produced with create_row.

    Args:
        text: raw (accented) input text.
        window_size: number of context characters on each side of the centre.
        vowel: unaccented vowel to look for; must be a key of ``vowel_table``.

    Returns:
        tuple: ``(rows, labels)`` where ``rows`` is a list of feature dicts
        and ``labels`` the parallel list of one-hot targets, ``[1, 0]`` for
        an unaccented centre and ``[0, 1]`` for an accented one.
    """
    lower_text = text.lower()

    span = window_size * 2 + 1
    padding = "_" * span

    # Start from a window full of padding and append the same amount of
    # padding to the text so that every real character passes the centre.
    window = deque(padding, span)
    rows, labels = [], []

    for character in lower_text + padding:
        window.append(character)
        center = window[window_size]

        if center == vowel:
            # create_row consumes its argument, hence the copy.
            rows.append(create_row(window.copy(), window_size))
            labels.append([1, 0])
        if center in vowel_table[vowel]:
            rows.append(create_row(window.copy(), window_size))
            labels.append([0, 1])

    return rows, labels

In [7]:
# generates template windows for the alphabet
def generate_windows(window_size):
    """Build one template context window per symbol of the reduced alphabet.

    The templates are meant for fitting the vectorizer: taken together they
    contain every alphabet symbol at every context offset, so each
    (offset, symbol) pair is seen regardless of what the real text contains.

    Window ``i`` is filled with consecutive alphabet symbols starting at
    index ``i``, assigned to the offsets ``-window_size..-1, 1..window_size``
    in that order. Indices wrap around the alphabet with modular arithmetic,
    so the function also works when ``2 * window_size`` exceeds the alphabet
    length (a single wrap-around slice would be too short in that case and
    indexing it would raise IndexError).

    Args:
        window_size: number of context characters on each side of the centre.

    Returns:
        list: one dict per alphabet symbol (30 in total), each mapping the
        non-zero offsets to a single-character string. For
        ``window_size <= 0`` the dicts are empty.
    """
    # Must stay in sync with the symbols normalize_character can return.
    alphabet = "abcdefghijklmnopqrstuvwxyz 0_*"
    alphabet_size = len(alphabet)
    windows = []

    for i in range(alphabet_size):
        new_window = {}

        for j in range(window_size):
            # Position inside the 2 * window_size long run that starts at i:
            # left context occupies the first half, right context the second.
            left_pos = window_size - 1 - j
            right_pos = window_size + j
            new_window[-1 * (j + 1)] = alphabet[(i + left_pos) % alphabet_size]
            new_window[j + 1] = alphabet[(i + right_pos) % alphabet_size]

        windows.append(new_window)

    return windows

In [8]:
# Width of one input vector. It has to match the number of features the
# vectorizer produces; with the window size of 4 used in the data cell
# (8 context positions) and the 30-symbol alphabet of generate_windows this
# is 8 * 30 = 240.
input_size = 240
# Two outputs, matching the [1, 0] / [0, 1] labels built by prepare_text.
output_size = 2

# Placeholders for a batch of input vectors and their target labels
# (the batch dimension is left open).
n_input = tf.placeholder(tf.float32, [None, input_size])
n_output = tf.placeholder(tf.float32, [None, output_size])

hidden_neurons = 10

# Hidden layer: sigmoid(n_input . W_hidden + b_hidden), randomly initialised.
b_hidden = tf.Variable(tf.random_normal([hidden_neurons]))
W_hidden = tf.Variable(tf.random_normal([input_size, hidden_neurons]))
hidden = tf.sigmoid(tf.matmul(n_input, W_hidden) + b_hidden)

# Output layer: sigmoid(hidden . W_output); note that it has no bias term.
W_output = tf.Variable(tf.random_normal([hidden_neurons, output_size]))
output = tf.sigmoid(tf.matmul(hidden, W_output))

# Mean squared error between the targets and the network output.
cost = tf.reduce_mean(tf.square(n_output - output))

# Plain gradient descent with a learning rate of 0.5.
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(cost)

# NOTE(review): tf.initialize_all_variables is deprecated in later
# TensorFlow 1.x releases in favour of tf.global_variables_initializer --
# confirm against the installed version before switching.
init = tf.initialize_all_variables()

In [9]:
# Fit the vectorizer on the template windows (window size 4) rather than on
# the real data, so its feature set does not depend on which characters
# happen to occur in the training text.
vectorizer.fit(generate_windows(4))

# Training sample: windows around every "e" / "é" of the sentence.
# The window size (4) must be the same as the one used for fitting above.
train_data = "A politika és az erkölcs kapcsolatáról régebben lezárt vita foglalkozott az erkölcs és a törvények kapcsolatával is."
tmp_x, train_y = prepare_text(train_data, 4, "e")
# .toarray() converts the vectorizer output to a dense array for feeding.
train_x = vectorizer.transform(tmp_x).toarray()

# Test sample, prepared exactly like the training sample.
# Note that tmp_x is reused and now holds the test windows.
test_data = "Zsarnoki törvény és erkölcsi zsarnokság egyaránt nyomasztó."
tmp_x, test_y = prepare_text(test_data, 4, "e")
test_x = vectorizer.transform(tmp_x).toarray()

In [10]:
# Open a session and initialise all variables of the graph.
# The session is left open after this cell.
sess = tf.Session()
sess.run(init)

# 5000 gradient-descent steps, each on the complete training set.
for i in range(5000):
    # Runs one training step and also fetches the cost and the current
    # weights; only the cost (cvalues[1]) is used in this cell.
    cvalues = sess.run([train, cost, W_hidden, b_hidden, W_output], feed_dict={n_input: train_x, n_output: train_y})

    # Report the loss every 100 steps.
    if i % 100 == 0:
        print("")
        print("step: {:>3}".format(i))
        print("loss: {}".format(cvalues[1]))

# Print the raw network output for the test windows; per the labels built
# by prepare_text, column 0 stands for the unaccented and column 1 for the
# accented vowel.
print("")
print(sess.run(output, feed_dict={n_input: test_x}))


step:   0
loss: 0.38514241576194763

step: 100
loss: 0.0806296318769455

step: 200
loss: 0.02782360091805458

step: 300
loss: 0.01437755860388279

step: 400
loss: 0.009060019627213478

step: 500
loss: 0.006361192557960749

step: 600
loss: 0.004781566560268402

step: 700
loss: 0.0037700801622122526

step: 800
loss: 0.0030800143722444773

step: 900
loss: 0.002585839480161667

step: 1000
loss: 0.002217965666204691

step: 1100
loss: 0.001935320207849145

step: 1200
loss: 0.0017124030273407698

step: 1300
loss: 0.0015327080618590117

step: 1400
loss: 0.0013851572293788195

step: 1500
loss: 0.001262078178115189

step: 1600
loss: 0.0011580116115510464

step: 1700
loss: 0.0010689758928492665

step: 1800
loss: 0.0009920155862346292

step: 1900
loss: 0.0009248846909031272

step: 2000
loss: 0.0008658539736643434

step: 2100
loss: 0.0008135723765008152

step: 2200
loss: 0.0007669708575122058

step: 2300
loss: 0.0007251876522786915

step: 2400
loss: 0.0006875263643451035

step: 2500
loss: 0.000653