In [1]:
from sklearn.feature_extraction import DictVectorizer

vowel_table = {"a" : ["á"], "e" : ["é"], "i" : ["í"], "o" : ["ó", "ö", "ő"], "u" : ["ú", "ü", "ű"]}
vectorizer = DictVectorizer()

In [2]:
def deaccentize(text):
    text = text.replace("á", "a")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ó", "o")
    text = text.replace("ö", "o")
    text = text.replace("ő", "o")
    text = text.replace("ú", "u")
    text = text.replace("ü", "u")
    text = text.replace("ű", "u")

    return text

def ispunct(c):
    punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    for char in punctuations:
        if (c == char):
            return True
    return False


def isalpha(c):
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    for char in alphabet:
        if (c == char):
            return True
    return False

# reduces the number of different characters to 30
def normalize_character(c):
    if (c.isspace()):
        return ' '
    if (c.isdigit()):
        return '0'
    if (ispunct(c)):
        return '_'
    if (isalpha(c)):
        return c
    return '*'

# generates template windows for the alphabet
def generate_windows(window_size):
    windows = []
    alphabet = "abcdefghijklmnopqrstuvwxyz 0_*"
    alphabet_size = len(alphabet)

    for i in range(alphabet_size):
        new_window = {}

        end_of_slice = i + window_size * 2
        # adding the center of the window
        end_of_slice += 1
        if end_of_slice <= alphabet_size:
            alphabet_slice = alphabet[i:end_of_slice]
        else:
            alphabet_slice = alphabet[i:alphabet_size]
            alphabet_slice += alphabet[0:end_of_slice - alphabet_size]

        new_window[0] = alphabet_slice[window_size]
        for j in range(window_size):
            new_window[-1 * (j + 1)] = alphabet_slice[window_size - 1 - j]
            new_window[j + 1] = alphabet_slice[window_size + j + 1] # ...+ 1... 

        windows.append(new_window)

    return windows

vectorizer.fit(generate_windows(1))
pass

In [3]:
def read_corpus(count):
    corpus = open("corpus")
    words = []
    
    for i in range(4):
        next(corpus)
        
    for line in corpus:
        splits = line.split()
        if splits != []:
            words.append(splits[0])
            if count < 0:
                break
            count -= 1
        
    return words

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_enc = LabelEncoder()
onehot_enc = OneHotEncoder(sparse=False)

def fit_encoders():
    windows = []
    alphabet = "abcdefghijklmnopqrstuvwxyz 0_*"
    alphabet_list = list(alphabet)
#     print(alphabet_list)
    label_enc.fit(alphabet_list)
    label_list = label_enc.transform(alphabet_list)
    onehot_enc.fit(label_list.reshape(-1, 1))
    
def transform(character):
    character_label = label_enc.transform([character])
    return onehot_enc.transform(character_label[0])[0]

def transform_list(character_list):
    transformed_list = []
    
    for character in character_list:
        transformed_list.append(transform(character))
    
    return transformed_list

fit_encoders()

def encode_window(window):
    encoded_window = []
    
    for element in window:
        transformed_element = [transform(element)]
        encoded_window.append(transformed_element)
        
    return encoded_window

In [5]:
def transform_accent(vowel):
    if vowel in "aei":
        return [1, 0]
    if vowel in "áéí":
        return [0, 1]
    if vowel in "ou":
        return [1, 0, 0, 0]
    if vowel in "óú":
        return [0, 1, 0, 0]
    if vowel in "öü":
        return [0, 0, 1, 0]
    if vowel in "őű":
        return [0, 0, 0, 1]

In [6]:
count = 100000
words = read_corpus(count)

In [7]:
def deaccentize(text):
    text = text.replace("á", "a")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ó", "o")
    text = text.replace("ö", "o")
    text = text.replace("ő", "o")
    text = text.replace("ú", "u")
    text = text.replace("ü", "u")
    text = text.replace("ű", "u")
    
    return text

In [8]:
def ispunct(c):
    punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    for char in punctuations:
        if (c == char):
            return True
    return False


def isalpha(c):
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    for char in alphabet:
        if (c == char):
            return True
    return False

# reduces the number of different characters to 30
def normalize_character(c):
    if (c.isspace()):
        return ' '
    if (c.isdigit()):
        return '0'
    if (ispunct(c)):
        return '_'
    if (isalpha(c)):
        return c
    return '*'

def normalize_list(character_list):
    normalized_list = []
    
    for c in character_list:
        normalized_list.append(normalize_character(deaccentize(c)))
    
    return normalized_list

In [9]:
def pad_word(word, window_size):
    if window_size > 0:
        return '_' + pad_word(word, window_size - 1) + '_'
    return word

In [10]:
from collections import deque

def make_windows_from_word(word, window_size, vowel):
    windows = []
    accents = []
    
    sliding_window = deque((), window_size * 2 + 1)
    
    
    for character in word[:sliding_window.maxlen - 1]:
        sliding_window.append(character)
    
    for character in word[sliding_window.maxlen - 1:]:
        sliding_window.append(character)
        
        if (sliding_window[window_size] == vowel) or (sliding_window[window_size] in vowel_table[vowel]):
            normalized_list = normalize_list(list(sliding_window))
            transformed_list = transform_list(normalized_list)
            transformed_accents = sliding_window[window_size]
            
            windows.append(transformed_list)
            accents.append(transform_accent(transformed_accents))
    
    return windows, accents
    
def make_windows_from_text(text, window_size, vowel):
    windows = []
    accents = []
    
    for word in text:
        padded_word = pad_word(word, window_size)
        
        new_windows, new_accents = make_windows_from_word(padded_word, window_size, vowel)
#         windows.append(new_windows)
#         accents.append(new_accents)
        windows += new_windows
        accents += new_accents
        
    return windows, accents

In [11]:
from sklearn.model_selection import train_test_split

w = words[:10000]
x, y = make_windows_from_text(w, 1, 'a')
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.8)

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM


Using TensorFlow backend.


In [17]:
model = Sequential()
model.add(LSTM(2, return_sequences=False, input_shape=(3, 30)))
model.add(Dropout(0.5))
model.add(Dense(2))

model.compile(loss='mean_squared_logarithmic_error',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [18]:
model.fit(train_x, train_y, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe679e37c50>

In [19]:
model.evaluate(test_x, test_y)



[0.13453705814516301, 0.67147251059577262]

In [20]:
x, y = make_windows_from_text('áradat', 1, 'a')
print(model.predict(x))

x, y = make_windows_from_text('állat', 1, 'a')
print(model.predict(x))

[[ 0.22496337  0.15532254]
 [ 0.22496337  0.15532254]
 [ 0.22496337  0.15532254]]
[[ 0.22496337  0.15532254]
 [ 0.22496337  0.15532254]]
