In [7]:
from sklearn.feature_extraction import DictVectorizer

# Each unaccented vowel mapped to the list of its accented variants.
vowel_table = {
    "a": ["á"],
    "e": ["é"],
    "i": ["í"],
    "o": ["ó", "ö", "ő"],
    "u": ["ú", "ü", "ű"],
}
# Dict-based feature vectorizer; it is fitted on template windows further down.
vectorizer = DictVectorizer()

In [8]:
def deaccentize(text):
    """Return `text` with lowercase accented vowels replaced by their base vowel.

    The mapping is á→a, é→e, í→i, ó/ö/ő→o and ú/ü/ű→u. Uppercase accented
    letters and every other character are left untouched.
    """
    accent_map = str.maketrans("áéíóöőúüű", "aeiooouuu")
    return text.translate(accent_map)

def ispunct(c):
    """Return True when `c` equals one of the ASCII punctuation characters.

    The comparison is by equality with each single character, so the empty
    string and multi-character strings yield False.
    """
    return any(c == mark for mark in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")


def isalpha(c):
    """Return True when `c` equals one of the lowercase letters a-z.

    The comparison is by equality with each single letter, so uppercase
    letters, accented letters, the empty string and longer strings yield False.
    """
    return any(c == letter for letter in "abcdefghijklmnopqrstuvwxyz")

def normalize_character(c):
    """Map `c` onto a 30-symbol alphabet.

    Whitespace becomes ' ', digits become '0', punctuation (per `ispunct`)
    becomes '_', lowercase a-z (per `isalpha`) is kept as is, and anything
    else becomes '*'. That gives 26 letters plus four class symbols.
    """
    if c.isspace():
        symbol = ' '
    elif c.isdigit():
        symbol = '0'
    elif ispunct(c):
        symbol = '_'
    elif isalpha(c):
        symbol = c
    else:
        symbol = '*'
    return symbol

def generate_windows(window_size):
    """Generate one template window per symbol of the 30-character alphabet.

    Window ``i`` covers ``2 * window_size + 1`` consecutive alphabet symbols
    starting at alphabet index ``i``, wrapping around the end of the alphabet
    cyclically. Each window is a dict keyed by the offset from its center:
    key ``0`` is the center symbol, negative keys are the symbols to its left
    and positive keys the symbols to its right.

    Args:
        window_size: number of context symbols on each side of the center.
            Must be a non-negative integer; it may exceed the alphabet size.

    Returns:
        A list of 30 dicts mapping int offsets to single-character strings.

    Raises:
        ValueError: if `window_size` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    alphabet = "abcdefghijklmnopqrstuvwxyz 0_*"
    alphabet_size = len(alphabet)

    windows = []
    for start in range(alphabet_size):
        center = start + window_size
        # Modular indexing wraps around as many times as needed, so windows
        # wider than the alphabet no longer run off the end of a slice.
        new_window = {0: alphabet[center % alphabet_size]}
        for distance in range(1, window_size + 1):
            new_window[-distance] = alphabet[(center - distance) % alphabet_size]
            new_window[distance] = alphabet[(center + distance) % alphabet_size]
        windows.append(new_window)

    return windows

# Fit the vectorizer on the template windows for window_size=1. The trailing
# semicolon keeps the notebook from echoing the fitted estimator.
vectorizer.fit(generate_windows(1));

In [9]:
def read_corpus(count):
    """Read up to `count` words from the corpus file at "res/corpus".

    The first four lines of the file are skipped. From every following
    non-blank line only the first whitespace-separated token is kept.

    Args:
        count: maximum number of words to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of at most `count` strings, in file order.
    """
    words = []
    if count <= 0:
        return words

    # NOTE(review): the file is read with the platform default encoding, as
    # before; pass an explicit encoding here once the corpus encoding is known.
    with open("res/corpus") as corpus:
        # Skip the four leading lines (presumably a header - TODO confirm).
        # The default argument keeps a shorter file from raising StopIteration.
        for _ in range(4):
            next(corpus, None)

        for line in corpus:
            splits = line.split()
            if splits:
                words.append(splits[0])
                # Stop as soon as exactly `count` words have been collected.
                if len(words) >= count:
                    break

    return words

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Maps each alphabet symbol to an integer label (fitted in fit_encoders).
label_enc = LabelEncoder()
# Turns those integer labels into dense one-hot rows.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the scikit-learn version this notebook is pinned to.
onehot_enc = OneHotEncoder(sparse=False)

def fit_encoders():
    """Fit the module-level `label_enc` and `onehot_enc` on the 30-symbol alphabet.

    The label encoder learns one integer label per alphabet symbol; the
    one-hot encoder is then fitted on those labels. Returns nothing - the two
    encoders are updated in place.
    """
    alphabet_list = list("abcdefghijklmnopqrstuvwxyz 0_*")
    label_enc.fit(alphabet_list)
    label_list = label_enc.transform(alphabet_list)
    # The one-hot encoder is fitted on a 2-D column: one label per row.
    onehot_enc.fit(label_list.reshape(-1, 1))
    
def transform(character):
    """One-hot encode a single normalized alphabet symbol.

    Args:
        character: a symbol known to the fitted `label_enc`.

    Returns:
        The one-hot row that `onehot_enc` produces for the symbol's label.
    """
    character_label = label_enc.transform([character])
    # `onehot_enc` was fitted on a 2-D column (see fit_encoders), so it is fed
    # the same (n_samples, 1) shape here instead of a bare scalar, which
    # scikit-learn does not accept as transform input.
    return onehot_enc.transform(character_label.reshape(-1, 1))[0]

def transform_list(character_list):
    """Return a list holding `transform(c)` for every `c` in `character_list`, in order."""
    return [transform(symbol) for symbol in character_list]

# Fit both encoders once so that transform() can be used by the code below.
fit_encoders()

def encode_window(window):
    """Encode every element of `window` with `transform`.

    Each encoded element is wrapped in its own single-item list, so the
    result has one inner list per window element.
    """
    return [[transform(element)] for element in window]

In [11]:
def transform_accent(vowel):
    """Return the one-hot accent encoding for `vowel`.

    a/e/i use a two-way encoding (plain, acute). o/u use a four-way encoding
    (plain, acute, umlaut, double acute). A fresh list is returned on every
    call; input that matches no group yields None.
    """
    two_way = ("aei", "áéí")
    four_way = ("ou", "óú", "öü", "őű")

    for groups in (two_way, four_way):
        for position, members in enumerate(groups):
            if vowel in members:
                encoding = [0] * len(groups)
                encoding[position] = 1
                return encoding
    return None

In [12]:
# `count` controls how many words read_corpus collects from the corpus file.
count = 100000
words = read_corpus(count)

In [13]:
def deaccentize(text):
    """Strip the accents from lowercase accented vowels in `text`.

    NOTE: this cell redefines `deaccentize` from an earlier cell with the
    same behavior. Uppercase accented letters are not changed.
    """
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ö", "o"),
        ("ő", "o"),
        ("ú", "u"),
        ("ü", "u"),
        ("ű", "u"),
    )
    for accented, plain in replacements:
        text = text.replace(accented, plain)
    return text

In [14]:
def ispunct(c):
    """Tell whether `c` is exactly one of the ASCII punctuation characters.

    NOTE: this cell redefines `ispunct` from an earlier cell with the same
    behavior.
    """
    punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    # Membership in a list compares `c` with each single character, so the
    # empty string and longer strings are never matched.
    return c in list(punctuations)


def isalpha(c):
    """Tell whether `c` is exactly one of the lowercase letters a-z.

    NOTE: this cell redefines `isalpha` from an earlier cell with the same
    behavior.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    # Membership in a list compares `c` with each single letter, so the empty
    # string and longer strings are never matched.
    return c in list(alphabet)

def normalize_character(c):
    """Collapse `c` into one of 30 symbols.

    The first matching rule wins: whitespace -> ' ', digit -> '0',
    punctuation -> '_', lowercase a-z -> unchanged. Anything else -> '*'.

    NOTE: this cell redefines `normalize_character` from an earlier cell with
    the same behavior.
    """
    # Each rule is evaluated lazily and in order, so later checks only run
    # when the earlier ones did not match.
    rules = (
        (lambda: c.isspace(), ' '),
        (lambda: c.isdigit(), '0'),
        (lambda: ispunct(c), '_'),
        (lambda: isalpha(c), c),
    )
    for applies, symbol in rules:
        if applies():
            return symbol
    return '*'

def normalize_list(character_list):
    """De-accent and normalize every item of `character_list`, preserving order."""
    return [normalize_character(deaccentize(item)) for item in character_list]

In [15]:
def pad_word(word, window_size):
    """Surround `word` with `window_size` underscores on each side.

    Args:
        word: the string to pad.
        window_size: number of '_' characters added to each end; an integer.

    Returns:
        The padded string, or `word` unchanged when `window_size` is zero or
        negative.
    """
    if window_size <= 0:
        return word
    # Build the padding in one step rather than recursing once per level,
    # which would hit the recursion limit for large window sizes.
    padding = '_' * window_size
    return padding + word + padding

In [16]:
from collections import deque

def make_windows_from_word(word, window_size, vowel):
    """Collect encoded context windows for each occurrence of `vowel` in `word`.

    A window of ``2 * window_size + 1`` characters slides over `word`. Whenever
    the center character is `vowel` or one of its accented variants in
    `vowel_table`, the window is normalized and one-hot encoded, and the accent
    of the center character is encoded with `transform_accent`.

    Returns:
        (windows, accents): two parallel lists, one entry per matching window.
    """
    span = window_size * 2 + 1
    # Pre-load the first span - 1 characters; every later append then
    # completes a full window.
    context = deque(word[:span - 1], span)

    encoded_windows = []
    encoded_accents = []
    for incoming in word[span - 1:]:
        context.append(incoming)
        center = context[window_size]
        if not (center == vowel or center in vowel_table[vowel]):
            continue
        encoded_windows.append(transform_list(normalize_list(list(context))))
        encoded_accents.append(transform_accent(center))

    return encoded_windows, encoded_accents
    
def make_windows_from_text(text, window_size, vowel):
    """Build encoded windows and accent targets for `vowel` over all words in `text`.

    Each word is lowercased and padded with `pad_word`, then handed to
    `make_windows_from_word`. The per-word results are concatenated in order.

    Returns:
        (windows, accents): two flat, parallel lists covering every word.
    """
    all_windows, all_accents = [], []

    for word in text:
        word_windows, word_accents = make_windows_from_word(
            pad_word(word.lower(), window_size), window_size, vowel
        )
        all_windows.extend(word_windows)
        all_accents.extend(word_accents)

    return all_windows, all_accents

In [17]:
# Work on the first 10000 words only.
w = words[:10000]
# NOTE(review): the empty list below is overwritten on the next line, so `w2`
# ends up as another name for `w`.
w2 = []
w2 = w

# for word in w:
#     word = word.replace('a', 'á')
#     word = word.replace('A', 'á')
#     w2.append(word)
    
# for word in w:
#     word = word.replace('á', 'a')
#     word = word.replace('Á', 'a')
#     w2.append(word)

In [19]:
from sklearn.model_selection import train_test_split

# Windows (window_size=1) and accent targets for the vowel 'a' / 'á'.
x, y = make_windows_from_text(w2, 1, 'a')
# random_state makes the split reproducible across kernel restarts.
# NOTE(review): test_size=0.8 leaves only 20% of the samples for training;
# confirm this is intended rather than an 80/20 train/test split.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.8, random_state = 42)

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import SGD


Using TensorFlow backend.


In [21]:
# Only the last expression of a cell is echoed, so just train_y[0] is shown.
train_x[0]
train_y[0]

[0, 1]

In [22]:
# Sequence model over the encoded character windows.
model = Sequential()
# input_shape=(3, 30): each sample is a sequence of 3 vectors of width 30,
# which matches windows built with window_size=1 over the 30-symbol alphabet.
model.add(LSTM(100, return_sequences=False, input_shape=(3, 30)))
# model.add(Dropout(0.5))
# Two outputs, matching the two-element accent encoding used for a/e/i.
model.add(Dense(2, activation='sigmoid'))

#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [23]:
# n = 5
# h = model.fit(sample_x, sample_y, batch_size=100, epochs=100, verbose=2)


In [24]:
print(len(test_x))
print(model.metrics_names)
# NOTE(review): sample_x / sample_y are only assigned in a later cell
# (In [27]), so on a top-to-bottom run this line raises the NameError that is
# recorded in this cell's output.
model.evaluate(sample_x, sample_y, batch_size=100)

5403
['loss', 'acc']


NameError: name 'sample_x' is not defined

In [27]:
import numpy as np
np.random.seed(42)
# Draw the 20 sample indices from the split that is indexed below (the test
# split), so every index is valid whatever the train/test sizes are.
indices = np.random.choice(len(test_x), 20)
sample_y = np.array(test_y)[indices]
sample_x = np.array(test_x)[indices]

In [28]:
import numpy as np
# Count how often each class is predicted for the sample. For this two-output
# model, predict(...).argmax(axis=-1) gives the same result as
# predict_classes and also works on Keras versions without that method.
np.unique(model.predict(sample_x).argmax(axis=-1), return_counts=True)



(array([0, 1]), array([16,  4]))

In [29]:
# Label index of the first (left-most) position of every sampled window.
sample_x.argmax(axis=2)[:,0]

array([23, 21,  3, 10, 11, 15,  5, 11, 29,  3, 29, 25, 21, 23, 23,  3,  3,
       15, 24, 15])

In [30]:
# Accent targets of the sampled windows, one two-element row per sample.
sample_y

array([[1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1]])

In [31]:
# NOTE(review): np.unique flattens the one-hot rows, so this counts 0 and 1
# entries rather than samples per class; every two-element row contributes one
# of each, which is why both counts are equal. Use argmax(axis=1) first if the
# class distribution is wanted.
np.unique(test_y, return_counts=True)

(array([0, 1]), array([5403, 5403]))

In [32]:
# NOTE(review): as with the test targets, this counts the 0 and 1 entries of
# the flattened one-hot rows, not the number of samples per class.
np.unique(np.array(train_y), return_counts=True)

(array([0, 1]), array([1350, 1350]))

In [33]:
# Build the windows around the 'a'/'á' positions of single example words and
# print the raw model outputs for them.
x1, y1 = make_windows_from_text(['áradat'], 1, 'a')
print(model.predict(x1))
# print(x1)

x2, y2 = make_windows_from_text(['állat'], 1, 'a')
print(model.predict(x2))

# Second output value for the first window of 'áradat'.
print(model.predict(x1)[0][1])

[[ 0.50146353  0.49471885]
 [ 0.50543588  0.49396312]
 [ 0.49639484  0.49267733]]
[[ 0.50502229  0.49478415]
 [ 0.49523586  0.49357539]]
0.494719


In [34]:
# Dump the encoded windows and accent targets of the first example word.
print(x1)
print(y1)

[[array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.])], [array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [33]:
# Convert the one-hot windows back to label indices, one row per window.
np.array(x1).argmax(axis=2)
# np.array(x1).shape


array([[3, 4, 3],
       [3, 4, 3],
       [3, 4, 3]])

In [39]:
# Window the padded word directly with make_windows_from_word.
x3, y3 = make_windows_from_word(pad_word('áradat', 1), 1, 'a')
# Label index of each window position, one row per window.
np.array(x3).argmax(axis=2)

array([[ 3,  4, 21],
       [21,  4,  7],
       [ 7,  4, 23]])

In [18]:
# NOTE(review): `h` is only assigned by the model.fit call that is commented
# out in cell In [23]; that call has to be restored for this cell to run.
h.history

{'acc': [0.5,
  0.5,
  0.52925925784640837,
  0.65703703739024977,
  0.68666665642349811,
  0.68666666083865691,
  0.71222222734380647,
  0.71407407742959483,
  0.71407407963717429,
  0.71407407522201538,
  0.71407407522201538,
  0.71407408184475374,
  0.71407406859927702,
  0.71407407742959483,
  0.71407407963717429,
  0.71407407963717429,
  0.71407408625991253,
  0.71407407742959483,
  0.71407406859927702,
  0.71407406859927702,
  0.71407407963717429,
  0.71407407301443593,
  0.71407407522201538,
  0.71407407301443593,
  0.71407408184475374,
  0.71407408184475374,
  0.71407407742959483,
  0.71407407301443593,
  0.71407407080685648,
  0.71407407301443593,
  0.71407406859927702,
  0.71407407301443593,
  0.71407407742959483,
  0.71407407742959483,
  0.71407407742959483,
  0.71407407301443593,
  0.71407407742959483,
  0.71407407080685648,
  0.71407408184475374,
  0.71407407522201538,
  0.71407407963717429,
  0.71407407301443593,
  0.71407407742959483,
  0.71407407301443593,
  0.714074077