In [1]:
from collections import deque
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Base vowel -> list of its accented variants. prepare_text() looks a base vowel
# up here to decide which characters count as the "accented" class (label 1).
vowel_table = {
    "a": ["á"],
    "e": ["é"],
    "i": ["í"],
    "o": ["ó", "ö", "ő"],
    "u": ["ú", "ü", "ű"],
}
# Shared DictVectorizer instance; it is fitted further down on the context-row
# dicts produced by prepare_text() to turn them into a numeric matrix.
vectorizer = DictVectorizer()

In [17]:
def deaccentize(text):
    """Return ``text`` with lowercase accented vowels replaced by their base vowel.

    Exactly nine characters are mapped: á->a, é->e, í->i, ó/ö/ő->o and ú/ü/ű->u.
    Every other character is copied through unchanged; this includes the
    uppercase accented letters, which are not part of the mapping.
    """
    # One translation table instead of nine successive passes over the string.
    accent_map = str.maketrans("áéíóöőúüű", "aeiooouuu")
    return text.translate(accent_map)

In [18]:
def create_row(window, window_size):
    """Turn a character window into a feature row keyed by relative offset.

    The first ``2 * window_size + 1`` items of ``window`` are assigned the
    offsets ``-window_size .. window_size``; the centre item (offset 0) is the
    character being classified and is therefore left out of the row.

    Unlike the earlier implementation, ``window`` is only read, never consumed,
    so callers no longer have to hand in a throw-away copy (passing a copy
    still works).

    Args:
        window: Sized iterable of characters (e.g. a ``collections.deque``)
            holding at least ``2 * window_size + 1`` items.
        window_size: Number of context characters on each side of the centre.

    Returns:
        dict mapping each non-zero offset to the character at that position.

    Raises:
        IndexError: if ``window`` holds fewer than ``2 * window_size + 1`` items.
    """
    width = 2 * window_size + 1
    if len(window) < width:
        # Same exception type the pop-based version raised on a short window.
        raise IndexError(
            "window holds %d items, %d required" % (len(window), width)
        )

    offsets = range(-window_size, window_size + 1)
    # zip() stops after `width` items, so any surplus items are ignored.
    return {offset: char for offset, char in zip(offsets, window) if offset != 0}

In [19]:
def prepare_text(text, window_size, vowel, return_labels=False):
    """Collect context-window rows for every occurrence of a vowel in ``text``.

    A window of ``2 * window_size + 1`` characters slides over the text. Each
    time the centre character is ``vowel`` (label 0) or one of its accented
    variants listed in ``vowel_table[vowel]`` (label 1), the surrounding
    characters are stored as a row built by ``create_row``. Positions outside
    the text are filled with ``"_"``.

    The collected rows and labels are also printed, as before.

    Args:
        text: Iterable of characters to scan.
        window_size: Number of context characters on each side of the centre.
        vowel: Base vowel; must be a key of ``vowel_table``.
        return_labels: When true, return ``(rows, labels)`` instead of only
            the rows. Defaults to ``False`` to keep the original return value.

    Returns:
        The list of row dicts, or ``(rows, labels)`` if ``return_labels`` is set.

    Raises:
        KeyError: if ``vowel`` is not a key of ``vowel_table``.
    """
    pad = "_"
    # Look the variants up once; this also fails fast on an unsupported vowel.
    accented_forms = vowel_table[vowel]

    x_e = []
    y_e = []

    width = window_size * 2 + 1
    window = deque(pad * width, maxlen=width)

    # The character under inspection is the centre of the window, i.e. it lags
    # `window_size` positions behind the newest character. Feeding
    # `window_size` padding characters after the text lets the final characters
    # reach the centre as well; without this flush, vowels within the last
    # `window_size` characters of the text were silently skipped.
    for chunk in (text, pad * window_size):
        for character in chunk:
            window.append(character)
            center = window[window_size]
            # create_row gets a copy so the sliding window itself stays intact.
            if center == vowel:
                x_e.append(create_row(window.copy(), window_size))
                y_e.append(0)
            if center in accented_forms:
                x_e.append(create_row(window.copy(), window_size))
                y_e.append(1)

    print(x_e)
    print(y_e)
    if return_labels:
        return x_e, y_e
    return x_e

In [20]:
# Sample sentence containing both plain and accented vowels, used to try out
# the feature extraction on a small input.
demo = (
    "Tévedsz. Eddig epedtek érte, hogy legyen, s nem volt, most majd a lelkük üdvösségét kínálnák, "
    "ha elmaradhatna, de nem tudjuk megakadályozni."
)

In [21]:
# Context rows for the base vowel "e" (and its accented variants from
# vowel_table), using two characters of context on each side.
X = prepare_text(demo, 2, "e")

[{1: 'v', 2: 'e', -1: 'T', -2: '_'}, {1: 'd', 2: 's', -1: 'v', -2: 'é'}, {1: 'p', 2: 'e', -1: ' ', -2: 'g'}, {1: 'd', 2: 't', -1: 'p', -2: 'e'}, {1: 'k', 2: ' ', -1: 't', -2: 'd'}, {1: 'r', 2: 't', -1: ' ', -2: 'k'}, {1: ',', 2: ' ', -1: 't', -2: 'r'}, {1: 'g', 2: 'y', -1: 'l', -2: ' '}, {1: 'n', 2: ',', -1: 'y', -2: 'g'}, {1: 'm', 2: ' ', -1: 'n', -2: ' '}, {1: 'l', 2: 'k', -1: 'l', -2: ' '}, {1: 'g', 2: 'é', -1: 's', -2: 's'}, {1: 't', 2: ' ', -1: 'g', -2: 'é'}, {1: 'l', 2: 'm', -1: ' ', -2: 'a'}, {1: ' ', 2: 'n', -1: 'd', -2: ' '}, {1: 'm', 2: ' ', -1: 'n', -2: ' '}, {1: 'g', 2: 'a', -1: 'm', -2: ' '}]
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]


In [22]:
# Fit the vectorizer on the row dicts and convert the result to a dense array
# so it can be inspected directly.
vectorized = vectorizer.fit_transform(X).toarray()

In [23]:
# Show the encoded matrix (one line of output per row of X).
print(vectorized)

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
   0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.