In [None]:
import os
import keras
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.models import Model
from keras.models import Input
from keras.models import load_model

from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import Bidirectional

from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from progress_bar import log_progress

In [None]:
from collections import defaultdict

# Id 0 is never assigned by CH_TO_ID; DataProcessor.get_id returns it for
# characters that are not in the table.
PAD = 0
# Number of accent classes (size of the model's softmax output).
ACCENT_TYPES = 4
# Markers used to pad a word on the left / right when cutting windows.
START, END = '^', '$'
# Hungarian alphabet: the lowercase letters followed by their uppercase forms.
HUN = 'aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz'
HUN = HUN + HUN.upper()

# Character -> integer id, numbered from 1 in the order START, END, HUN.
CH_TO_ID = {ch: idx for idx, ch in enumerate(START + END + HUN, start=1)}

# Balancing partner for every vowel, used by DataProcessor.filter_chars:
# a window labelled `c` is kept only while the number of windows already kept
# for `c` does not exceed the number already kept for FILTER[c].
# Accented vowels point at their plain base letter; plain vowels point at one
# accented variant.
# NOTE(review): 'u' maps to 'ü' while the other plain vowels map to their
# acute form ('á', 'é', 'í', 'ó') — confirm whether 'ú' was intended.
FILTER = {
    'a': 'á',
    'e': 'é',
    'i': 'í',
    'o': 'ó',
    'u': 'ü',
    'á': 'a',
    'é': 'e',
    'í': 'i',
    'ó': 'o',
    'ö': 'o',
    'ő': 'o',
    'ú': 'u',
    'ü': 'u',
    'ű': 'u',
}

# vowel -> (base letter, accent class).
# Accent classes as used in this table:
#   0 = plain vowel        (a e i o u)
#   1 = acute accent       (á é í ó ú)
#   2 = umlaut             (ö ü)
#   3 = double acute       (ő ű)
# Only lowercase vowels are listed, so uppercase vowels are neither used as
# window centres nor stripped by DataProcessor.remove_accent.
LABELS = {
    'a': ('a', 0),
    'e': ('e', 0),
    'i': ('i', 0),
    'o': ('o', 0),
    'u': ('u', 0),
    'á': ('a', 1),
    'é': ('e', 1),
    'í': ('i', 1),
    'ó': ('o', 1),
    'ö': ('o', 2),
    'ő': ('o', 3),
    'ú': ('u', 1),
    'ü': ('u', 2),
    'ű': ('u', 3),
}

class DataProcessor:
    """Turns word lists into fixed-width character windows centred on vowels.

    Words are taken from the first whitespace-separated field of every
    non-comment line of the input files.  For each vowel listed in ``LABELS``
    one window is produced: the vowel plus ``window_size // 2`` characters of
    context on either side (words are padded with ``START`` / ``END``).

    Naming caveat: ``decode`` maps characters to integer ids and ``encode``
    maps ids back to characters.
    """

    def __init__(self, file_list, window_size=7):
        """Collect the unique words found in ``file_list``.

        Args:
            file_list: iterable of paths of the input files.
            window_size: positive odd width of the character window.

        Raises:
            ValueError: if ``window_size`` is not a positive odd number.
        """
        self._window_size = 0
        self._fill = 0

        self.set_window_size(window_size)

        self.file_list = file_list

        self.n = ACCENT_TYPES
        self.max_len = 30   # words longer than this are skipped
        self.true_len = 0   # length of the longest word actually read

        self.ch_to_id = CH_TO_ID
        self.id_to_ch = {v: k for k, v in CH_TO_ID.items()}
        self.words = set()
        self.windows = {}
        self.chars = defaultdict(int)

        self.get_unique_words()

    def set_window_size(self, x):
        """Set the window width and the derived one-sided context length.

        Raises:
            ValueError: if ``x`` is not positive or not odd.
        """
        # -3 % 2 == 1 in Python, so the parity test alone would let negative
        # sizes through and produce a negative context length.
        if x < 1:
            raise ValueError('Window size shall be positive!')
        if x % 2 != 1:
            raise ValueError('Window size shall be an odd number!')
        self._window_size = x
        self._fill = int(x // 2)

    def get_unique_words(self):
        """Read every input file and add the words found to ``self.words``."""
        self.words.update(self.process_files())

    def get_windows(self):
        """Rebuild ``self.windows`` as a ``window string -> centre vowel`` map.

        Each word is padded with ``self._fill`` START / END markers so that a
        full-width window exists around every vowel.  The window key still
        contains the accented characters; the value is the (possibly accented)
        vowel in the centre.
        """
        self.windows = {}
        fill = self._fill
        # Sorted iteration: set order of strings varies between interpreter
        # runs, and filter_chars depends on the order windows were inserted.
        for w in sorted(self.words):
            p = START * fill + w + END * fill
            for i, c in enumerate(p):
                if c in LABELS:
                    self.windows[p[i - fill:i + 1 + fill]] = c

    def filter_chars(self):
        """Thin out ``self.windows`` so vowels stay balanced with their partner.

        A window labelled ``c`` is kept only while the number of windows
        already kept for ``c`` does not exceed the number already kept for
        ``FILTER[c]``.  The result depends on the iteration order of
        ``self.windows``.
        """
        chars = defaultdict(int)
        windows = {}
        for key, c in self.windows.items():
            if chars.get(c, 0) <= chars.get(FILTER[c], 0):
                windows[key] = c
                chars[c] += 1
        self.windows = windows

    def get_id(self, c):
        """Return the integer id of character ``c`` (``PAD`` if unknown)."""
        return self.ch_to_id.get(c, PAD)

    def get_char(self, idx, c=None):
        """Return the character with id ``idx``, or ``c`` if the id is unknown."""
        return self.id_to_ch.get(idx, c)

    def process_files(self):
        """Yield the first field of every usable line of every input file.

        Lines starting with ``#`` and blank / whitespace-only lines are
        skipped, as are words longer than ``self.max_len``.  ``self.true_len``
        is updated to the length of the longest word yielded.
        """
        for file in log_progress(self.file_list, every=1):
            # NOTE(review): no explicit encoding is passed, so the platform
            # default is used — confirm it matches the data files.
            with open(file, 'r') as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    fields = line.split()
                    if not fields:
                        # Empty or whitespace-only line: nothing to index.
                        continue
                    word = fields[0]
                    if len(word) > self.max_len:
                        continue
                    if len(word) > self.true_len:
                        self.true_len = len(word)
                    yield word

    def decode(self, word):
        """Map an iterable of characters to a numpy array of integer ids."""
        return np.array([self.get_id(ch) for ch in word])

    def encode(self, idxs, orig=None):
        """Map integer ids back to characters.

        Args:
            idxs: sequence of integer ids.
            orig: optional sequence of fallback characters, aligned with
                ``idxs``; used for ids missing from the table.  Defaults to
                ``'&'`` for every position.

        Returns:
            numpy array of characters, one per id.
        """
        if orig is None:
            orig = '&' * len(idxs)

        return np.array([self.get_char(idx, orig[i]) for i, idx in enumerate(idxs)])

    def remove_accent(self, word):
        """Return ``word`` as a list of characters with ``LABELS`` vowels
        replaced by their base letter; other characters pass through."""
        return [LABELS.get(c, [c, None])[0] for c in word]

    def serve_data(self):
        """Build the training arrays for the current window size.

        Regenerates and balances the windows, then recounts ``self.chars``
        (kept windows per centre vowel).

        Returns:
            Tuple ``(x, y)`` of numpy arrays with one row per kept window:
            ``x`` holds the ids of the accent-stripped window characters,
            ``y`` holds the accent class of the centre vowel as a
            one-element array.
        """
        self.get_windows()
        self.filter_chars()

        x_in = []
        y_out = []
        self.chars = defaultdict(int)
        for key, c in self.windows.items():
            self.chars[c] += 1
            x_in.append(self.decode(self.remove_accent(list(key))))
            y_out.append(np.array([LABELS[c][1]]))

        return np.array(x_in), np.array(y_out)

In [None]:
# Number of input files to use; None slices the whole list.
NR_OF_FILES = None
# Directories that are scanned for input files.
TRAIN_DATA_DIRS = [
                   'comments_20131001-20131201.nlp',
                   'comments_20131201-20140519.nlp',
                   'comments_20140519-20140921.nlp',
                  ]

# os.listdir returns entries in arbitrary order; sorting makes the
# input_files[:NR_OF_FILES] subset the same on every run and machine.
input_files = [
    os.path.join(data_dir, file_name)
    for data_dir in TRAIN_DATA_DIRS
    for file_name in sorted(os.listdir(data_dir))
]
provider = DataProcessor(input_files[:NR_OF_FILES])

In [None]:
# Fixed seed so the held-out test set is the same on every run.
SPLIT_SEED = 42

provider.set_window_size(5)

# X: one row of character ids per window; Y: accent class of the centre vowel.
X, Y = provider.serve_data()
# One-hot encode every label over the provider.n accent classes.
Y = [to_categorical(i, num_classes=provider.n) for i in Y]
# Hold out 10% of the windows for the final evaluation.
X, X_te, Y, Y_te = train_test_split(X, Y, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
model = Sequential()
# The embedding is indexed by character id, not by accent class: ids run from
# 0 (PAD) to the largest value in provider.ch_to_id, so the vocabulary size is
# len(ch_to_id) + 1.  Using provider.n (4) here makes most lookups out of range.
model.add(Embedding(input_dim=len(provider.ch_to_id) + 1, output_dim=128))
model.add(Dropout(0.1))
# NOTE(review): return_sequences=True + TimeDistributed emits one prediction
# per window position, while serve_data provides one label per window —
# confirm that the label is meant to apply to every position.
model.add(Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(provider.n, activation="softmax")))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

In [None]:
# Early stopping watches the validation loss with a patience of 3 epochs.
es = EarlyStopping(monitor='val_loss', mode='auto', min_delta=0, patience=3, verbose=0)

# Train for at most 50 epochs; validation_split=0.1 sets aside a tenth of
# the training rows for the validation loss monitored above.
history = model.fit(
    X,
    np.array(Y),
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Evaluate on the held-out split; with the compile settings used for this
# model the result holds the loss followed by the accuracy metric.
score = model.evaluate(X_te, np.array(Y_te))
# Bare expression so the notebook displays the value.
score

In [None]:
import time

# Suffix the file name with the current Unix time (whole seconds) so that
# successive runs do not overwrite each other's saved model.
# NOTE(review): 'saprse' may be a typo for 'sparse' — confirm before renaming.
model.save('saprse_{}.h5'.format(int(time.time())))

In [None]:
class Predictor:
    """Runs a trained model on a single word using a data provider.

    Args:
        model: object with a Keras-style ``predict`` method.
        provider: object offering ``decode`` (characters -> ids) and
            ``encode`` (ids -> characters), e.g. a ``DataProcessor``.
    """

    def __init__(self, model, provider):
        self.model = model
        self.provider = provider

    def accent(self, w):
        """Return the model's output for word ``w`` as a string.

        The word is converted to ids with ``provider.decode``, passed through
        the model, and the arg-max indices along the last axis are turned
        back into characters with ``provider.encode`` (falling back to the
        characters of ``w``).

        NOTE(review): the model's output indices are accent classes, while
        ``encode`` looks indices up in the character-id table — confirm this
        mapping is what is intended.
        """
        if not w:
            # Nothing to predict for an empty word.
            return ''
        # Use the word passed in and the provider held by this instance,
        # not module-level globals.
        ids = self.provider.decode(list(w))
        prediction = self.model.predict(ids).argmax(axis=2)
        output = prediction.reshape(prediction.shape[0])
        return ''.join(self.provider.encode(output, w))

# Wrap the trained model and the data provider for interactive use.
m = Predictor(model, provider)

In [None]:
# Smoke test on an unaccented word; the cell displays the returned string.
m.accent('elnok')