In [None]:
import os

import keras
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.models import Model
from keras.models import Input

from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import Bidirectional

from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from progress_bar import log_progress

In [None]:
import tensorflow as tf

def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between integer labels and logits.

    Args:
        y_true: tensor of integer class ids, passed as ``labels``.
        y_pred: tensor of unnormalised scores, passed as ``logits``.

    Returns:
        A scalar tensor: the mean of the element-wise losses.
    """
    per_element_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred,
        labels=y_true,
    )
    return tf.reduce_mean(per_element_loss)

# Integer (int32) placeholder with two unspecified dimensions. It is handed to
# model.compile(target_tensors=[...]) so the loss receives integer class ids
# as y_true.
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

In [None]:
# Id reserved for padding; it is also what unknown characters are mapped to.
PAD = 0
# Alphabet: the lowercase letters first, followed by their uppercase forms.
HUN = 'aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz'
HUN += HUN.upper()
# Character -> integer id. Ids start at 1 so that 0 stays free for PAD.
CH_TO_ID = {c: i+1 for i,c in enumerate(HUN)}

# Accented letter -> unaccented base letter.
LABELS = {
    'á': 'a',
    'é': 'e',
    'í': 'i',
    'ó': 'o',
    'ö': 'o',
    'ő': 'o',
    'ú': 'u',
    'ü': 'u',
    'ű': 'u',
}
# The alphabet above also contains uppercase letters, so the mapping has to
# cover them too; otherwise uppercase accented letters would never be stripped.
LABELS.update({accented.upper(): plain.upper()
               for accented, plain in list(LABELS.items())})

class DataProcessor():
    """Collects unique words from text files and maps them to/from id sequences.

    On construction every file in ``file_list`` is read and the first
    whitespace-separated token of each accepted line is added to ``self.words``.

    Note on naming: ``decode`` maps characters to ids, and ``encode`` maps ids
    back to characters.
    """

    def __init__(self, file_list, encoding='utf-8'):
        """Read all files and collect the unique words.

        Args:
            file_list: iterable of paths of the files to read.
            encoding: text encoding used to open the files. It is explicit so
                that accented characters do not depend on the platform's
                default locale; pass ``None`` to use the platform default.
        """
        self.file_list = file_list
        self.encoding = encoding

        self.n = len(CH_TO_ID.keys())+1  # Zero is kept for padding and non-alphabetic characters
        self.max_len = 30   # words longer than this are skipped
        self.true_len = 0   # length of the longest word accepted so far

        self.ch_to_id = CH_TO_ID
        self.id_to_ch = {v:k for k,v in CH_TO_ID.items()}
        self.words = set()

        self.get_unique_words()

    @staticmethod
    def _first_token(line, max_len):
        """Return the first token of ``line``, or None if the line is skipped.

        A line is skipped when it starts with '#', contains no token at all
        (empty or whitespace-only), or its first token is longer than ``max_len``.
        """
        if line.startswith('#'):
            return None
        tokens = line.split()
        if not tokens:
            return None
        token = tokens[0]
        if len(token) > max_len:
            return None
        return token

    @staticmethod
    def _as_object_array(sequences):
        """Pack variable-length sequences into a 1-D object array.

        Filling the array element by element avoids ``np.array`` being asked to
        build an array from sequences of different lengths.
        """
        packed = np.empty(len(sequences), dtype=object)
        for position, sequence in enumerate(sequences):
            packed[position] = sequence
        return packed

    def get_unique_words(self):
        """Consume ``process_files`` and store every yielded word in ``self.words``."""
        self.words.update(self.process_files())

    def get_id(self, c):
        """Return the id of character ``c``, or ``PAD`` if it is not in the alphabet."""
        return self.ch_to_id.get(c, PAD)

    def get_char(self, idx, c = None):
        """Return the character for id ``idx``, or the fallback ``c`` if the id is unknown."""
        return self.id_to_ch.get(idx, c)

    def process_files(self):
        """Yield the first token of every accepted line of every input file.

        This is a generator. ``self.true_len`` is updated while iterating, so it
        only holds the final maximum once the generator has been exhausted.
        """
        for file in log_progress(self.file_list, every=1):
            with open(file, 'r', encoding=self.encoding) as f:
                for line in f:
                    word = self._first_token(line, self.max_len)
                    if word is None:
                        continue
                    if len(word) > self.true_len:
                        self.true_len = len(word)
                    yield word

    def decode(self, word):
        """Map an iterable of characters to a numpy array of ids."""
        return np.array([self.get_id(ch) for ch in word])

    def encode(self, idxs, orig = None):
        """Map a sequence of ids back to a numpy array of characters.

        Args:
            idxs: sequence of ids.
            orig: fallback characters, indexed by position, used where an id has
                no character (e.g. ``PAD``). Defaults to '$' at every position.
        """
        if orig is None:
            orig = '$'*len(idxs)

        return np.array([self.get_char(idx, orig[i]) for i, idx in enumerate(idxs)])

    def remove_accent(self, word):
        """Return a copy of ``word`` with every character found in ``LABELS`` replaced.

        Accepts a mutable sequence of characters (copied via ``.copy()``) or any
        other iterable of characters such as a ``str`` (copied into a list).
        """
        out = word.copy() if hasattr(word, 'copy') else list(word)
        for i,c in enumerate(word):
            if c in LABELS:
                out[i] = LABELS[c][0]
        return out

    def serve_data(self):
        """Build the model input/target id sequences for all collected words.

        Returns:
            A pair ``(x_in, y_out)`` of 1-D object arrays with one id array per
            word: ``x_in`` from the accent-stripped word, ``y_out`` from the
            original word. Words are processed in sorted order so the result
            does not depend on set iteration order.
        """
        x_in = []
        y_out = []
        for word in sorted(self.words):
            x_in.append(self.decode(self.remove_accent(list(word))))
            y_out.append(self.decode(word))

        return self._as_object_array(x_in), self._as_object_array(y_out)

In [None]:
# Maximum number of input files to read; None means "use all of them".
NR_OF_FILES = None
# Seed for the shuffling done by train_test_split.
SEED = 42
TRAIN_DATA_DIRS = [
                   'comments_20131001-20131201.nlp',
                   'comments_20131201-20140519.nlp',
                   'comments_20140519-20140921.nlp',
                  ]

# os.listdir returns entries in arbitrary order; sorting keeps the file list
# (and therefore the NR_OF_FILES slice below) stable between runs.
input_files = [os.path.join(data_dir, file_name)
               for data_dir in TRAIN_DATA_DIRS
               for file_name in sorted(os.listdir(data_dir))]
provider = DataProcessor(input_files[:NR_OF_FILES])

# X: id sequences of the accent-stripped words, Y: id sequences of the originals.
X, Y = provider.serve_data()
# Right-pad every sequence with PAD up to the length of the longest word read.
X = pad_sequences(maxlen=provider.true_len, sequences=X, padding="post", value=PAD)
Y = pad_sequences(maxlen=provider.true_len, sequences=Y, padding="post", value=PAD)
# Hold out 10% for testing; from here on X / Y are the training part only.
X, X_te, Y, Y_te = train_test_split(X, Y, test_size=0.1, random_state=SEED)

In [None]:
# Per-character tagger: embedding -> dropout -> bidirectional LSTM -> one
# dense projection per time step with provider.n output scores.
model = Sequential([
    Embedding(input_dim=provider.n, output_dim=128),
    Dropout(0.1),
    Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.1)),
    # Linear activation: sparse_cross_entropy treats these outputs as logits.
    TimeDistributed(Dense(provider.n, activation='linear')),
])

# decoder_target supplies the integer class ids that the loss compares against.
model.compile(
    optimizer='rmsprop',
    loss=sparse_cross_entropy,
    target_tensors=[decoder_target],
)

model.summary()

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss',
                   min_delta=0,
                   patience=3,
                   verbose=0, mode='auto')

# Train on the training split (X, Y) produced by train_test_split. The names
# x_in / y_out only exist inside DataProcessor.serve_data and are undefined at
# notebook level. 10% of the training data is held back for val_loss.
history = model.fit(X, Y,
                    batch_size=512,
                    epochs=50,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[es])

In [None]:
# Evaluate on the held-out split; the bare name on the last line makes the
# notebook display the result.
score = model.evaluate(x=X_te, y=np.array(Y_te))
score

In [None]:
import time

# The file name carries the current Unix time in whole seconds, so successive
# saves get different names.
# NOTE(review): 'saprse' looks like a typo for 'sparse'; kept as-is because it
# is the on-disk file name.
model.save('saprse_{}.h5'.format(int(time.time())))

In [None]:
class Predictor():
    """Restores accents in a word using a trained model and a data provider."""

    def __init__(self, model, provider):
        """
        Args:
            model: object with ``predict(batch)``; the result must support
                ``.argmax(axis=2)``, i.e. scores laid out as
                (batch, position, class).
            provider: object with ``decode(chars)`` (characters -> ids) and
                ``encode(ids, orig)`` (ids -> characters).
        """
        self.model = model
        self.provider = provider

    def accent(self, w):
        """Return ``w`` with the characters predicted by the model.

        Args:
            w: the word to process, as a string.

        Returns:
            The predicted word as a ``str``; an empty input yields ``''``
            without calling the model.
        """
        if not w:
            return ''
        ids = np.asarray(self.provider.decode(list(w)))
        # The model expects a batch of sequences: wrap the single word as a
        # batch of size 1 so the whole word is processed as ONE sequence
        # instead of one length-1 sequence per character.
        batch = ids.reshape(1, -1)
        prediction = self.model.predict(batch).argmax(axis=2)
        output = prediction[0]
        # encode falls back to the original character where an id has none.
        return ''.join(self.provider.encode(output, w))

# Wire the trained model and the data provider together for interactive use.
m = Predictor(model=model, provider=provider)

In [None]:
# Example call on an unaccented word; the notebook displays the returned string.
# NOTE(review): presumably the expected output is the accented form of the
# word -- confirm against a trained model.
m.accent('elnok')