# Run it

To run the notebook, build the Docker image and then start the container:

```
docker build -f Keras.Dockerfile -t keras .
docker run -v ${PWD}:/notebooks -p 8888:8888 keras
```

## Data preparation



In [1]:

def load_sentences(file_path):
    """Read every sentence of a corpus XML file as a space-joined string.

    The children of the document root are treated as articles. Each direct
    ``<s>`` child of an article is one sentence, and each child element of a
    sentence contributes its text as one word. Children without text are
    skipped.

    Args:
        file_path: Path of the XML document to parse.

    Returns:
        A list with one string per sentence, in document order. Every word
        is preceded by a single space, so a non-empty sentence starts with a
        space and a sentence without any word text yields ``""``.
    """
    # ``xml.etree.cElementTree`` was deprecated in Python 3.3 and removed in
    # Python 3.9; ``ElementTree`` uses the C accelerator automatically.
    from xml.etree import ElementTree as ET

    root = ET.parse(file_path).getroot()
    return [
        # Join once per sentence instead of growing a string word by word.
        "".join(" " + word.text for word in sentence if word.text is not None)
        for article in root
        for sentence in article.findall("s")
    ]

In [2]:
# Load the five sub-corpora; each name ends up bound to the sentence list of
# the file listed at the same position.
blick, blogs, schobinger, swatch, wiki = map(load_sentences, (
    '../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blick.xml',
    '../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blogs.xml',
    '../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/schobinger.xml',
    '../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/swatch.xml',
    '../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/wiki.xml',
))

# One combined list, keeping the per-corpus order.
all_sentences = [*blick, *blogs, *schobinger, *swatch, *wiki]

In [3]:
def flatten(seqs):
    """Concatenate an iterable of sequences into one flat list.

    Args:
        seqs: Iterable whose items are themselves iterable (for example a
            list of token-index lists).

    Returns:
        A new list holding the items of every inner sequence, in order.
    """
    flat = []
    for s in seqs:
        # Extend in place: ``flat = flat + s`` copies the whole accumulated
        # list on every iteration, which is quadratic in the total length.
        flat.extend(s)
    return flat

from itertools import islice

def window(seq, n=2):
    """Yield a sliding window (of width n) over the items of ``seq``.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Each window is a tuple. Nothing is yielded when ``seq`` has fewer than
    ``n`` items.
    """
    source = iter(seq)
    current = tuple(islice(source, n))
    if len(current) < n:
        # The input ran out before a first full window could be formed.
        return
    yield current
    for item in source:
        # Slide by one: drop the oldest item and append the newest.
        current = (*current[1:], item)
        yield current

## Preprocess


In [82]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Vocabulary size handed to the tokenizer; the model cell also uses it as the
# layer width.
num_words = 2000
# Number of preceding words in each training window (window width is
# max_length + 1).
max_length = 1

# Word-level tokenizer: lower-cases the text, strips the listed punctuation
# and splits on single spaces.
t = Tokenizer(
    num_words=num_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)
# Build the word index from every loaded sentence.
t.fit_on_texts(all_sentences)

def lookup_word(index):
    """Return the word whose tokenizer index equals ``index``.

    Scans ``t.word_index`` (word -> index) and returns the first matching
    word, or ``"<unk>"`` when no word carries that index.
    """
    matches = (word for word, word_id in t.word_index.items() if word_id == index)
    return next(matches, "<unk>")

# Encode every sentence as a list of word indices.
seqs = t.texts_to_sequences(all_sentences)
# One-hot encode the concatenated index stream. The width is pinned to
# num_words because to_categorical otherwise infers it from the largest index
# that happens to occur, which would no longer match the model's
# input_shape=[num_words] if the highest-ranked words were missing.
seq = to_categorical(flatten(seqs), num_classes=num_words)

# Consecutive runs of max_length + 1 one-hot rows over the whole stream.
data = list(window(seq, max_length+1))

# Input: the last context row of each window; target: the row that follows it.
xs = [x[max_length-1] for x in data]
ys = [x[max_length] for x in data]

[ 0.  1.  0. ...,  0.  0.  0.]


In [75]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten
from keras.layers.recurrent import LSTM

import numpy as np

# A single dense layer mapping a num_words-wide input vector to one output
# value per vocabulary entry.
model = Sequential()
model.add(Dense(num_words, input_shape=[num_words]))

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the cell output.
model.summary()

# NOTE(review): the layer has no activation and is trained with MSE against
# one-hot targets. Softmax with categorical cross-entropy is the usual setup
# for next-word classification; left unchanged here so results stay
# comparable. Confirm whether that is intended.
model.compile(optimizer='adam',
              loss='mse',
              metrics=['accuracy'])

model.fit(np.array(xs), np.array(ys), epochs=15)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 2000)              4002000   
Total params: 4,002,000
Trainable params: 4,002,000
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4e954b3630>

In [84]:
# Run the model on the first two training inputs.
result = model.predict(np.array(xs[:2]))

# Show the highest-scoring next word for the second input.
predicted_index = np.argmax(result[1])
print(lookup_word(predicted_index))


isch
