# Run it

To run this notebook, build the Docker image and start the container from the repository root:

`docker build -f Keras.Dockerfile -t keras .`

`docker run -v ${PWD}:/notebooks -p 8888:8888 keras` 

## Data preparation



In [1]:

def load_sentences(file_path):
    """Read an XML corpus file and return its sentences as plain strings.

    The root element's children are treated as articles; within each article
    every direct child element tagged ``s`` is one sentence. The text of each
    child element of a sentence is appended, preceded by a single space, so a
    non-empty sentence starts with a space and a sentence without any text
    yields ``""``. Child elements whose text is ``None`` are skipped.

    Args:
        file_path: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        list[str]: One string per ``s`` element, in document order.
    """
    # xml.etree.cElementTree was a deprecated alias and was removed in
    # Python 3.9; ElementTree uses the C accelerator automatically.
    from xml.etree import ElementTree as ET

    root = ET.parse(file_path).getroot()
    sentences = []
    for article in root:
        for sentence in article.findall("s"):
            # Keep the historical " word word" shape (leading space included)
            # so downstream consumers see exactly the same strings.
            sentences.append(
                "".join(" " + word.text for word in sentence if word.text is not None)
            )
    return sentences

In [2]:
# Load each corpus file into its own list of sentence strings.
blick = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blick.xml')
blogs = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/blogs.xml')
schobinger = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/schobinger.xml')
swatch = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/swatch.xml')
wiki = load_sentences('../data/NOAHsCorpusOfSwissGermanDialects_Release2.1/wiki.xml')

# Single combined list, in the same source order as above.
all_sentences = [*blick, *blogs, *schobinger, *swatch, *wiki]

In [23]:
# Sanity check: total number of sentences across all loaded corpus files.
print(len(all_sentences))

7455


In [4]:
def flatten(seqs):
    """Concatenate a sequence of sequences into one new flat list.

    Args:
        seqs: Iterable of iterables (e.g. a list of token-id lists).

    Returns:
        list: All items of every inner sequence, in order. Empty input
        gives ``[]``.
    """
    # One pass over the items; the previous `seq = seq + s` re-copied the
    # whole accumulator on every iteration (quadratic in total length).
    return [item for s in seqs for item in s]

from itertools import islice

def window(seq, n=2):
    """Yield a sliding window (tuples of width n) over the items of `seq`.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Nothing is yielded when `seq` has fewer than n items.
    """
    stream = iter(seq)
    # Prime the first window with up to n items.
    frame = tuple(islice(stream, n))
    if len(frame) == n:
        yield frame
    # Slide by one: drop the oldest item, append the newest.
    for newest in stream:
        frame = (*frame[1:], newest)
        yield frame

## Preprocess


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Vocabulary cap passed to the Tokenizer and reused as the model's
# embedding input size / output layer width.
# NOTE(review): per the Keras docs only the most frequent num_words-1 words
# are kept by texts_to_sequences — confirm against the installed version.
num_words = 3000
# Number of context tokens per training example; the token that follows
# them is used as the prediction target.
max_length = 3

# Word-level tokenizer: lowercases, splits on spaces and strips the listed
# punctuation (including the typographic apostrophe ’ and the angle brackets).
t = Tokenizer(num_words=num_words,
           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n’\'',
           lower=True,
           split=" ",
           char_level=False)
# Build the word -> index vocabulary from the full corpus.
t.fit_on_texts(all_sentences)

def lookup_word(index):
    """Return the word the tokenizer `t` maps to `index`, or "<unk>" if none.

    Performs a linear scan over `t.word_index` and returns the first match.
    """
    matching_words = (word for word, idx in t.word_index.items() if idx == index)
    return next(matching_words, "<unk>")

# Encode every sentence as a list of token ids, then join them into one long
# stream. Windows are taken over the joined stream, so they can span the
# boundary between two consecutive sentences.
seqs = t.texts_to_sequences(all_sentences)
seq = flatten(seqs)

# Each window holds max_length context tokens followed by the target token.
data = list(window(seq, max_length+1))

xs = [x[:max_length] for x in data]
# Fix the one-hot width to num_words so it always matches the model's
# Dense(num_words) output; without num_classes the width is inferred from
# the largest token id that happens to occur as a target.
ys = to_categorical([x[max_length] for x in data], num_classes=num_words)

Using TensorFlow backend.


In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten
from keras.layers.recurrent import LSTM

import numpy as np

# Next-word model: embed the max_length context token ids, run them through
# an LSTM, and produce a softmax distribution over the num_words vocabulary.
model = Sequential()
model.add(Embedding(num_words, 128, input_length=max_length))
model.add(LSTM(128, activation='relu'))
model.add(Dense(num_words))
model.add(Activation('softmax'))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])





_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3, 128)            384000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 3000)              387000    
_________________________________________________________________
activation_1 (Activation)    (None, 3000)              0         
Total params: 902,584
Trainable params: 902,584
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Train on the context windows (inputs) and their one-hot next-word targets.
model.fit(x=np.array(xs), y=np.array(ys), epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fcadb484828>

In [21]:
# Encode a three-word prompt and ask the model for the next-word distribution.
# The variable is deliberately not called `input`: at notebook top level that
# would shadow the builtin input() for the rest of the kernel session.
prompt_seq = t.texts_to_sequences(["ich zmitzt im"])
result = model.predict(np.array(prompt_seq))

# Indices of the ten most probable next tokens, most probable first.
top = sorted(range(len(result[0])), key=lambda i: result[0][i], reverse=True)[:10]
for i in top:
    print(lookup_word(i))

ander
emal
wììrt
soo
rächt
chönt
so
chöne
uf
«


In [33]:
def generate(a, b, c, accl=None, pos=0, max=10):
    """Greedily extend the three-word prompt (a, b, c) and print the result.

    At each step the last three words are encoded with the tokenizer `t`,
    the model predicts a next-word distribution, and the most probable word
    is appended. The window then slides forward by one word.

    Args:
        a, b, c: The three context words (strings) to start from.
        accl: Accumulator list the generated words are appended to. Ignored
            (replaced by ``[a, b, c]``) when ``pos`` is 0.
        pos: Number of words already generated; generation stops at ``max``.
        max: Total number of words to generate. (Name kept for backward
            compatibility with keyword callers although it shadows the
            builtin inside this function.)

    Returns:
        None. The accumulated word list is printed.
    """
    if pos == 0:
        accl = [a, b, c]
    elif accl is None:
        # Avoid a shared mutable default when resuming without an accumulator.
        accl = []

    # Iterative instead of recursive: no recursion-depth limit for large
    # `max`, and `pos > max` terminates instead of recursing forever.
    while pos < max:
        # NOTE(review): the tokenizer filters strip "<" and ">", so a
        # generated "<unk>" is re-encoded as "unk" on the next step — confirm
        # this is acceptable.
        encoded = t.texts_to_sequences([a + " " + b + " " + c])
        result = model.predict(np.array(encoded))

        # Only the single best index is needed; argmax returns the first
        # maximum, matching the previous stable descending sort.
        w = lookup_word(int(np.argmax(result[0])))

        accl.append(w)
        a, b, c = b, c, w
        pos += 1

    print(accl)

generate("ich", "bin", "so", max=100)

['ich', 'bin', 'so', 'froh', 'dass', 'ich', 'mis', 'lebe', 'do', 'id', 'stadt', 'go', 'luege', 'und', 'ned', 'z', 'herzig', 'gsi', 'und', 'sie', 'sind', 'soooo', 'fein', 'das', 'hets', 'öpe', 'am', 'bahnhof', 'la', 'inere', 'uf', 'e', 'insle', 'd', 'sunne', 'im', 'herze', 'für', 'franke', 'das', 'isch', 'echt', 'cool', 'gsi', 'usserdem', 'hets', 'echt', 'gha', 'womer', 'wo', 'zäme', 'sehr', 'viel', 'da', 'werded', 'afange', 'hani', 'grad', 'na', 'muese', 'etc', 'etc', 'uf', 'jede', 'fall', 'anen', 'aalass', 'ich', 'sogar', 'e', 'richtigi', 'oben', 'isches', 'so', 'angscht', 'für', 'en', 'wo', 'ab', 'recht', 'em', 'da', 'und', 'uf', 'de', 'foeteli', 'gsehnd', 'ish', 'd', 'au', 'vom', 'und', 'dä', 'vom', 'unternehmä', 'hayek', 'sowiä', 'dank', 'dä', 'und', 'dä', 'vom', 'unternehmä']
