In [1]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import tensorflow as tf

In [2]:
with open('./transcripts/117_Atoll.txt', 'r') as f:
    text = f.read()

In [3]:
def tokenize_words(input):
    input = input.lower()

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    filtered = filter(lambda token : token not in stopwords.words('german'), tokens)

    return ''.join(filtered)

In [4]:
processed_inputs = tokenize_words(text)

In [5]:
chars = sorted(list(set(processed_inputs)))

In [6]:
char_to_num = dict((c,i) for i, c in enumerate(chars))

In [7]:
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 9135
Total vocab: 38


In [8]:
seq_length = 100
x_data = []
y_data = []

In [9]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [10]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 9035


In [11]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [12]:
y = tf.keras.utils.to_categorical(y_data)

In [22]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_len, seq_length, input_length=seq_length))
model.add(tf.keras.layers.LSTM(100, return_sequences=True))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(y.shape[1], activation='softmax'))

In [23]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          3800      
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 100)          80400     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 38)                3838      
Total params: 168,438
Trainable params: 168,438
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [25]:
filepath = "model_weights_saved.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [26]:
model.fit(X, y, epochs=40, batch_size=128, callbacks=desired_callbacks)

Epoch 1/40
Epoch 00001: loss improved from inf to 3.03636, saving model to model_weights_saved.hdf5
Epoch 2/40
Epoch 00002: loss improved from 3.03636 to 2.94479, saving model to model_weights_saved.hdf5
Epoch 3/40
Epoch 00003: loss improved from 2.94479 to 2.94397, saving model to model_weights_saved.hdf5
Epoch 4/40
Epoch 00004: loss improved from 2.94397 to 2.94124, saving model to model_weights_saved.hdf5
Epoch 5/40
Epoch 00005: loss improved from 2.94124 to 2.93962, saving model to model_weights_saved.hdf5
Epoch 6/40
Epoch 00006: loss improved from 2.93962 to 2.93962, saving model to model_weights_saved.hdf5
Epoch 7/40
Epoch 00007: loss did not improve from 2.93962
Epoch 8/40
Epoch 00008: loss did not improve from 2.93962
Epoch 9/40
Epoch 00009: loss improved from 2.93962 to 2.93883, saving model to model_weights_saved.hdf5
Epoch 10/40
Epoch 00010: loss did not improve from 2.93883
Epoch 11/40
Epoch 00011: loss did not improve from 2.93883
Epoch 12/40
Epoch 00012: loss improved fro

KeyboardInterrupt: 

In [27]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [28]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [29]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" efantatsächlichbisschenrochenliebengelerntähdachteerstwassergegangenähhalttypischesosteekindähhaltäh "


In [30]:
for i in range(10):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

eeeeeeeeee