In [89]:
import tensorflow as tf

import numpy as np
import os

In [90]:
# The transcript is German text containing umlauts and 'ß'. Decode it explicitly
# instead of relying on the platform's default encoding, which differs between
# systems. NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('./transcripts/117_Atoll.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [91]:
# Report the size of the corpus in characters.
print(f'length of text (characters): {len(text)}')

length of text (characters): 17658


In [92]:
# Peek at the first 250 characters of the transcript.
print(text[:250])

STEFAN:
Hallo und herzlich willkommen zu einer neuen Folge Podcast Ufo. Wir haben beide so nen Hals, wir haben keinen Bock mehr, alles funktioniert nicht so richtig. Die Technik und so.

FLORENTIN:
Ey wir wurden, wir wurden von amerikanischen Choreog


In [93]:
# Vocabulary: the distinct characters occurring in the text, in sorted order.
vocab = sorted(set(text))

In [94]:
# Report how many distinct characters the vocabulary holds.
print(f"{len(vocab)} unique characters")

74 unique characters


# Text Processing
## Vectorization

In [95]:
# Lookup tables in both directions; a character's id is its position in the
# sorted vocabulary.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole corpus as an array of character ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [96]:
# Pretty-print the first 20 (character -> id) entries of the lookup table.
print('{')
for position, char in enumerate(char2idx):
    if position >= 20:
        break
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  "'" :   4,
  '*' :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '0' :   9,
  '1' :  10,
  '2' :  11,
  '3' :  12,
  '4' :  13,
  '5' :  14,
  '6' :  15,
  '8' :  16,
  ':' :  17,
  '?' :  18,
  'A' :  19,
  ...
}


In [97]:
# Show the first 13 characters of the text next to the integer ids they were
# encoded to.
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'STEFAN:\nHallo' ---- characters mapped to int ---- > [37 38 23 24 19 32 17  0 26 44 55 55 58]


In [98]:
# Length, in characters, of a single model input. Each training chunk holds
# seq_length + 1 characters so that input and target can be offset by one.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids: one dataset element per character.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first five ids back into characters.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

S
T
E
F
A


In [99]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that is shorter than that is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first five chunks decoded back to text.
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))

'STEFAN:\nHallo und herzlich willkommen zu einer neuen Folge Podcast Ufo. Wir haben beide so nen Hals, '
'wir haben keinen Bock mehr, alles funktioniert nicht so richtig. Die Technik und so.\n\nFLORENTIN:\nEy w'
'ir wurden, wir wurden von amerikanischen Choreografen aus unserem Aufnahmeraum geschmissen, sind jetz'
't hier nebenan in der Maske.\n\nSTEFAN:\nWir sind reingelaufen, sie haben so gesagt: Und geht wieder rau'
's in fünf,sechs\n\nBEIDE:\nfünf, sechs, sieben, acht\n\nSTEFAN:\nund raus, raus, raus, raus...\n\nFLORENTIN:\n'


In [100]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair offset by one character.
dataset = sequences.map(split_input_target)

In [101]:
# Decode the first (input, target) pair to verify the one-character shift.
for input_example, target_example in dataset.take(1):
    input_str = ''.join(idx2char[input_example.numpy()])
    target_str = ''.join(idx2char[target_example.numpy()])
    print ('Input data: ', repr(input_str))
    print ('Target data:', repr(target_str))

Input data:  'STEFAN:\nHallo und herzlich willkommen zu einer neuen Folge Podcast Ufo. Wir haben beide so nen Hals,'
Target data: 'TEFAN:\nHallo und herzlich willkommen zu einer neuen Folge Podcast Ufo. Wir haben beide so nen Hals, '


In [102]:
# Walk through the first five time steps of the example pair: at each step
# the model receives one character and is expected to predict the next one.
for step, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    input_char = repr(idx2char[input_idx])
    expected_char = repr(idx2char[target_idx])
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, input_char))
    print("  expected output: {} ({:s})".format(target_idx, expected_char))

Step    0
  input: 37 ('S')
  expected output: 38 ('T')
Step    1
  input: 38 ('T')
  expected output: 23 ('E')
Step    2
  input: 23 ('E')
  expected output: 24 ('F')
Step    3
  input: 24 ('F')
  expected output: 19 ('A')
Step    4
  input: 19 ('A')
  expected output: 32 ('N')


## Create Training Batches

In [79]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 128

# Size of the buffer tf.data shuffles within (it does not need the whole
# dataset in memory at once).
BUFFER_SIZE = 10000

# Build the batched dataset from `sequences` rather than from the existing
# `dataset` variable: the previous `dataset = dataset.shuffle(...).batch(...)`
# stacked another shuffle/batch on already-batched data every time the cell was
# re-run, which with drop_remainder=True can leave no batches at all.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((128, 100), (128, 100)), types: (tf.float32, tf.float32)>

## Build the Model

In [80]:
# Number of distinct characters; build_model uses it both as the embedding
# input size and as the width of the final Dense layer.
vocab_size = len(vocab)

# Size of the vector each character id is embedded into.
embedding_dim = 256

# Number of units in each GRU layer.
rnn_units = 2048

In [81]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, num_rnn_layers=3):
    """Build the character-level model: Embedding -> stacked GRUs -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the number of outputs per time step.
        embedding_dim: Size of the character embedding vectors.
        rnn_units: Number of units in every GRU layer.
        batch_size: Fixed batch size of the model. The GRU layers are stateful,
            so the batch size is baked into the input shape; the sequence
            length is left variable.
        num_rnn_layers: Number of stacked GRU layers. Defaults to 3, which
            reproduces the previously hard-coded architecture.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose final Dense layer
        emits ``vocab_size`` values per time step with no activation applied.

    Raises:
        ValueError: If ``num_rnn_layers`` is smaller than 1.
    """
    if num_rnn_layers < 1:
        raise ValueError("num_rnn_layers must be at least 1")

    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    ]
    # Identical GRU layers; return_sequences=True keeps one output per time
    # step so the layers can be stacked and every position gets a prediction.
    for _ in range(num_rnn_layers):
        layers.append(
            tf.keras.layers.GRU(
                rnn_units,
                return_sequences=True,
                stateful=True,
                recurrent_initializer='glorot_uniform',
            )
        )
    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)

In [82]:
# Training model: its fixed batch size is BATCH_SIZE, matching the batched dataset.
model = build_model(vocab_size= len(vocab), embedding_dim=embedding_dim, rnn_units= rnn_units, batch_size= BATCH_SIZE)

In [83]:
# Inspect the targets of one training batch.
for input_example_batch, target_example_batch in dataset.take(1):
    print(target_example_batch)

tf.Tensor(
[[0.9459459  0.8378378  0.6486486  ... 0.         0.         0.3243243 ]
 [0.6216216  0.6891892  0.7837838  ... 0.8243243  0.01351351 0.7027027 ]
 [0.8108108  0.6351351  0.8243243  ... 0.7702703  0.7702703  0.01351351]
 ...
 [0.8108108  0.01351351 0.6891892  ... 0.8243243  0.6486486  0.7567568 ]
 [0.7702703  0.8243243  0.6486486  ... 0.7702703  0.6486486  0.7702703 ]
 [0.01351351 0.6756757  0.5945946  ... 0.8378378  0.22972973 0.01351351]], shape=(128, 100), dtype=float32)


In [84]:
# Run the untrained model on one batch to check the output shape.
# The loop variables and `example_batch_predictions` stay bound after the loop
# and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(128, 100, 74) # (batch_size, sequence_length, vocab_size)


In [85]:
# Layer-by-layer overview of the training model with parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (128, None, 256)          18944     
_________________________________________________________________
gru_6 (GRU)                  (128, None, 2048)         14168064  
_________________________________________________________________
gru_7 (GRU)                  (128, None, 2048)         25178112  
_________________________________________________________________
gru_8 (GRU)                  (128, None, 2048)         25178112  
_________________________________________________________________
dense_2 (Dense)              (128, None, 74)           151626    
Total params: 64,694,858
Trainable params: 64,694,858
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Sample one character id per time step from the predictions for the first
# sequence of the batch, then drop the trailing num_samples axis.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [34]:
# Display the sampled character ids (one per time step).
sampled_indices

array([16,  6, 48, 10, 28, 42, 19, 10, 31, 23, 16,  2,  2, 20, 31, 47, 42,
       31, 22, 54, 10, 12, 73,  7, 18, 73, 27, 62, 11, 35, 48,  6, 25,  1,
       59, 58,  4, 32, 39, 42, 31, 64, 70, 21, 72, 18,  5, 34, 48, 18, 54,
       52, 13, 61, 35, 62,  1,  0, 34, 56, 56, 22,  8, 43, 10, 45, 44,  5,
       45, 59, 22, 23, 52, 36, 13, 57, 46, 11, 22,  7, 35,  5, 29, 45, 17,
       41, 68, 56, 60, 33,  7, 38, 10, 17, 25, 41, 31,  4,  2, 28])

In [35]:
# Decode the first input sequence and the ids sampled from the untrained
# model, then show them one after the other.
input_text = "".join(idx2char[input_example_batch[0]])
predicted_text = "".join(idx2char[sampled_indices ])
print("Input: \n", repr(input_text))
print()
print("Next Char Predictions: \n", repr(predicted_text))

Input: 
 ' Paradies rum. Die kennen halt nichts anderes. Die kennen nichts anderes als dieses blaue lächerlich'

Next Char Predictions: 
 "8,e1JYA1ME8!!BMdYMDk13ü-?üIt2Qe,G po'NUYMvßCö?*Pe?ki4sQt \nPmmD.Z1ba*bpDEiR4nc2D-Q*Kb:WzmrO-T1:GWM'!J"


## Training

In [36]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's last layer applies no
    softmax, so its outputs are treated as unnormalized scores.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

# Loss of the untrained model on the example batch, averaged to one number.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (128, 100, 74)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.3037944


In [37]:
# Configure training: Adam optimizer with the cross-entropy `loss` defined above.
model.compile(optimizer='adam', loss=loss)

In [39]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Checkpoint file name pattern; "{epoch}" is filled in with the epoch number
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save the weights only, and only for epochs where the training loss improved
# (mode='min': lower is better). The comma after the `monitor` argument was
# missing before, which made this cell a SyntaxError.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_best_only=True,
    monitor='loss',
    mode='min',
    save_weights_only=True,
)

In [40]:
# Empty the checkpoint directory before training. Create it first so that
# os.listdir does not raise FileNotFoundError on a fresh run, when no
# checkpoint has been written yet.
os.makedirs(checkpoint_dir, exist_ok=True)
for stale_name in os.listdir(checkpoint_dir):
    stale_path = os.path.join(checkpoint_dir, stale_name)
    # os.remove cannot delete directories, so only files and links are removed.
    if os.path.isfile(stale_path) or os.path.islink(stale_path):
        os.remove(stale_path)

In [41]:
# Number of passes over the training dataset.
EPOCHS = 200

In [42]:
# Train the model; the checkpoint callback saves weights during training.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/200
Epoch 2/200


KeyboardInterrupt: 

In [254]:
# Show the most recent checkpoint found in the checkpoint directory.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_200'

In [255]:
# Resolve the checkpoint first: tf.train.latest_checkpoint returns None when
# the directory holds no checkpoint, and passing None on to load_weights gives
# an unhelpful error only after the large model has already been built.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model first."
    )

# Rebuild the network with batch_size=1 for text generation. The GRU layers are
# stateful, so the batch size is fixed when the model is built and the training
# model (built with BATCH_SIZE) cannot be fed a single sequence.
# NOTE(review): the checkpoint has to come from a model with the same
# architecture as build_model currently produces — confirm before loading.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size= 1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [256]:
# Layer-by-layer overview of the rebuilt batch-size-1 model.
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (1, None, 256)            17920     
_________________________________________________________________
gru_12 (GRU)                 (1, None, 2048)           14168064  
_________________________________________________________________
gru_13 (GRU)                 (1, None, 2048)           25178112  
_________________________________________________________________
dense_11 (Dense)             (1, None, 70)             143430    
Total params: 39,507,526
Trainable params: 39,507,526
Non-trainable params: 0
_________________________________________________________________


In [257]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text with the trained model, one character at a time.

    Args:
        model: Model called as ``model(batch_of_ids)``; here it is used with a
            batch of one sequence and must provide ``reset_states()``.
        start_string: Seed text. Every character must be a key of ``char2idx``.
        num_generate: Number of characters to generate. Defaults to 1000, the
            previously hard-coded value.
        temperature: Divisor applied to the predictions before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text. Defaults to 1.0, the
            previously hard-coded value.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: If ``start_string`` is empty, ``num_generate`` is negative
            or ``temperature`` is not positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if num_generate < 0:
        raise ValueError("num_generate must not be negative")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Converting our start string to numbers (vectorizing) and adding the
    # batch dimension
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters, joined once at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from a categorical distribution over the scaled predictions
        # and keep the sample belonging to the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [258]:
# Generate text seeded with a speaker label and print the result.
print(generate_text(model, start_string=u"STEFAN:"))

STEFAN:
Hallo lier nebenat in der Nomnit Problem, weil in eintschen aro der Strand wett. Und die kleinen verschieden wierer hant mit Mantarochen.

STEFAN:
Ja, nee Wasse andere, weil sie gerade auf einem sinkenden Schiff, in einerschied zwischen einer Lagune. Lagune ist ein bisschen, hat mehr sowas... Ist Lagune nicht so ähh kriege?

STEFAN:
Nichts richtig, nichts rier nicht so richtig. Die Technik und so.

FLORENTIN:
Ey was, weil dann müsst ich ja aktiv meiß Leben besserne, mit in ehn paar Youtube Videos angoch. Neunzig Prozent der Landesfläche bist du das inter Wass untergehen.

FLORENTIN:
Um sich Dänemark zu kaufen.

STEFAN:
Oh jo. Ich glaube Bucht in. TFZis man nechn hast ein Ahoplo Sder ähe den ganzen Topf dem Galfes somen. Ao rind nich alle Leute. Ich merk, die Wogen schlagen hoch. Der höchste Präbieden. Äh ich glaub Lagune, Lagune hit und es ist toll.

FLORENTIN:
Ja des ist auch so... Aber ns war ech willkommen zu einer nie Lanuben. Da wohnen, das ist ne Insel, ist ähnlich groß w

In [103]:
import sys
def generate_text2(model, start_string, temperature=1.0, paragraphs= 10, max_chars=None):
    """Stream generated text to stdout until enough line breaks were produced.

    The seed and every generated character are written to ``sys.stdout`` as
    soon as they are available; nothing is returned.

    Args:
        model: Model called as ``model(batch_of_ids)``; here it is used with a
            batch of one sequence and must provide ``reset_states()``.
        start_string: Seed text. Every character must be a key of ``char2idx``.
        temperature: Divisor applied to the predictions before sampling.
        paragraphs: Number of paragraphs to generate. Generation stops once
            ``2 * paragraphs`` newline characters have been generated.
        max_chars: Optional upper bound on the number of generated characters.
            ``None`` (the default) means no bound, in which case the loop only
            ends when the model has produced enough newlines.

    Raises:
        ValueError: If ``start_string`` is empty, ``temperature`` is not
            positive or ``max_chars`` is negative.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must not be negative")

    # Doubled because of the line break that follows each "NAME:" label.
    newline_target = paragraphs * 2

    num_newlines = 0
    num_generated = 0

    # Vectorize the seed and add the batch dimension.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    sys.stdout.write(start_string)

    model.reset_states()
    while num_newlines < newline_target:
        # Safety net: without it the loop never ends if no newline is sampled.
        if max_chars is not None and num_generated >= max_chars:
            break

        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from a categorical distribution over the scaled predictions
        # and keep the sample belonging to the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        next_char = idx2char[predicted_id]
        sys.stdout.write(next_char)
        num_generated += 1

        if next_char == '\n':
            num_newlines += 1

In [104]:
# Stream generated paragraphs with a lower temperature.
# NOTE(review): the ValueError recorded below this cell (shape (6, 128, 2048) vs
# [6, 1, 2048]) suggests `model` was still the stateful training model built
# with BATCH_SIZE=128 when this ran; rebuild it with batch_size=1 and load the
# checkpoint weights before calling — confirm by running the cells in order.
generate_text2(model, start_string=u"STEFAN", temperature=0.5)



ValueError: Tensor's shape (6, 128, 2048) is not compatible with supplied shape [6, 1, 2048]