In [1]:
# Working with the book "verwandlung" von Franz Kafka
# Book pages from 60-1952

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
with open('verwandlung.txt', encoding="utf_8", mode='r') as file: # make sure its utf_8 encoding for german texts
    contents = file.read()
    
full_text = "\n".join(contents.split("\n")[59:1952])

In [4]:
## Explore the dataset

# len(contents.split("\n")[59:1952]) # Split at end of each line
# contents.split("\n")[59:1952] # Every line is an element in this list

full_text = "\n".join(contents.split("\n")[59:1952]) # join all lines to get the complete text

# len(set(full_text)) # How many different chars are in full text set() sth like unique

unique_chars = set(full_text)

# enumerate(unique_chars) # returns a generator

In [5]:
# Create a mapping
int_to_char = {}
char_to_int = {}

for i,j in enumerate(unique_chars):
    int_to_char[i] = j
    char_to_int[j] = i

In [6]:
length = 40 # Counter Var for taking 40 chars into account

# Create Placeholders
X = []
y = []

for i in range(0, len(full_text) - length): # - lenght prevents the loop to go over the last char
    
    line = full_text[i:i+length] # gets the first 40 chars
    X.append([char_to_int[c] for c in line])  # Since chars/strings are no use to our NN we must convert them to ints
    y.append(char_to_int[full_text[i+length]])

"""
# What happens in the for loop: 
line =  "Als Gregor Samsa eines Morgens aus unruh"
int_line = [char_to_int[c] for c in line]

print(char_to_int['A'])
print(char_to_int['l'])
print(char_to_int['s'])

print(int_line)
"""

print(X[:4])
print(y[:4])


[[12, 45, 52, 25, 66, 49, 9, 28, 2, 49, 25, 29, 48, 61, 52, 48, 25, 9, 13, 0, 9, 52, 25, 10, 2, 49, 28, 9, 0, 52, 25, 48, 20, 52, 25, 20, 0, 49, 20, 24], [45, 52, 25, 66, 49, 9, 28, 2, 49, 25, 29, 48, 61, 52, 48, 25, 9, 13, 0, 9, 52, 25, 10, 2, 49, 28, 9, 0, 52, 25, 48, 20, 52, 25, 20, 0, 49, 20, 24, 13], [52, 25, 66, 49, 9, 28, 2, 49, 25, 29, 48, 61, 52, 48, 25, 9, 13, 0, 9, 52, 25, 10, 2, 49, 28, 9, 0, 52, 25, 48, 20, 52, 25, 20, 0, 49, 20, 24, 13, 28], [25, 66, 49, 9, 28, 2, 49, 25, 29, 48, 61, 52, 48, 25, 9, 13, 0, 9, 52, 25, 10, 2, 49, 28, 9, 0, 52, 25, 48, 20, 52, 25, 20, 0, 49, 20, 24, 13, 28, 9]]
[13, 28, 9, 0]


In [7]:
# Prepare data for input with on hot encoding
from keras.utils import to_categorical

X = to_categorical(X, num_classes = len(unique_chars))
y = to_categorical(y, num_classes = len(unique_chars)) 

Using TensorFlow backend.


In [8]:
X.shape

(121090, 40, 68)

In [15]:
# Create mode
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import ModelCheckpoint

In [16]:
model = Sequential()
model.add(LSTM(128, input_shape=(40,68)))
model.add(Dense(68, activation="softmax")) # We have 68 different chars
          
model.compile(optimizer="adam", loss="categorical_crossentropy")

save_model = ModelCheckpoint("weigts.{epoch:02d}-{loss:.2f}.hdf5") # Saves the model after every epoch

In [17]:
model.fit(X,y, batch_size=32, epochs=3, callbacks=[save_model])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x28fa8e3d828>

In [18]:
# We also need the mapping informations to use the model on different data

import pickle

with open("int_to_char.pickle","wb") as file: 
    pickle.dump(int_to_char,file)

with open("char_to_int.pickle","wb") as file: 
    pickle.dump(char_to_int,file)