In [50]:
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, TimeDistributed
from keras.utils import to_categorical

# Data Preprocessing

In [3]:
# download data
# Fetch the corpus over HTTPS, and only when no local copy exists yet, so that
# "Restart & Run All" does not hit the network on every run.
DATA_URL = "https://www.gutenberg.org/files/11/11-0.txt"
DATA_PATH = Path("../data/11-0.txt")

if not DATA_PATH.exists():
    # urlretrieve does not create missing parent directories itself
    DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    urllib.request.urlretrieve(DATA_URL, filename=DATA_PATH)

('../data/11-0.txt', <http.client.HTTPMessage at 0x2a0dc4790>)

In [7]:
# load data
# Each raw line is stripped of surrounding whitespace, decoded with non-ASCII
# bytes dropped, and lower-cased; lines that end up empty are discarded.
with open("../data/11-0.txt", "rb") as f:
    normalized_lines = (
        raw_line.strip().decode('ascii', 'ignore').lower()
        for raw_line in f
    )
    sentences = [line for line in normalized_lines if len(line) > 0]

print(sentences[:10])

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll', 'this ebook is for the use of anyone anywhere in the united states and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. you may copy it, give it away or re-use it under the terms', 'of the project gutenberg license included with this ebook or online at', 'www.gutenberg.org. if you are not located in the united states, you', 'will have to check the laws of the country where you are located before', 'using this ebook.', 'title: alices adventures in wonderland', 'author: lewis carroll']


In [25]:
# Merge the cleaned lines into one corpus string, with a single space between
# consecutive lines, then show a short preview and the total character count.
data = str.join(" ", sentences)
print(data[:150], len(data), sep="\n")

the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere in the united states and
159484


In [26]:
# Build the character vocabulary (sorted unique characters of the corpus) and
# the two lookup tables that translate between characters and integer ids.
char_vocab = sorted(set(data))
char_to_index_pair = {char: index for index, char in enumerate(char_vocab)}
index_to_char_pair = {index: char for index, char in enumerate(char_vocab)}
print(char_to_index_pair)

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [28]:
# split data into n_sample
split_unit = 60
# Each sample's target is its input shifted right by one character, so the last
# sample needs one character beyond n_sample * split_unit. Reserving that
# trailing character keeps every target exactly split_unit long even when
# len(data) is an exact multiple of split_unit.
n_sample = max((len(data) - 1) // split_unit, 0)
print(n_sample)

2658


In [40]:
# make data sets
# Cut the corpus into n_sample consecutive windows of split_unit characters and
# encode each character as its vocabulary index. The target window is the same
# slice moved one character to the right (next-character prediction).
window_starts = range(0, n_sample * split_unit, split_unit)

X_train = [
    [char_to_index_pair[c] for c in data[start:start + split_unit]]
    for start in window_starts
]
y_train = [
    [char_to_index_pair[c] for c in data[start + 1:start + split_unit + 1]]
    for start in window_starts
]

print(X_train[0])
print(y_train[0])

[49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30]
[37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]


In [48]:
# one-hot encoding
# Pin the number of classes to the vocabulary size. Without num_classes,
# to_categorical infers it from the largest index present in each array, which
# can be smaller than len(char_vocab) (the vocabulary is built from the whole
# text, the samples are not) and would then disagree with the model's shapes.
n_char = len(char_vocab)
X_train_encoded = to_categorical(X_train, num_classes=n_char)
y_train_encoded = to_categorical(y_train, num_classes=n_char)

print(X_train_encoded.shape) # (n_sample, split_unit, n_char)
print(y_train_encoded.shape) # (n_sample, split_unit, n_char) for n to n rnn

X_train_encoded[0]

(2658, 60, 56)
(2658, 60, 56)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Modeling

In [55]:
# Character-level language model: two stacked LSTM layers, each returning the
# full sequence, followed by a per-timestep softmax over the vocabulary.
hidden_units = 256

# Seed Python, NumPy and TensorFlow before any weights are created so that
# re-running the notebook gives comparable training runs.
keras.utils.set_random_seed(42)

model = Sequential()
# input_shape=(None, n_char): variable sequence length, one-hot characters
model.add(LSTM(units=hidden_units, input_shape=(None, len(char_vocab)), return_sequences=True))
model.add(LSTM(units=hidden_units, return_sequences=True))
model.add(TimeDistributed(Dense(units=len(char_vocab), activation='softmax')))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keep the History object so the loss/accuracy curves can be inspected later
history = model.fit(X_train_encoded, y_train_encoded, epochs=10, verbose=2)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, None, 256)         320512    
                                                                 
 lstm_9 (LSTM)               (None, None, 256)         525312    
                                                                 
 time_distributed_4 (TimeDi  (None, None, 56)          14392     
 stributed)                                                      
                                                                 
Total params: 860216 (3.28 MB)
Trainable params: 860216 (3.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
84/84 - 20s - loss: 3.0664 - accuracy: 0.1838 - 20s/epoch - 244ms/step
Epoch 2/10
84/84 - 23s - loss: 2.6932 - accuracy: 0.2580 - 23s/epoch - 275ms/step
Epoch 3/10
84/84 - 23s - loss: 2.3622 - accuracy: 0.

<keras.src.callbacks.History at 0x165d0e110>

In [69]:
# generating text character by character from a seed string
def generate(model, char, n):
    """Greedily extend a seed string by ``n`` characters and print the result.

    The seed is one-hot encoded with one timestep per character. At every step
    the sequence built so far is passed to ``model.predict``; the most probable
    character at the last timestep is appended and fed back in as the next input.

    Args:
        model: Object with a ``predict`` method. Assumed to return one
            probability distribution per input timestep, i.e. an array of shape
            (1, timesteps, n_char).
        char: Seed text of one or more characters; every character must be a
            key of ``char_to_index_pair``.
        n: Number of characters to generate after the seed.

    Raises:
        ValueError: If ``char`` is empty or ``n`` is negative.
        KeyError: If the seed contains a character outside the vocabulary.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if len(char) == 0:
        raise ValueError("char must contain at least one seed character")

    seed_indices = [char_to_index_pair[c] for c in char]
    n_seed = len(seed_indices)

    # (1, n_seed + n, n_char): room for the seed plus every generated character
    X = np.zeros((1, n_seed + n, len(char_vocab)))
    # one timestep per seed character (not several characters in one timestep)
    X[0, np.arange(n_seed), seed_indices] = 1

    sentence = char
    for step in range(n):
        filled = n_seed + step  # number of timesteps populated so far
        # verbose=0 avoids printing a progress bar for every generated character
        pred = model.predict(X[:, :filled, :], verbose=0)
        next_index = int(np.argmax(pred[0, -1]))  # greedy pick at the last timestep
        X[0, filled, next_index] = 1  # feed the prediction back as the next input
        sentence += index_to_char_pair[next_index]

    print(sentence)

# Seed the generator with the single character 'i' and let the trained model
# append 25 further characters; the resulting text is printed.
generate(model, 'i', 25)

ing the marge the marked t
