In [19]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, TimeDistributed
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

import urllib.request

# Data Preprocessing

In [3]:
# Training corpus: a short poem kept inline as a single triple-quoted string.
# The line breaks and blank lines inside it are collapsed to single spaces
# when the text is tokenized with `raw_text.split()` and re-joined.
raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [8]:
# Normalise whitespace: splitting on any whitespace run and re-joining with a
# single space flattens the multi-line poem into one line of text.
tokens = raw_text.split()
text = " ".join(tokens)

# Character-level vocabulary; sorting makes the index assignment deterministic.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)

# Each character maps to its position in the sorted vocabulary.
char_to_index_pair = {char: index for index, char in enumerate(char_vocab)}
print(char_to_index_pair)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [9]:
# Every window holds 11 characters: 10 of context plus the character to predict.
length = 11

# Slide the window over the text one character at a time.
# NOTE: start positions stop at len(text) - length - 1, so the very last
# window (the one ending on the final character of `text`) is not emitted.
sequences = [
    text[start:start + length]
    for start in range(len(text) - length)
]

print(len(sequences))
print(sequences[:10])

426
['I get on wi', ' get on wit', 'get on with', 'et on with ', 't on with l', ' on with li', 'on with lif', 'n with life', ' with life ', 'with life a']


In [10]:
# Replace every character of every window with its integer vocabulary index.
encoded_sequences = [
    [char_to_index_pair[symbol] for symbol in window]
    for window in sequences
]

print(encoded_sequences[:10])

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18], [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28], [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17], [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0], [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21], [0, 24, 23, 0, 31, 18, 28, 17, 0, 21, 18], [24, 23, 0, 31, 18, 28, 17, 0, 21, 18, 15], [23, 0, 31, 18, 28, 17, 0, 21, 18, 15, 14], [0, 31, 18, 28, 17, 0, 21, 18, 15, 14, 0], [31, 18, 28, 17, 0, 21, 18, 15, 14, 0, 10]]


In [62]:
# Stack the encoded windows into a 2-D integer matrix (one row per window).
encoded_sequences = np.array(encoded_sequences)

# All columns but the last are the model input; the last column is the
# next-character target.
X, y = encoded_sequences[:, :-1], encoded_sequences[:, -1]

# to_categorical one-hot encodes every entry of the 2-D index matrix in one
# call, giving the same (samples, timesteps, vocab) array as encoding each row
# separately and stacking the results.
X_one_hot = to_categorical(X, num_classes=vocab_size)
y_one_hot = to_categorical(y, num_classes=vocab_size)

print(X_one_hot.shape)
print(y_one_hot.shape)

(426, 10, 33)
(426, 33)


# Modeling

In [63]:
hidden_units = 64

# Each sample is a sequence of 10 timesteps (characters); every timestep is a
# one-hot vector over the 33-character vocabulary.
timesteps, num_features = X_one_hot.shape[1:]

model = Sequential([
    LSTM(units=hidden_units, input_shape=(timesteps, num_features)),
    # Softmax over the vocabulary: a score for each candidate next character.
    Dense(units=vocab_size, activation='softmax'),
])
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_one_hot, y_one_hot, epochs=100, verbose=2)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 64)                25088     
                                                                 
 dense_4 (Dense)             (None, 33)                2145      
                                                                 
Total params: 27233 (106.38 KB)
Trainable params: 27233 (106.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
14/14 - 1s - loss: 3.4718 - accuracy: 0.1268 - 631ms/epoch - 45ms/step
Epoch 2/100
14/14 - 0s - loss: 3.3526 - accuracy: 0.1972 - 34ms/epoch - 2ms/step
Epoch 3/100
14/14 - 0s - loss: 3.1425 - accuracy: 0.1972 - 35ms/epoch - 2ms/step
Epoch 4/100
14/14 - 0s - loss: 3.0254 - accuracy: 0.1972 - 35ms/epoch - 3ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9782 - accuracy: 0.1972 - 35ms/epoch - 2ms/step
Epoch

<keras.src.callbacks.History at 0x2cea46350>

In [68]:
def generate(model, char_to_index_pair, text, n, seq_length=10):
    """Greedily extend ``text`` by ``n`` characters using ``model``.

    At every step the last ``seq_length`` characters of the running sentence
    are one-hot encoded, passed to ``model.predict`` and the character whose
    index has the highest predicted score is appended.

    Args:
        model: Object exposing ``predict(batch, verbose=0)``. It is called with
            a float32 array of shape ``(1, seq_length, vocab_size)`` and its
            result is reduced with ``np.argmax(..., axis=1)``.
        char_to_index_pair: Mapping from character to integer index. The
            one-hot width is ``len(char_to_index_pair)``, so the indices are
            expected to be ``0 .. len - 1`` (as produced by ``enumerate``).
        text: Seed string; every character must be a key of the mapping.
        n: Number of characters to generate.
        seq_length: Length of the context window fed to the model.

    Returns:
        ``text`` followed by the ``n`` generated characters.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
        KeyError: If ``text`` contains a character that is not in the mapping,
            or the model predicts an index that has no character.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    # Derive the one-hot width from the mapping that was passed in instead of
    # relying on a notebook-level global.
    vocab_size = len(char_to_index_pair)

    # Invert the mapping once rather than scanning the dict on every step.
    index_to_char = {index: char for char, index in char_to_index_pair.items()}

    # Encode the whole seed so an unknown character anywhere in it is reported,
    # then keep only the trailing window: nothing older ever reaches the model.
    encoded = [char_to_index_pair[char] for char in text]
    window = encoded[-seq_length:]

    # A seed shorter than the window is left-padded with index 0, i.e. the
    # model sees the character mapped to 0 in the padded positions.
    window = [0] * (seq_length - len(window)) + window

    one_hot_rows = np.eye(vocab_size, dtype="float32")
    generated = []

    for _ in range(n):
        # Row lookup turns the index window into a (1, seq_length, vocab_size) batch.
        batch = one_hot_rows[window][np.newaxis, ...]
        scores = model.predict(batch, verbose=0)

        # The batch has a single row, so take its arg-max as a plain int.
        next_index = int(np.argmax(scores, axis=1)[0])
        generated.append(index_to_char[next_index])

        # Slide the window forward by one character.
        window = window[1:] + [next_index]

    return text + "".join(generated)

In [69]:
# Extend the prompt by 80 characters with the trained model and show the result.
generated_text = generate(model, char_to_index_pair, 'I get on w,', n=80)
print(generated_text)

I get on w,tht oop tyy aikkg  m ik  hut utsnssrraaiggt I wiin  oee laaa e, lM.  nen Iualtpp
