In [None]:
import tensorflow as tf
import numpy as np
import os
import pickle

# --- configuration --------------------------------------------------------
# number of characters the model sees before predicting the next one
SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 200
BATCH_SIZE = 128
FILE_PATH = "data/python_code.py"
# prefix for every artifact written by this notebook; the "-lower" suffix
# reflects the lower-casing applied to the corpus below
BASENAME = os.path.basename(FILE_PATH) + "-lower"

# --- load the corpus --------------------------------------------------------
# read inside a context manager so the file handle is closed deterministically
# instead of being left open for the lifetime of the kernel
with open(FILE_PATH) as corpus_file:
    text = corpus_file.read()
# comment this if you want to use uppercase letters
text = text.lower()
n_chars = len(text)
# every distinct character, sorted so the char <-> id mapping is reproducible
vocab = ''.join(sorted(set(text)))
print("vocab:", vocab)
n_unique_chars = len(vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

In [None]:
# dictionary that converts characters to integers
char2int = {c: i for i, c in enumerate(vocab)}
# dictionary that converts integers to characters
int2char = {i: c for i, c in enumerate(vocab)}

# save these dictionaries for later generation; each file is written inside a
# context manager so it is flushed and closed before the cell finishes
with open(f"{BASENAME}-char2int.pickle", "wb") as char2int_file:
    pickle.dump(char2int, char2int_file)
with open(f"{BASENAME}-int2char.pickle", "wb") as int2char_file:
    pickle.dump(int2char, int2char_file)

In [None]:
# translate the whole corpus into its integer ids, one id per character
encoded_text = np.array(list(map(char2int.__getitem__, text)))

In [None]:
# one dataset element per encoded character
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
# sanity check: show the first five character ids
first_ids = char_dataset.take(5)
for char_id in first_ids:
    print(char_id.numpy())

In [None]:
# decode two randomly picked chunks of SEQUENCE_LENGTH+1 characters back to text
sample_chunks = char_dataset.batch(SEQUENCE_LENGTH+1).shuffle(1024).take(2)
for chunk in sample_chunks:
    decoded_chunk = ''.join(int2char[char_id] for char_id in chunk.numpy())
    print(decoded_chunk)

In [None]:
# slide a window of SEQUENCE_LENGTH+1 ids over the corpus one character at a time
windows = char_dataset.window(SEQUENCE_LENGTH+1, shift=1, drop_remainder=True)
# each window is itself a dataset; batching it turns it into a single tensor
sequences = windows.flat_map(lambda window: window.batch(SEQUENCE_LENGTH+1))
# all ids but the last are the input, the last id is the character to predict
dataset = sequences.map(lambda x: (x[:-1], x[-1]))
for input_, target in dataset.take(10):
    input_ids = input_.numpy()
    target_id = target.numpy()
    print(input_ids.shape)
    print(target_id.shape)
    print(''.join(int2char[c] for c in input_ids), int2char[target_id])
    print("="*50)

In [None]:
# cut the character stream into non-overlapping chunks of 2*SEQUENCE_LENGTH+1
# ids; drop_remainder=True discards a shorter trailing chunk
sequences2 = char_dataset.batch(2*SEQUENCE_LENGTH+1, drop_remainder=True)

def split_sample(sample):
    """Expand one chunk of character ids into all (input, target) pairs it holds.

    Every pair consists of SEQUENCE_LENGTH consecutive ids (the input) and the
    id that immediately follows them (the target).

    Args:
        sample: tensor of character ids; ``len(sample)`` is used to size a
            Python ``range``, so its length has to be available when this
            function runs.

    Returns:
        A ``tf.data.Dataset`` of ``(input_, target)`` tuples ordered by
        increasing start offset within the chunk.
    """
    # a chunk of n ids contains n - SEQUENCE_LENGTH complete windows (start
    # offsets 0 .. n - SEQUENCE_LENGTH - 1); the previous upper bound stopped
    # one offset early, so the chunk's last id was never used as a target
    n_windows = len(sample) - SEQUENCE_LENGTH
    ds = tf.data.Dataset.from_tensors((sample[:SEQUENCE_LENGTH], sample[SEQUENCE_LENGTH]))
    for i in range(1, n_windows):
        input_ = sample[i:i+SEQUENCE_LENGTH]
        target = sample[i+SEQUENCE_LENGTH]
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds


# flatten the per-chunk datasets into one stream of (input, target) samples
dataset2 = sequences2.flat_map(split_sample)
# show the first ten samples: tensor shapes, then the decoded input and target
for sample_input, sample_target in dataset2.take(10):
    print(sample_input.shape, sample_target.shape)
    decoded_input = ''.join(int2char[c] for c in sample_input.numpy())
    print(decoded_input, int2char[sample_target.numpy()])

In [None]:
# element-wise comparison of the inputs of the first five samples of each pipeline
sample_pairs = zip(dataset.take(5), dataset2.take(5))
for windowed_sample, chunked_sample in sample_pairs:
    windowed_input = windowed_sample[0].numpy()
    chunked_input = chunked_sample[0].numpy()
    print(windowed_input == chunked_input)
    

In [None]:
def one_hot_samples(input_, target):
    """One-hot encode both halves of an (input, target) sample.

    Args:
        input_: tensor of character ids forming the input sequence.
        target: tensor holding the id of the character to predict.

    Returns:
        Tuple ``(encoded_input, encoded_target)``; both are one-hot encoded
        with a depth of ``len(vocab)``.
    """
    depth = len(vocab)
    encoded_input = tf.one_hot(input_, depth)
    encoded_target = tf.one_hot(target, depth)
    return encoded_input, encoded_target

# NOTE: both names are rebound to their own encoded version, so running this
# cell a second time would one-hot encode data that is already one-hot encoded
dataset = dataset.map(map_func=one_hot_samples)
dataset2 = dataset2.map(map_func=one_hot_samples)
# confirm the shapes of the encoded input and target
for encoded_input, encoded_target in dataset.take(10):
    print(encoded_input.shape, encoded_target.shape)

In [None]:
# Input pipelines used for training.
# Order matters:
#   * cache() comes first so the encoded samples are computed once and the
#     shuffle that follows is re-drawn on every pass. Caching *after*
#     shuffle/batch would freeze the first shuffle order and replay identical
#     batches in every epoch.
#   * repeat() makes the dataset infinite; the number of batches per epoch is
#     bounded by steps_per_epoch in model.fit().
#   * prefetch() goes last so the next batch is prepared while the current
#     one is being consumed.
ds = dataset.cache().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).repeat().prefetch(1)
ds2 = dataset2.cache().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).repeat().prefetch(1)

In [None]:
def create_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build a two-layer LSTM that predicts the next character.

    The network consumes one-hot encoded sequences of shape
    ``(SEQUENCE_LENGTH, vocab_size)`` and outputs a softmax distribution over
    the ``vocab_size`` possible next characters.

    Args:
        vocab_size: number of distinct characters; used both as the one-hot
            depth of the input and as the size of the output layer.
        embedding_dim: currently unused (the Embedding layer is disabled);
            kept so existing callers keep working.
        rnn_units: number of units in each LSTM layer.
        batch_size: currently unused; kept so existing callers keep working.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    # model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(SEQUENCE_LENGTH,)))
    # the input depth comes from the vocab_size argument rather than the global
    # vocab, so input and output sizes cannot disagree
    model.add(tf.keras.layers.LSTM(rnn_units, input_shape=(SEQUENCE_LENGTH, vocab_size), return_sequences=True))
    model.add(tf.keras.layers.Dropout(0.3))
    # second LSTM returns only its final state, which feeds the classifier
    model.add(tf.keras.layers.LSTM(rnn_units))
    model.add(tf.keras.layers.Dropout(0.3))
    model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))
    return model

In [None]:
# build the network, print its layer summary and configure training
model = create_model(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    rnn_units=128,
    batch_size=BATCH_SIZE,
)
model.summary()
# targets are one-hot vectors, hence categorical (not sparse) cross-entropy
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
EPOCHS = 5
# ds2 repeats forever, so the length of an epoch is set explicitly here.
# NOTE(review): this count is derived from the number of shift-1 windows in the
# corpus, while training runs on ds2 (the chunk-based pipeline) - confirm that
# this epoch length is intended.
steps_per_epoch = (len(encoded_text) - SEQUENCE_LENGTH) // BATCH_SIZE
history = model.fit(ds2, epochs=EPOCHS, steps_per_epoch=steps_per_epoch)

In [None]:
# save the model
model_path = f"results/{BASENAME}-{SEQUENCE_LENGTH}-NOEMBEDDING-moredata.h5"
# create the output directory up front so saving does not depend on the
# saver creating it; exist_ok keeps re-runs of this cell harmless
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
# to restore previously trained weights instead of training again:
# model.load_weights(model_path)

In [None]:
seed = """You can be a""".lower()
s = seed
# number of characters to generate
n_generate = 200
# the model input holds at most SEQUENCE_LENGTH characters; keep only the most
# recent ones so an over-long seed cannot produce negative (wrapping) indices
seed = seed[-SEQUENCE_LENGTH:]
generated = ""
for _step in range(n_generate):
    # make the input sequence: one-hot rows, right-aligned, zero rows in front
    # while fewer than SEQUENCE_LENGTH characters of context are available
    X = np.zeros((1, SEQUENCE_LENGTH, len(vocab)))
    offset = SEQUENCE_LENGTH - len(seed)
    for t, char in enumerate(seed):
        X[0, offset + t, char2int[char]] = 1
    # predict the distribution over the next character
    predicted = model.predict(X, verbose=0)[0]
    # greedy decoding: take the most probable character id
    next_index = np.argmax(predicted)
    # converting the integer to a character
    next_char = int2char[next_index]
    # add the character to results
    generated += next_char
    # append the prediction and let the context grow up to SEQUENCE_LENGTH
    # characters; previously the context stayed at the seed's length, so most
    # of the input remained zero padding for the whole generation
    seed = (seed + next_char)[-SEQUENCE_LENGTH:]

print("Generated text:")
print(s + generated)

In [None]:
# display the character -> integer id mapping as the cell's output
char2int
