In [52]:
import os
import re
import string
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import json
from random import shuffle
import tiktoken

import zipfile

print(f"TF version: {tf.__version__}")

TF version: 2.14.0


In [3]:
# def archive(dir: Path):
#     with zipfile.ZipFile(f"{dir}.zip", "w", zipfile.ZIP_STORED) as zip_file:
#         for entry in dir.rglob("*"):
#             zip_file.write(entry, entry.relative_to(dir))
# 
# 
# def unarchive(file: Path):
#     with zipfile.ZipFile(file, "r") as zip_file:
#         zip_file.extractall(file.with_suffix(""))

In [4]:
def download_file(url, dataset_file_path):
    """Download ``url`` to ``dataset_file_path`` unless that file already exists.

    The payload is streamed into a sibling ``<name>.part`` file and only moved
    onto the final path once the copy has completed. An interrupted download
    therefore never leaves a truncated file behind that a later run would
    mistake for a finished one.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        dataset_file_path: Destination path (``str`` or ``Path``). Missing
            parent directories are created.

    Raises:
        Any exception raised by ``urlopen`` or by the file operations; the
        partial file is removed before the exception propagates.
    """
    path = Path(dataset_file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        print(f"File {path} exists")
        return

    print(f"Downloading {path}")
    partial_path = path.with_name(path.name + ".part")
    try:
        with urlopen(url) as fsrc, open(partial_path, "wb") as fdst:
            copyfileobj(fsrc, fdst)
        # Publish the file under its final name only after a complete copy.
        os.replace(partial_path, path)
    except BaseException:
        # BaseException so that a kernel interrupt also cleans up the partial file.
        if partial_path.exists():
            partial_path.unlink()
        raise

In [113]:
# Where the downloaded CodeAlpaca JSON file is stored.
dataset_path = "tmp/text_generation.json"
# Batch size passed to model.fit.
batch_size = 1024
# Seed for every random number generator used in this notebook.
seed = 123
# Length of the (zero-padded) token prefix fed to the model.
max_seq_length = 100

# Apply the seed: this seeds Python's `random` (dataset shuffle), NumPy
# (token sampling during generation) and TensorFlow (weight initialisation,
# batch shuffling) so that a Restart & Run All reproduces the same results.
tf.keras.utils.set_random_seed(seed)

In [6]:
# Fetch the CodeAlpaca-20k JSON file into `dataset_path` (skipped when the file is already there).
download_file("https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k/raw/main/code_alpaca_20k.json", dataset_path)

File tmp\text_generation.json exists


In [7]:
# Decode explicitly as UTF-8: without `encoding=` Python uses the platform
# default, which is not UTF-8 on Windows and can corrupt or reject non-ASCII text.
with open(dataset_path, encoding="utf-8") as f:
    json_dataset = json.load(f)

In [8]:
json_dataset[0]

{'instruction': 'Create an array of length 5 which contains all even numbers between 1 and 10.',
 'input': '',
 'output': 'arr = [2, 4, 6, 8, 10]'}

In [9]:
# Flatten each record into a single "instruction / input / output" training
# string, then shuffle the strings in place.
dataset = []
for item in json_dataset:
    dataset.append(
        f"instruction: {item['instruction']}\ninput: {item['input']}\noutput: {item['output']}"
    )
shuffle(dataset)

In [53]:
# tiktoken encoding used to convert text to integer token ids and back.
tokenizer = tiktoken.get_encoding("cl100k_base")

In [58]:
# One list of token ids per training string.
sequences = [tokenizer.encode(s) for s in dataset]

In [61]:
# Longest tokenized example. make_samples only keeps the first max_seq_length + 1
# tokens of a sequence, so anything beyond that is dropped from training.
max(len(s) for s in sequences)

1145

In [100]:
def make_samples(lst, max_seq_length, pad_token=0):
    """Expand one token sequence into (padded prefix, next token) training pairs.

    Only the first ``max_seq_length + 1`` tokens of ``lst`` are used. For every
    position after the first token, the tokens before it form the prefix and
    the token at that position is the completion. Each prefix is right-padded
    with ``pad_token`` to exactly ``max_seq_length`` entries.

    Args:
        lst: Sequence of token ids (list, tuple, ...).
        max_seq_length: Length of every returned prefix.
        pad_token: Value used to fill the unused tail of a prefix. Defaults to
            0, which the model's Embedding layer treats as masked.

    Returns:
        A list of ``(prefix, completion)`` tuples, where ``prefix`` is a list of
        ``max_seq_length`` ids and ``completion`` is a single id. The list is
        empty when ``lst`` has fewer than two tokens.
    """
    window = list(lst[:max_seq_length + 1])
    samples = []
    # `end` is both the prefix length and the index of the token to predict;
    # the range is empty for windows shorter than two tokens.
    for end in range(1, len(window)):
        prefix = window[:end] + [pad_token] * (max_seq_length - end)
        samples.append((prefix, window[end]))
    return samples

In [101]:
# Sequence shorter than max_seq_length: every prefix is zero-padded.
make_samples([1, 2, 3, 4], 5)

[([1, 0, 0, 0, 0], 2), ([1, 2, 0, 0, 0], 3), ([1, 2, 3, 0, 0], 4)]

In [102]:
# Sequence of exactly max_seq_length tokens: the longest prefix still has one pad.
make_samples([1, 2, 3, 4, 5], 5)

[([1, 0, 0, 0, 0], 2),
 ([1, 2, 0, 0, 0], 3),
 ([1, 2, 3, 0, 0], 4),
 ([1, 2, 3, 4, 0], 5)]

In [103]:
# Sequence of max_seq_length + 1 tokens: the last prefix is completely filled.
make_samples([1, 2, 3, 4, 5, 6], 5)

[([1, 0, 0, 0, 0], 2),
 ([1, 2, 0, 0, 0], 3),
 ([1, 2, 3, 0, 0], 4),
 ([1, 2, 3, 4, 0], 5),
 ([1, 2, 3, 4, 5], 6)]

In [104]:
# Longer sequence: tokens past max_seq_length + 1 are ignored, so the result
# matches the previous call.
make_samples([1, 2, 3, 4, 5, 6, 7], 5)

[([1, 0, 0, 0, 0], 2),
 ([1, 2, 0, 0, 0], 3),
 ([1, 2, 3, 0, 0], 4),
 ([1, 2, 3, 4, 0], 5),
 ([1, 2, 3, 4, 5], 6)]

In [16]:
# samples = [make_samples(sequence, max_seq_length) for sequence in sequences]

In [105]:
# Collect the (prefix, completion) pairs of all sequences into one flat list,
# reporting progress every 1000 sequences.
samples = []
for index, token_ids in enumerate(sequences):
    samples.extend(make_samples(token_ids, max_seq_length))
    if index % 1000 == 0:
        print(f"seq {index} out of {len(sequences)}")

seq 0 out of 20022
seq 1000 out of 20022
seq 2000 out of 20022
seq 3000 out of 20022
seq 4000 out of 20022
seq 5000 out of 20022
seq 6000 out of 20022
seq 7000 out of 20022
seq 8000 out of 20022
seq 9000 out of 20022
seq 10000 out of 20022
seq 11000 out of 20022
seq 12000 out of 20022
seq 13000 out of 20022
seq 14000 out of 20022
seq 15000 out of 20022
seq 16000 out of 20022
seq 17000 out of 20022
seq 18000 out of 20022
seq 19000 out of 20022
seq 20000 out of 20022


In [115]:
# Embedding table size: tokenizer vocabulary plus two spare rows.
# NOTE(review): the "+ 2" was meant to reserve an OOV id and id 0 as a padding mask,
# but the token ids in this notebook are used unshifted and padding is also 0. Id 0 is
# therefore both a real token (the padded prompt decodes to "!" characters) and the
# value masked by the Embedding layer (mask_zero=True). Confirm whether the ids should
# be offset by one before training.
input_dim = tokenizer.n_vocab + 2

In [116]:
tokenizer.n_vocab

100277

In [123]:
# Keep the first 90% of the samples for training and the remainder for validation.
split_size = 0.9
split_index = int(len(samples) * split_size)
train_dataset, validation_dataset = samples[:split_index], samples[split_index:]
print(len(train_dataset), len(validation_dataset))

1519645 168850


In [125]:
# Separate each (prefix, completion) pair into model inputs and targets.
train_prefixes = [sample[0] for sample in train_dataset]
train_completions = [sample[1] for sample in train_dataset]
validation_prefixes = [sample[0] for sample in validation_dataset]
validation_completions = [sample[1] for sample in validation_dataset]

In [None]:
# Convert to NumPy arrays with an explicit 32-bit integer dtype. Token ids fit
# comfortably in int32; leaving the dtype implicit gives a platform-dependent
# result (int64 on most systems), which doubles the memory of the prefix matrix.
train_prefixes = np.array(train_prefixes, dtype=np.int32)
train_completions = np.array(train_completions, dtype=np.int32)
validation_prefixes = np.array(validation_prefixes, dtype=np.int32)
validation_completions = np.array(validation_completions, dtype=np.int32)

In [126]:
model = tf.keras.Sequential([
    # Token ids -> 64-dimensional vectors; id 0 is masked as padding.
    layers.Embedding(
        input_dim=input_dim,
        output_dim=64,
        mask_zero=True,
        input_length=max_seq_length,
    ),
    # layers.Bidirectional and extra LSTM+Dense layers don't improve training performance significantly for this dataset configuration
    layers.LSTM(units=256, return_sequences=True),
    layers.LSTM(units=128),
    layers.Dense(units=64, activation="relu"),
    # One probability per tokenizer id for the next token.
    layers.Dense(units=tokenizer.n_vocab, activation="softmax", name="output"),
])

In [127]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 16)           1604464   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                12544     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 output (Dense)              (None, 100277)            1704709   
                                                                 
Total params: 3322757 (12.68 MB)
Trainable params: 3322757 (12.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [128]:
# Targets are integer token ids, hence the sparse cross-entropy loss.
adamw_optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3)
model.compile(
    optimizer=adamw_optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

In [129]:
# Checkpointing is currently disabled; the configuration is kept for reference.
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=f"tmp/checkpoints",
#     save_weights_only=False,
#     monitor=f"val_acc",
#     mode="max",
#     save_best_only=True,
# )

# Lower the learning rate when validation accuracy stops improving.
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_acc", mode="max")

In [None]:
# Train the model on the prepared (prefix, completion) arrays.
# NOTE(review): the log recorded below this cell shows 25 epochs while `epochs` is 1
# here — confirm which value is intended before relying on the saved output.
epochs = 1
history = model.fit(
    x=train_prefixes,
    y=train_completions,
    batch_size=batch_size,
    epochs=epochs,
    # Each "epoch" is capped at 250 batches, i.e. not a full pass over the training samples.
    steps_per_epoch=250,
    callbacks=[reduce_lr_callback],
    validation_data=(validation_prefixes, validation_completions),
    # Validation is likewise limited to 20 batches per epoch.
    validation_steps=20
)

```
Epoch 1/25
250/250 [==============================] - 144s 543ms/step - loss: 7.0927 - acc: 0.0448 - val_loss: 6.5767 - val_acc: 0.0511 - lr: 0.0010
Epoch 2/25
250/250 [==============================] - 119s 475ms/step - loss: 6.3411 - acc: 0.0697 - val_loss: 6.0421 - val_acc: 0.1019 - lr: 0.0010
Epoch 3/25
250/250 [==============================] - 111s 445ms/step - loss: 5.8404 - acc: 0.1162 - val_loss: 5.6927 - val_acc: 0.1278 - lr: 0.0010
Epoch 4/25
250/250 [==============================] - 106s 425ms/step - loss: 5.5067 - acc: 0.1588 - val_loss: 5.3182 - val_acc: 0.1828 - lr: 0.0010
Epoch 5/25
250/250 [==============================] - 105s 419ms/step - loss: 5.0829 - acc: 0.2078 - val_loss: 4.9205 - val_acc: 0.2251 - lr: 0.0010
Epoch 6/25
250/250 [==============================] - 100s 400ms/step - loss: 4.7121 - acc: 0.2373 - val_loss: 4.6609 - val_acc: 0.2587 - lr: 0.0010
Epoch 7/25
250/250 [==============================] - 100s 401ms/step - loss: 4.4836 - acc: 0.2699 - val_loss: 4.4656 - val_acc: 0.2858 - lr: 0.0010
Epoch 8/25
250/250 [==============================] - 102s 409ms/step - loss: 4.2962 - acc: 0.2919 - val_loss: 4.2927 - val_acc: 0.3018 - lr: 0.0010
Epoch 9/25
250/250 [==============================] - 100s 402ms/step - loss: 4.1372 - acc: 0.3080 - val_loss: 4.1648 - val_acc: 0.3175 - lr: 0.0010
Epoch 10/25
250/250 [==============================] - 98s 393ms/step - loss: 3.9871 - acc: 0.3232 - val_loss: 4.0560 - val_acc: 0.3308 - lr: 0.0010
Epoch 11/25
250/250 [==============================] - 99s 397ms/step - loss: 3.8384 - acc: 0.3375 - val_loss: 3.9696 - val_acc: 0.3410 - lr: 0.0010
Epoch 12/25
250/250 [==============================] - 99s 397ms/step - loss: 3.7403 - acc: 0.3495 - val_loss: 3.8845 - val_acc: 0.3507 - lr: 0.0010
Epoch 13/25
250/250 [==============================] - 100s 399ms/step - loss: 3.6602 - acc: 0.3573 - val_loss: 3.8028 - val_acc: 0.3589 - lr: 0.0010
Epoch 14/25
250/250 [==============================] - 100s 401ms/step - loss: 3.5914 - acc: 0.3673 - val_loss: 3.7353 - val_acc: 0.3678 - lr: 0.0010
Epoch 15/25
250/250 [==============================] - 100s 400ms/step - loss: 3.4988 - acc: 0.3761 - val_loss: 3.6846 - val_acc: 0.3738 - lr: 0.0010
Epoch 16/25
250/250 [==============================] - 99s 398ms/step - loss: 3.3790 - acc: 0.3872 - val_loss: 3.6530 - val_acc: 0.3821 - lr: 0.0010
Epoch 17/25
250/250 [==============================] - 97s 386ms/step - loss: 3.3309 - acc: 0.3938 - val_loss: 3.5855 - val_acc: 0.3889 - lr: 0.0010
Epoch 18/25
250/250 [==============================] - 98s 394ms/step - loss: 3.3036 - acc: 0.3975 - val_loss: 3.5440 - val_acc: 0.3935 - lr: 0.0010
Epoch 19/25
250/250 [==============================] - 98s 394ms/step - loss: 3.2649 - acc: 0.4027 - val_loss: 3.5082 - val_acc: 0.3983 - lr: 0.0010
Epoch 20/25
250/250 [==============================] - 100s 401ms/step - loss: 3.1691 - acc: 0.4116 - val_loss: 3.4811 - val_acc: 0.4043 - lr: 0.0010
Epoch 21/25
250/250 [==============================] - 99s 397ms/step - loss: 3.1026 - acc: 0.4184 - val_loss: 3.4354 - val_acc: 0.4097 - lr: 0.0010
Epoch 22/25
250/250 [==============================] - 98s 390ms/step - loss: 3.0756 - acc: 0.4227 - val_loss: 3.4264 - val_acc: 0.4120 - lr: 0.0010
Epoch 23/25
250/250 [==============================] - 99s 396ms/step - loss: 3.0567 - acc: 0.4250 - val_loss: 3.3940 - val_acc: 0.4168 - lr: 0.0010
Epoch 24/25
250/250 [==============================] - 97s 389ms/step - loss: 3.0293 - acc: 0.4299 - val_loss: 3.3742 - val_acc: 0.4187 - lr: 0.0010
Epoch 25/25
250/250 [==============================] - 100s 400ms/step - loss: 2.9531 - acc: 0.4373 - val_loss: 3.3544 - val_acc: 0.4239 - lr: 0.0010
```

In [None]:
acc = history.history["acc"]
val_acc = history.history["val_acc"]

loss = history.history["loss"]
val_loss = history.history["val_loss"]

# Take the x-axis from the epochs actually recorded in the history rather than
# from the `epochs` variable, so the plot still works when that variable has
# been changed since training or training stopped early.
epochs_range = range(len(acc))

fig, (acc_ax, loss_ax) = plt.subplots(2, 1, figsize=(8, 8))

acc_ax.plot(epochs_range, acc, label="Training Accuracy")
acc_ax.plot(epochs_range, val_acc, label="Validation Accuracy")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend(loc="lower left")
acc_ax.set_title("Training and Validation Accuracy")

loss_ax.plot(epochs_range, loss, label="Training Loss")
loss_ax.plot(epochs_range, val_loss, label="Validation Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend(loc="lower left")
loss_ax.set_title("Training and Validation Loss")

# Keep the upper panel's x-label from colliding with the lower panel's title.
fig.tight_layout()
plt.show()

In [None]:
def pad(tokens, max_seq_length):
    """Return a copy of ``tokens`` right-padded with zeros to ``max_seq_length`` items.

    ``tokens`` must be a list no longer than ``max_seq_length``; a longer list
    trips the assertion.
    """
    missing = max_seq_length - len(tokens)
    assert missing >= 0
    return tokens + [0] * missing

In [None]:
tokenizer.decode(train_prefixes[100])

In [1]:
# The prompt is stored as `prompt`, not `input`, so the builtin `input()` is not
# shadowed for the rest of the kernel session.
# prompt = "instruction: Design an algorithm to print out the prime numbers between 1 and 100.\ninput: \noutput: "
prompt = "instruction: Create a JavaScript to switch between two div components.\ninput: \noutput: "
input_tokens = tokenizer.encode(prompt)
# Right-pad with zeros to the fixed length the model expects.
input_padded = pad(input_tokens, max_seq_length)
(len(input_tokens), len(input_padded))

NameError: name 'tokenizer' is not defined

In [None]:
print(tokenizer.decode(input_padded))
completion = input_padded.copy()
# Fill the padded tail one position at a time, feeding the growing sequence
# back into the model on every step.
for position in range(len(input_tokens), max_seq_length):
    probabilities = model(tf.expand_dims(completion, 0))[0]
    candidates = tf.math.top_k(probabilities, k=3)
    # Pick uniformly among the three most likely tokens; their probabilities
    # are not used as sampling weights.
    completion[position] = np.random.choice(candidates.indices)
print("Completion:")
print(tokenizer.decode(completion))

```
instruction: Create a JavaScript to switch between two div components.
input: 
output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Completion:
instruction: Create a JavaScript to switch between two div components.
input: 
output: 3.5 3, y. This function will create an example
let a <- int(num) { [1],
    [2,2],
    print(i) {
    if (str % i) {
      if (!input            return " + (a"2; i < 10.length; i--) { 
  
    for (let i=2;
  }
} else 
  }
```

In [None]:
# Persist the trained model in HDF5 format under tmp/.
model.save("tmp/model.h5", save_format="h5")