In [None]:
import os
from datasets import load_dataset

# Fail fast when no Hugging Face token is configured: the dataset loaded below is gated.
if not os.environ.get("HF_TOKEN"):
    # FYI export HF_TOKEN=$(pbpaste)
    raise ValueError("You need to set HF_TOKEN environment variable. Get one from huggingface.co")

# Load the shell/bash subset
#   FYI:   https://huggingface.co/datasets/bigcode/the-stack/tree/main/data/shell (FYI gated, must give email and approve)
full_dataset = load_dataset( "bigcode/the-stack", split="train", data_dir="data/shell")  # , lang=["bash"])
# data_dir data/shell has 11 files, not bad size (about 4GB IIEC)
# print(full_dataset.shape)
# print(f"  full: size(bytes) comma delimited: {full_dataset.size_in_bytes/1024/1024:,.0f} MB")

# First 35,000 records; not referenced again by the cells visible in this notebook section.
first_x = full_dataset.take(35000)
# print(first_x.shape)
# print(f"  first-{len(first_x)}: size(bytes) comma delimited: {first_x.data.nbytes/1024/1024:,.4f} MB")

# Save locally in diff format if needed, I am going to stick with parquet
# full_dataset.to_csv("shell_scripts.csv")
# print(full_dataset.column_names)
# Working set for everything below: the first 100 records of the full dataset.
subset = full_dataset.select(range(100))
print("shape", subset.shape)

# Remember the dataset's own columns so print_subset_new_columns() can skip them
# and show only the columns added later (tokens, sequences, ...).
original_columns = subset.column_names

In [None]:
import torch
# Prefer Apple's Metal (MPS) backend, but fall back to the CPU when it is not
# available so the later `.to(mps_device)` calls do not fail on other machines.
# The name `mps_device` is kept because later cells reference it.
mps_available = torch.backends.mps.is_available()
print(mps_available)  # True when the MPS backend can be used
mps_device = torch.device('mps' if mps_available else 'cpu')

In [None]:
# Records used to train the tokenizer, and the directory that holds every
# artifact (tokenizer files, logs, checkpoints) for this experiment.
corpus_dataset = subset
scenario_path = "tmp/one-record/full100"

# *** build tokenizer model ***
from tokenizers import ByteLevelBPETokenizer

tokenizer_path = f"{scenario_path}/tokenizer"
os.makedirs(tokenizer_path, exist_ok=True)

vocab_path = f"{tokenizer_path}/vocab.json"
vocab_size = 8000
merges_path = f"{tokenizer_path}/merges.txt"

# Train only when no saved vocabulary exists; otherwise the saved model is reused.
if not os.path.exists(vocab_path):
    print("building tokenizer... (did not find saved model)")
    # *** CORPUS ***
    # PRN would it be advantageous to use larger set of records for the corpus than I am going to train on?
    corpus_file = f"{scenario_path}/corpus"
    if not os.path.exists(corpus_file):
        print("building corpus... (did not find saved file)")
        # Explicit UTF-8: without it the platform default encoding is used, which can
        # fail on (or corrupt) scripts containing non-ASCII characters.
        with open(corpus_file, "w", encoding="utf-8") as f:
            for example in corpus_dataset["content"]:  # Adjust "content" to match your dataset key
                f.write(example + "\n")
    #13.5s for FULL dataset! only have to do this once!

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_file], vocab_size=vocab_size, min_frequency=2)  # PRN adjust vocab_size/min_frequency?
    tokenizer.save_model(tokenizer_path)

    os.remove(corpus_file)  # huge file, easy to rebuild, so just nuke it after training


def load_saved_tokenizer():
    """Load the saved BPE tokenizer and re-register the special tokens.

    Reads the model from `vocab_path` / `merges_path`, adds `<PAD>` and `<END>`,
    and verifies they received ids 8000 and 8001. As a side effect the global
    `vocab_size` is updated to the size of the loaded vocabulary.

    Returns:
        The loaded tokenizer with both special tokens added.

    Raises:
        ValueError: if either special token did not get its expected id.
    """
    global vocab_size
    loaded = ByteLevelBPETokenizer(vocab_path, merges_path)
    loaded.add_tokens(['<PAD>', "<END>"])

    # The special tokens are re-added on every load (the author notes they are not
    # saved with the model). A trained model depends on their ids, so any drift
    # must be an error rather than a silent change.
    expected_ids = (
        ('<PAD>', 8000, 'Padding token not added correctly'),
        ('<END>', 8001, 'End token not added correctly'),
    )
    for token, expected_id, message in expected_ids:
        if loaded.token_to_id(token) != expected_id:
            raise ValueError(message)

    vocab_size = len(loaded.get_vocab())  # keep the global in sync with the real size
    return loaded


# Load the tokenizer from disk and look up the ids of the two special tokens.
tokenizer = load_saved_tokenizer()
pad_token_id = tokenizer.token_to_id("<PAD>")
end_token_id = tokenizer.token_to_id("<END>")
# PRN add more special token as needed at end so I don't have to retrain tokenizer to add new special tokens (just obvi have to retrain the model or parts of it)

# tokenize (FYI .map is immutable, a new dataset is returned with the added column)
# Adds a "tokens" column holding the token ids of each record's "content".
print("before tokenize dataset size: ", subset.shape)
subset = subset.map(lambda x: {"tokens": tokenizer.encode(x["content"]).ids})
print("after tokenize dataset size: ", subset.shape)
print()
print(subset["tokens"][0])  # Example tokenized output
print()
#
# # VIEW SOME TOKENS:
# # show each of the first 100 tokens of the first record next to its decoded text:
# for i in range(100):
#     print(subset["tokens"][0][i], tokenizer.decode([subset["tokens"][0][i]]))


In [4]:
def create_training_pairs(tokens, seq_len):
    """Slide a window of `seq_len + 1` tokens over `tokens`, one position at a time.

    Args:
        tokens: sliceable sequence of token ids.
        seq_len: number of input tokens per example; each window holds one extra
            token so it can later be split into input and shifted target.

    Returns:
        A list of windows, each exactly `seq_len + 1` items long. The list is
        empty when `tokens` has `seq_len` items or fewer.
    """
    window_size = seq_len + 1
    windows = []
    for start in range(len(tokens) - seq_len):
        windows.append(tokens[start:start + window_size])
    return windows

# Number of input tokens per training example (each stored window holds seq_len + 1 tokens).
seq_len = 50
# Adds a "sequences" column: for every record, the list of overlapping token windows.
subset = subset.map(lambda x: {"sequences": create_training_pairs(x["tokens"], seq_len)})


In [None]:
# INSPECTING SUBSET with new columns:
def print_subset():
    """Print the shape of the global `subset`, then every column's values and length."""
    print(subset.shape)
    for column_name in subset.column_names:
        column = subset[column_name]
        print(f"{column_name}: {column}")
        print(f"  {len(column)}")
    print()


def print_subset_new_columns():
    """Print only the columns of `subset` that were added after loading.

    Columns listed in the global `original_columns` are skipped. For each
    remaining column the whole column is printed, then, when the first record is
    a list, that record and its length, and, when the record's first element is
    itself a list, that element and its length.

    Empty columns, non-list records and empty first records are handled without
    raising (previously the first element was indexed unconditionally).
    """
    for name in subset.column_names:
        if name in original_columns:
            continue
        column = subset[name]
        print(f"{name}: {column}")
        if len(column) == 0:
            # nothing to drill into for an empty dataset
            continue
        first_record = column[0]
        if not isinstance(first_record, list):
            # scalar column: indexing into it would raise
            continue
        print(f"  {first_record}")
        print(f"  {len(first_record)} records")
        if not first_record:
            # e.g. a record too short to yield any sequence
            continue
        # if list has lists:
        first_element_of_first_record = first_record[0]
        if isinstance(first_element_of_first_record, list):
            print(f"    {first_element_of_first_record}")
            print(f"    {len(first_element_of_first_record)} items")

    print()


# Dump every column of the current subset (original and derived).
print_subset()

print()
print("# SEQUENCES:")
print()
# Show the first three windows of the first record, as token ids and decoded text.
# Indexes [0][0..2], so the first record must contain at least three sequences.
for i in range(3):
    # each record (in original dataset) now has a list of sequences (so, sequences is not a scalar value like content)
    print("## sequence " + str(i))
    print(subset["sequences"][0][i])
    print(tokenizer.decode(subset["sequences"][0][i]))
    print()

In [6]:
# Separate input and target
def split_input_target(sequence):
    """Split one window into (input, target) for next-token prediction.

    The input is the window without its last item and the target is the window
    without its first item, so target[i] is the item that follows input[i].
    """
    model_input = sequence[:-1]
    shifted_target = sequence[1:]
    return model_input, shifted_target

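# wrong: x['sequences'] is a record's *list* of sequences, so splitting it directly drops the
# last/first whole sequence instead of shifting the tokens inside each sequence: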
# def mapper(x):
#     what = x['sequences']
#     split = split_input_target(what)
#     print(f"* {len(what)} -> {len(split[0])} {len(split[1])}")
#     return {"x_input": split[0], "y_target": split[1]}

def mapper(all):
    """Turn one record's token windows into parallel input/target lists.

    Args:
        all: a dataset record; only its "sequences" entry (a list of token
            windows) is read.

    Returns:
        A dict with "x_input" and "y_target", each holding one entry per window,
        produced by split_input_target.
    """
    x_input = []
    y_target = []
    for sequence in all["sequences"]:
        model_input, target = split_input_target(sequence)
        x_input.append(model_input)
        y_target.append(target)
    return {"x_input": x_input, "y_target": y_target}
subset = subset.map(mapper)


In [None]:
# print_subset_new_columns()
# Sanity check of the slicing used by split_input_target on a tiny list:
# a[:-1] drops the last item, a[1:] drops the first.
a = [1,2,3,4]
print(a[:-1])
print(a[1:])
print(split_input_target(a))

# Show the derived columns (everything not in original_columns).
print_subset_new_columns()

# Raw text of the first record, for comparison with the token columns above.
print(subset["content"][0])

In [8]:
import torch
import torch.nn as nn


class CodeCompletionTransformer(nn.Module):
    """Next-token predictor built on `nn.Transformer` with causal masking.

    The same embedded sequence is fed as both source and target of the
    transformer. Causal masks are applied to the encoder self-attention, the
    decoder self-attention and the decoder's attention over the encoder output.
    The logits at position t therefore depend only on tokens 0..t. Without these
    masks, position t can read token t+1, which is its own training target, so
    the loss can be driven towards zero by copying.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: passed as `nn.Transformer`'s third positional argument, i.e.
            the number of encoder layers; the decoder stack keeps
            `nn.Transformer`'s default depth. Left unchanged so the parameter
            layout stays the same.
        seq_len: maximum sequence length supported by the learned positional
            encoding.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # learned positional encoding, one vector per position, broadcast over the batch
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, length, vocab_size) for token ids `x` of shape (batch, length).

        `length` may be anything up to `seq_len`; longer inputs fail when the
        positional encoding is added.
        """
        length = x.size(1)
        # slice so inputs shorter than seq_len are accepted as well
        x = self.embedding(x) + self.positional_encoding[:, :length]
        # -inf above the diagonal: position i may only attend to positions <= i
        causal_mask = torch.triu(
            torch.full((length, length), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.transformer(
            x,
            x,
            src_mask=causal_mask,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.fc(x)


# mps_device = torch.device("mps")
# Build the model for the current vocabulary and window length, then move its
# parameters to the device selected earlier in the notebook.
model = CodeCompletionTransformer(vocab_size=vocab_size, d_model=256, nhead=4, num_layers=4, seq_len=seq_len)
model = model.to(mps_device)


In [9]:
# *** ALTERNATE (less complexity, should train faster and maybe be useful)
class AlternateCodeCompletionLSTM(nn.Module):
    """Simpler next-token model: embedding -> single LSTM -> linear projection to the vocabulary.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        embed_size: width of the token embeddings fed to the LSTM.
        hidden_size: width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits for every position of the token-id batch `x` (batch first)."""
        embedded = self.embedding(x)
        # keep the per-step outputs; the final (hidden, cell) state is not needed
        hidden_states = self.lstm(embedded)[0]
        return self.fc(hidden_states)


# Initialize
# model = AlternateCodeCompletionLSTM(vocab_size=vocab_size, embed_size=128, hidden_size=256)


In [None]:
import sys
import time
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import torch.nn.functional as F

# Prepare data
# inputs = subset["x_input"]
# targets = subset["y_target"]
#inputs = subset["x_input"][0]
#targets = subset["y_target"][0]
# Flatten the per-record lists of sequences into one tensor of sequences each,
# created directly on the training device.
inputs = torch.tensor([item for sublist in subset["x_input"] for item in sublist], device=mps_device)
targets = torch.tensor([item for sublist in subset["y_target"] for item in sublist], device=mps_device)
train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=32, shuffle=True)
batches_per_epoch = len(train_loader)

# Optimizer and loss
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

LOG_PROGRESS_TO_FILE = True
LOG_FILE = f"{scenario_path}/train.log"
if LOG_PROGRESS_TO_FILE:
    # append, dont ever overwrite logs (del by hand if needed)
    with open(LOG_FILE, "a") as f:
        start_message = f"\n\n## training starting: {time.asctime()}\n"
        f.write(start_message)

# generate_code() leaves the model in eval mode; switch back explicitly so that
# re-running this cell after generating text trains with dropout enabled again.
model.train()

start_time = time.perf_counter()
# Training loop
for epoch in range(100):  # Adjust epochs as needed
    total_batches = 0
    start_epoch_time = time.perf_counter()
    # distinct names so the full `inputs` / `targets` tensors are not overwritten
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # flatten to (batch * length, classes) vs (batch * length,); the class count is
        # taken from the model output rather than the mutable global vocab_size
        loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), batch_targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_batches += 1
        if total_batches % 100 == 0:
            sec_per_batch = (time.perf_counter() - start_epoch_time) / total_batches
            epoch_sec_remaining = (batches_per_epoch - total_batches) * sec_per_batch
            epoch_remain_time = time.strftime("%H:%M:%S", time.gmtime(epoch_sec_remaining))
            epoch_elapsed_time = time.strftime("%H:%M:%S", time.gmtime(time.perf_counter() - start_epoch_time))
            elapsed_seconds = time.perf_counter() - start_time  # Elapsed time
            elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
            # NOTE: computed mid-epoch, so this under-estimates until the epoch finishes
            avg_epoch_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds / (epoch + 1)))

            log = f"batch: {total_batches}/{batches_per_epoch}, " \
                f"epoch_elapsed_time: {epoch_elapsed_time}, " \
                f"epoch_remain_time: {epoch_remain_time}, " \
                f"overall_time: {elapsed_time}, " \
                f"epoch: {epoch}, avg_epoch_time: {avg_epoch_time}, " \
                f"loss: {loss.item():.4f}"
            # the carriage return rewrites the same console line; it is kept out of the log file
            sys.stdout.write("\r" + log)
            sys.stdout.flush()
            if LOG_PROGRESS_TO_FILE:
                with open(LOG_FILE, "a") as f:
                    f.write(log + "\n")
# 40 epochs on first record sequences only... got to 11/17% loss, IIAC because data set is small, its easy to overfit it?
# 50 epochs => 0.029 down to 0.005 loss... and it gives reasonable completions for the one example, well formatted shell script code
# 2m => 30 epochs (20 to 50)... => MPS => 20 epochs in 32s.... ~45s for 30 (big gains vs 2m CPU!)
# 10 records => IIGC 6 minutes for 20 epochs, right now epoch 6 is loss 0.416! 7 is 0.3491, 8 is .233, 9 is 0.2001, 10 is 0.186, 35 0.043, 36 0.009,37 0.359,38 0.261, 39 0.0261, 40 0.026522
#
# full corpus + 1 record => loss fell off rapidly but struggled to get as low as fast (0.10 ish stuck after 50ish epochs, down to 0.08 after 80 epochs IIRC)
#    full corpus definitely has smaller tokens than one/ten records, to be expected IIAC
#    full w/ 10 records => faster than 10 records as corpus... 9s per epoch this time (was 15ish before)... 10 0.288 loss, 11 0.20777, 19 0.0671, 20 0.11686!, 21 0.128, 22 0.041, 23 0.144, 24 0.074, 25 0.169, 26 0.016, 27 0.1099, 28 0.048, 29 0.0457, 32 0.0321, 40 0.0319 (probably about as good as can be expected for model and limited data)
#       * much better responses when starting in the middle of an if block (past start of if)... I get reasonable syntax after wards and goodish suggestions... things are working better in this case with more data and bigger corpus it seems
#
# 100 records test => estimate 3m20s-#1(epoch),6m43s-#2 per epoch
#    => 100 epochs == 5.6+ hours...
#       not even done by morning :)...
#    just wanna know does error do better than ~3 (epoch 1 is 4.38 so that is promising, 2 is 3.8738)
#       with my padded model I can't get error to drop below 3 after 100+ epochs too w/ 100 records
#    ok yeah after 5 it is still in the 3.5+ range... seems stuck there out of the gate...


In [12]:
# Persist only the weights (state_dict) of the current model under scenario_path.
# NOTE: the "?" characters are literal parts of the file name, presumably
# placeholders for the epoch count / loss to be filled in by hand.
torch.save(model.state_dict(), f"{scenario_path}/checkpoint003-100records-100?epochs-loss?.pth")

def load_model(checkpoint_name="checkpoint001.pth", d_model=128, nhead=4, num_layers=2):
    """Rebuild a CodeCompletionTransformer and load saved weights into it.

    The hyperparameters must match the ones the checkpoint was trained with,
    otherwise `load_state_dict` raises. The defaults reproduce the previously
    hard-coded configuration.

    Args:
        checkpoint_name: file name of the checkpoint inside `scenario_path`.
        d_model: model width used when the checkpoint was saved.
        nhead: number of attention heads used when the checkpoint was saved.
        num_layers: `num_layers` value used when the checkpoint was saved.

    Returns:
        The model with the loaded weights, on the CPU. Previously nothing was
        returned, so the loaded model was discarded. Move it with `.to(device)`
        as needed.
    """
    model = CodeCompletionTransformer(vocab_size=vocab_size, d_model=d_model, nhead=nhead, num_layers=num_layers, seq_len=seq_len)
    # map_location lets a checkpoint saved from MPS tensors load on any machine
    state_dict = torch.load(f"{scenario_path}/{checkpoint_name}", map_location="cpu")
    # Load the saved state_dict into the model
    model.load_state_dict(state_dict)
    #model.to(mps_device)
    return model

# load_model()

In [None]:
def generate_code(model, tokenizer, prompt, max_len=50):
    """Greedily extend `prompt` by up to `max_len` tokens.

    The prompt is tokenized and truncated to its last `max_len` tokens. The model
    is then called repeatedly on the most recent `max_len` tokens; the highest
    scoring token at the last position is appended each time. Generation stops
    early when the `<END>` token is produced (it is included in the output).

    Args:
        model: module mapping a (1, max_len) tensor of token ids to logits of
            shape (1, max_len, vocab). It is switched to eval mode and left there.
        tokenizer: object providing `encode(text).ids`, `decode(ids)` and
            `token_to_id(token)`.
        prompt: text to continue.
        max_len: both the context window fed to the model and the maximum number
            of tokens generated.

    Returns:
        Tuple `(context_text, generated_text)`: the decoded, truncated prompt
        that was used as context, and the decoded generated tokens.

    Raises:
        ValueError: if the prompt has fewer than `max_len` tokens. Left-padding
            with newline tokens was tried once by the author and "didn't work",
            so short prompts are rejected instead.
    """
    model.eval()
    tokens = tokenizer.encode(prompt).ids  # Convert prompt to tokens
    if len(tokens) < max_len:
        raise ValueError("prompt too short")

    print("tokens:", tokens)

    # Run on whichever device the model lives on instead of a notebook global,
    # so a model loaded onto the CPU works as well.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")
    end_token_id = tokenizer.token_to_id("<END>")  # looked up once, not per step

    generated_tokens = []
    tokens = tokens[-max_len:]  # truncate to max_len
    original_decoded = tokenizer.decode(tokens)
    # inference only: no autograd graph needs to be built
    with torch.no_grad():
        for _ in range(max_len):
            last_max_len_tokens = tokens[-max_len:]
            input_tensor = torch.tensor(last_max_len_tokens, device=device).unsqueeze(0)  # Add batch dimension
            output = model(input_tensor)  # Predict next token
            next_token = output[0, -1].argmax(-1).item()  # Get the highest probability token
            tokens.append(next_token)  # Add the next token
            generated_tokens.append(next_token)
            if next_token == end_token_id:  # Stop at <END> token
                break
    return original_decoded, tokenizer.decode(generated_tokens)

# *** ONE RECORD prompts with ONE RECORD AS CORPUS *** (note does not work with 10 records using 10 records as corpus b/c diff tokens..)
# TODO padding issue, NEED TO FIX in tokenizer? and? retrain? or? just pad bogus chars?
# print(generate_code(model, tokenizer, "#!/bin/bash\n"))
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\nfi\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\""
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	echo 1\n\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuckfaceretarted"  # too short
# wow in the case of me rambling swear words, it put more gibberish after it like random words too! cool... versus an `fi` to close out
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuck'\n"  # HERE b/c it was closed it started to add diff logic below.. wrong but interesting to generate what was complete logic blocks (partially correct just not across mutliple lines but who cares... it even got indentation and changed out what was happening in an if block!)
# test_prompt = "KUBECTL='docker exec hyperkube /hyperkube kubectl'\n\n#RUN_SKYDNS=\"yes\"\nRUN_SKYDNS=\"no\"\n# DNS_ARGUMENTS needs to be p"


# *** 10 records prompts (with 10 records as corpus) ***
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\nfi\n"
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\n"
# FYI not as logical as 1 record output... b/c its not memorizing most likely and not yet learning 

from colorama import Fore, Style

# Generate a continuation for the currently selected test_prompt and print the
# context that was used (green) immediately followed by the generated text (red).
original, generated = generate_code(model, tokenizer, test_prompt)
print("## all:")
print(f"{Fore.GREEN}{original}{Style.RESET_ALL}{Fore.RED}{generated}{Style.RESET_ALL}")
print()
# TODO strip new lines? or can I collapse down to one with a count?
