In [None]:
import os
from datasets import load_dataset

if not os.environ.get("HF_TOKEN"):
    # Fail fast before touching the dataset (it is gated, see the link below).
    # FYI, one way to set it from the macOS clipboard: export HF_TOKEN=$(pbpaste)
    raise ValueError("You need to set HF_TOKEN environment variable. Get one from huggingface.co")

# Load the shell/bash subset (train split, files under data/shell)
#   FYI:   https://huggingface.co/datasets/bigcode/the-stack/tree/main/data/shell (FYI gated, must give email and approve)
full_dataset = load_dataset("bigcode/the-stack", split="train", data_dir="data/shell")  # , lang=["bash"])
# The number printed is nbytes / 1024 / 1024, i.e. MiB, despite the "size(bytes)" wording in the label.
print(f"  records({len(full_dataset)}): size(bytes) comma delimited: {full_dataset.data.nbytes/1024/1024:,.4f} MB")
# data_dir data/shell has 11 files, not a bad size (about 4GB IIRC)

# Save locally in a different format if needed; I am going to stick with parquet
# full_dataset.to_csv("shell_scripts.csv")
# print(full_dataset.column_names)
# num_records: how many records are taken from the full dataset for this experiment.
num_records = 100
# seq_len: window length (in tokens) used when building training sequences and sizing the model.
seq_len = 50  # !!! TODO figure out timing for 1024 vs 256
subset = full_dataset.take(num_records)
print(f"subset: shape({subset.shape}) len({len(subset)}) type({type(subset)})")
print(f"  records({len(subset)}): size(bytes) comma delimited: {subset.data.nbytes/1024/1024:,.4f} MB")

# Every artifact of this run (corpus, tokenizer, checkpoints) lives under a directory named after its settings.
scenario_path = f"tmp/more/05-records{num_records}-seqlen{seq_len}"
# Ensure the scenario dir exists, but don't wipe it if it already does; only selective subsets get replaced
# (i.e. model checkpoints must be preserved).
os.makedirs(scenario_path, exist_ok=True)

In [4]:
# Write the tokenizer training corpus: the "content" of every record in `subset`, one after another.
# An existing corpus file is kept, never replaced. scenario_path already encodes num_records and
# seq_len, so each scenario gets its own file; delete it by hand to force a rewrite.
corpus_file = f"{scenario_path}/corpus"
if not os.path.exists(corpus_file):
    # Pin UTF-8 instead of relying on the platform default encoding: source files may contain
    # non-ASCII characters, which would raise UnicodeEncodeError under a narrower default.
    with open(corpus_file, "w", encoding="utf-8") as f:
        for example in subset["content"]:  # Adjust "content" to match your dataset key
            f.write(example + "\n")

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Train the tokenizer once per scenario and reuse the saved files afterwards. Re-creating it is
# avoided on purpose: a retrained tokenizer could map text to different ids than the ones a saved
# model checkpoint was trained on.
tokenizer_path = f"{scenario_path}/tokenizer"
os.makedirs(tokenizer_path, exist_ok=True)


def load_saved_tokenizer():
    """Return the tokenizer stored under tokenizer_path (reads vocab.json and merges.txt)."""
    return ByteLevelBPETokenizer(tokenizer_path + "/vocab.json", tokenizer_path + "/merges.txt")


if os.path.exists(tokenizer_path + "/vocab.json"):
    # Previously this raised, which aborted the cell before load_saved_tokenizer was even defined
    # and left the "tokens" column unmapped on every re-run.
    print(f"tokenizer_path {tokenizer_path} already exists, not recreating... reloading it")
else:
    # Train the tokenizer (takes very little time on this corpus) and persist it right away.
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_file], vocab_size=8000, min_frequency=2)  # PRN adjust vocab_size/min_frequency?
    tokenizer.save_model(tokenizer_path)

# Always go through the saved files so training and later reloads use the exact same tokenizer.
tokenizer = load_saved_tokenizer()

# Add a "tokens" column holding the token ids of each record's "content".
subset = subset.map(lambda x: {"tokens": tokenizer.encode(x["content"]).ids})


In [None]:
def create_training_pairs(tokens, seq_len):
    """Return every contiguous window of ``seq_len + 1`` tokens, advancing one token at a time.

    The extra token is deliberate: splitting a window into input (all but the last element) and
    target (all but the first) then yields two halves of exactly ``seq_len`` tokens each.
    A token list with ``seq_len`` or fewer elements produces an empty list.
    """
    window_size = seq_len + 1
    windows = []
    for start in range(len(tokens) - seq_len):
        windows.append(tokens[start:start + window_size])
    return windows


# Add a "sequences" column: for each record, the list of overlapping (seq_len + 1)-token windows
# built from its "tokens" column.
subset = subset.map(lambda x: {"sequences": create_training_pairs(x["tokens"], seq_len)})

In [None]:
def split_input_target(sequence):
    """Split one window into ``(input, target)``.

    The input is the window without its last element; the target is the window without its
    first element, i.e. the input shifted left by one position.
    """
    model_input = sequence[:-1]
    shifted_target = sequence[1:]
    return model_input, shifted_target


def mapper(all):
    """Turn a record's "sequences" into parallel "x_input" / "y_target" lists.

    Each window is split with split_input_target; the i-th entry of "x_input" and the i-th entry
    of "y_target" come from the i-th window. A record without windows yields two empty lists.
    """
    x_input = []
    y_target = []
    for window in all["sequences"]:
        source, shifted = split_input_target(window)
        x_input.append(source)
        y_target.append(shifted)
    return {"x_input": x_input, "y_target": y_target}


# Add "x_input" and "y_target" columns derived from each record's "sequences" column.
subset = subset.map(mapper)

In [8]:
# Copy the three derived columns out of the dataset into plain Python lists (one entry per record)
# so the next cell can inspect them.
x_input = list(subset["x_input"])
y_target = list(subset["y_target"])
sequences = list(subset["sequences"])

In [None]:
# Sanity check: outer sizes of the three columns first ...
print(f"sequences len: {len(sequences)}  type: {type(sequences)}")
print(f"x_input len: {len(x_input)}  type: {type(x_input)}")
print(f"y_target len: {len(y_target)}  type: {type(y_target)}")

# ... then the details of the first record that actually produced at least one window
# (records too short for a full window have an empty list and are passed over).
first_nonempty = next((idx for idx, windows in enumerate(sequences) if len(windows) != 0), None)
if first_nonempty is not None:
    i = first_nonempty
    print(f"sequences[{i}] len: {len(sequences[i])}  type: {type(sequences[i])}")
    print(f"  sequences[{i}][0] len: {len(sequences[i][0])} type: {type(sequences[i][0])}")
    print(f"x_input[{i}] len: {len(x_input[i])}  type: {type(x_input[i])}")
    print(f"  x_input[{i}][0] len: {len(x_input[i][0])} type: {type(x_input[i][0])}")
    print(f"y_target[{i}] len: {len(y_target[i])}  type: {type(y_target[i])}")
    print(f"  y_target[{i}][0] len: {len(y_target[i][0])} type: {type(y_target[i][0])}")

    # The first window itself next to its input/target halves, to eyeball the one-token shift.
    print(f"sequences[{i}][0]: {sequences[i][0]}")
    print(f"x_input[{i}][0]: {x_input[i][0]}")
    print(f"y_target[{i}][0]: {y_target[i][0]}")

In [10]:
import torch
import torch.nn as nn


class CodeCompletionTransformer(nn.Module):
    """Next-token model: token embedding + learned positions -> nn.Transformer -> vocabulary logits.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        num_layers: number of *encoder* layers. ``nn.Transformer`` is an encoder-decoder stack and
            only its third positional argument is supplied, so the decoder keeps the library's
            default depth. This is left as-is so parameter names and shapes (state_dict keys) do
            not change.
        seq_len: maximum sequence length, i.e. the size of the learned positional table.
        causal: when True (default), every attention path is restricted so that position ``t``
            only sees positions ``<= t``. Without this restriction the target (the input shifted
            by one) is visible to the model at every position but the last, so training mostly
            learns to copy. Pass False only to reproduce a checkpoint that was trained unmasked;
            such checkpoints should be retrained.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len, causal=True):
        super().__init__()
        self.seq_len = seq_len
        self.causal = causal
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)
        # Additive attention mask: 0 on/below the diagonal, -inf above it (future positions).
        # Registered as a non-persistent buffer so it follows .to(device) but is NOT part of the
        # state_dict, keeping previously saved checkpoints loadable.
        future_blocked = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
        self.register_buffer("causal_mask", future_blocked, persistent=False)

    def forward(self, x):
        """Map token ids of shape (batch, length) to logits of shape (batch, length, vocab_size).

        Raises:
            ValueError: if ``length`` exceeds the ``seq_len`` the model was built with.
        """
        length = x.size(1)
        if length > self.seq_len:
            raise ValueError(f"input length {length} exceeds model seq_len {self.seq_len}")
        # Slice the positional table so inputs shorter than seq_len are accepted as well.
        x = self.embedding(x) + self.positional_encoding[:, :length]
        mask = self.causal_mask[:length, :length] if self.causal else None
        # The same sequence feeds encoder and decoder; encoder self-attention, decoder
        # self-attention and decoder->encoder attention all need the mask to block look-ahead.
        x = self.transformer(x, x, src_mask=mask, tgt_mask=mask, memory_mask=mask)
        return self.fc(x)


# Initialize
vocab_size = 8000  # same value the tokenizer is trained with (vocab_size=8000 in the tokenizer cell)
# Prefer the Metal (MPS) backend and fall back to CPU where it is unavailable, so the notebook
# does not fail at the first .to(mps_device). The name is kept because other cells reference it.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


def initialize_model() -> CodeCompletionTransformer:
    """Build a freshly initialized CodeCompletionTransformer and place it on ``mps_device``.

    Uses the notebook-level ``vocab_size`` and ``seq_len`` together with the fixed
    hyper-parameters d_model=128, nhead=4, num_layers=2.
    """
    return CodeCompletionTransformer(
        vocab_size=vocab_size,
        d_model=128,
        nhead=4,
        num_layers=2,
        seq_len=seq_len,
    ).to(mps_device)


# Fresh, untrained model. The optimizer in the training cell is built from this instance's parameters.
model = initialize_model()

In [11]:
# *** ALTERNATE (less complexity, should train faster and maybe be useful)
class AlternateCodeCompletionLSTM(nn.Module):
    """Simpler alternative model: embedding -> single LSTM -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits for every position of ``x``; the LSTM's final state is discarded."""
        embedded = self.embedding(x)
        hidden_states = self.lstm(embedded)[0]
        logits = self.fc(hidden_states)
        return logits


# Initialize
# model = AlternateCodeCompletionLSTM(vocab_size=8000, embed_size=128, hidden_size=256)


In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
# build the train_loader

# The dataset columns are nested per record (record -> windows -> token ids); training wants one
# flat list of windows. The data is placed on the device by the device= argument of torch.tensor below.
def _flatten(nested_windows):
    """Concatenate the per-record window lists into a single list of windows."""
    return [window for record_windows in nested_windows for window in record_windows]


inputs = _flatten(subset["x_input"])
# print(f"inputs: {len(inputs)} {type(inputs)} - {inputs[0][0]} {type(inputs[0][0])}")
#  FYI a narrower integer dtype could be used for inputs/targets, but it shouldn't matter much,
#  so the default (int64) is left in place for now
targets = _flatten(subset["y_target"])

inputs_tensor = torch.tensor(inputs, device=mps_device)
targets_tensor = torch.tensor(targets, device=mps_device)

# Pair every input window with its target and serve them in shuffled batches of 32.
dataset = TensorDataset(inputs_tensor, targets_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

print(f"inputs tensor: {inputs_tensor.shape} {inputs_tensor.dtype}")
print(f"targets tensor: {targets_tensor.shape} {targets_tensor.dtype}")
print(f"train_loader: {len(train_loader)} batches")

In [None]:
import sys
import time
from torch.optim import Adam

# Optimizer and loss
# NOTE: the optimizer is bound to the parameters of whatever `model` refers to when this cell runs.
# A later cell rebinds `model` via load_model(); re-run this cell before training a reloaded model,
# otherwise the optimizer would keep updating the parameters of the previous instance.
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
# start_time is captured when this cell runs (not when train() is called); train() reports
# "overall_time" relative to it.
start_time = time.perf_counter()
num_epochs = 1000


def train():
    """Train the notebook-level ``model`` for ``num_epochs`` passes over ``train_loader``.

    Reads the globals ``model``, ``optimizer``, ``loss_fn``, ``train_loader``, ``vocab_size``,
    ``num_epochs`` and ``start_time``; the model is updated in place and nothing is returned.
    A progress line is rewritten in place every 10 batches and a summary line (mean loss,
    elapsed time, average time per epoch) is printed after every epoch.
    """
    # generate_code() leaves the model in eval mode; switch training-mode layers back on so a
    # train -> evaluate -> train sequence does not silently train in eval mode.
    model.train()

    for epoch in range(num_epochs):  # Adjust epochs as needed
        total_batches = 0
        # Accumulated as a detached tensor so no device->host sync is forced on every batch.
        running_loss = 0.0
        start_epoch_time = time.perf_counter()
        for inputs, targets in train_loader:
            # ! TODO consider profiling for optimizations? TorchScript Profiler, Autograd Profiler, torch.utils.bottleneck, torch.profiler ...?
            # pause if pause file is present? or have a stop and resume mechanism
            optimizer.zero_grad()
            outputs = model(inputs)
            # Flatten (batch, seq, vocab) logits and (batch, seq) targets for CrossEntropyLoss.
            loss = loss_fn(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()
            running_loss = running_loss + loss.detach()
            total_batches += 1
            if total_batches % 10 == 0:
                sec_per_batch = (time.perf_counter() - start_epoch_time) / total_batches
                epoch_sec_remaining = (len(train_loader) - total_batches) * sec_per_batch
                epoch_remain_time = time.strftime("%H:%M:%S", time.gmtime(epoch_sec_remaining))
                epoch_elapsed_time = time.strftime("%H:%M:%S", time.gmtime(time.perf_counter() - start_epoch_time))
                overall_time = time.strftime("%H:%M:%S", time.gmtime(time.perf_counter() - start_time))
                # "\r" rewrites the same console line instead of flooding the cell output.
                sys.stdout.write(f"\repoch: {epoch+1}, "
                                 f"batch: {total_batches}/{len(train_loader)}, "
                                 f"epoch_elapsed_time: {epoch_elapsed_time}, "
                                 f"epoch_remain_time: {epoch_remain_time}, "
                                 f"overall_time: {overall_time}, "
                                 f"loss: {loss.item():.4f}")
                sys.stdout.flush()
            # 1024 seq => 22m/epoch (<2s/batch * 849batches/epoch)
            # 512 seq => 11m19s/epoch
            # 256 seq => 6m/epoch (1577 batches/epoch)
            # 50 seq => 3m/epoch

        # Per-epoch summary. These values used to be computed and then dropped; printing them
        # (with the mean loss) leaves a loss-over-epochs record in the cell output.
        # FYI 1-3ms per call to print progress (depending on complexity of calculations)
        elapsed_seconds = time.perf_counter() - start_time  # Elapsed time
        elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
        avg_epoch_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds / (epoch + 1)))
        avg_loss = float(running_loss) / total_batches if total_batches else float("nan")
        # Leading "\n" moves off the in-place progress line so the summary is not overwritten.
        sys.stdout.write(f"\nepoch {epoch+1}/{num_epochs} done, "
                         f"avg_loss: {avg_loss:.4f}, "
                         f"elapsed_time: {elapsed_time}, "
                         f"avg_epoch_time: {avg_epoch_time}\n")
        sys.stdout.flush()
#! TODO let this one run for a while and record the loss curve over time, so I have it as a reference... I recall getting a really low loss quickly on 1 record, but I can't recall what it was for 100


# Start training: runs num_epochs epochs over train_loader, updating `model` in place.
train()

In [None]:
# save model:
# Build the paths with os.path.join; plain string concatenation produced ".../checkpoints//01-...".
checkpoint_dir = os.path.join(scenario_path, "checkpoints")
checkpoint_path = os.path.join(checkpoint_dir, "01-epoch11-2h11m.pt")
# Uncomment to write the current weights to checkpoint_path:
# os.makedirs(checkpoint_dir, exist_ok=True)
# torch.save(model.state_dict(), checkpoint_path)


def load_model():
    """Restore the tokenizer and the model weights from disk.

    Returns:
        tuple: ``(tokenizer, model)`` where the tokenizer comes from ``load_saved_tokenizer()``
        and the model is a new ``initialize_model()`` instance with the state_dict stored at
        ``checkpoint_path`` loaded into it.
    """
    # Always RELOAD the saved tokenizer, never re-create it: the checkpoint was trained on its ids.
    new_tokenizer = load_saved_tokenizer()
    new_model = initialize_model()
    # map_location: put the tensors on the device the model lives on, regardless of the device
    #   they were saved from.
    # weights_only: the checkpoint is a plain state_dict, so refuse to unpickle arbitrary objects.
    state_dict = torch.load(checkpoint_path, map_location=mps_device, weights_only=True)
    # Load the saved state_dict into the model
    new_model.load_state_dict(state_dict)
    return new_tokenizer, new_model


# Replace the in-memory tokenizer and model with the ones restored from disk. The file at
# checkpoint_path must already exist: the torch.save call above is commented out.
tokenizer, model = load_model()

In [None]:
# *** EVALUATE


def generate_code(model, tokenizer, prompt, max_len=seq_len):
    """Greedily extend ``prompt`` by up to ``max_len`` tokens.

    Args:
        model: module mapping a (1, max_len) tensor of token ids to per-position logits; it is
            switched to eval mode and stays in eval mode afterwards.
        tokenizer: object providing ``encode(text).ids``, ``decode(ids)`` and ``token_to_id(token)``.
        prompt: text to continue. Only its last ``max_len`` tokens are fed to the model.
        max_len: context window length and maximum number of generated tokens.

    Returns:
        tuple: ``(context_text, generated_text)`` - the decoded (possibly truncated) prompt tokens
        and the decoded newly generated tokens.

    Raises:
        ValueError: if the prompt encodes to fewer than ``max_len`` tokens.
    """
    model.eval()
    tokens = tokenizer.encode(prompt).ids  # Convert prompt to tokens
    if len(tokens) < max_len:
        # Left-padding the prompt with encoded newlines was tried once (on a minuscule dataset)
        # and did not work, so short prompts are rejected instead of padded.
        raise ValueError("prompt too short")

    print("tokens:", tokens)
    tokens = tokens[-max_len:]  # keep only the most recent max_len tokens as context
    original_tokens = tokens.copy()
    generated_tokens = []
    # Looked up once instead of on every step. NOTE(review): the tokenizer cell passes no special
    # tokens to train(), so this is presumably None and the early stop never fires - confirm.
    end_token_id = tokenizer.token_to_id("<END>")
    # Feed inputs on whatever device the model's weights are on.
    device = next(model.parameters()).device
    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(max_len):
            last_max_len_tokens = tokens[-max_len:]
            input_tensor = torch.tensor(last_max_len_tokens, device=device).unsqueeze(0)  # Add batch dimension
            output = model(input_tensor)  # Predict next token
            next_token = output[0, -1].argmax(-1).item()  # Get the highest probability token
            tokens.append(next_token)  # Add the next token
            generated_tokens.append(next_token)
            if next_token == end_token_id:  # Stop at <END> token
                break
    return tokenizer.decode(original_tokens), tokenizer.decode(generated_tokens)


# TODO padding issue, NEED TO FIX in tokenizer? and? retrain? or? just pad bogus chars?
# print(generate_code(model, tokenizer, "#!/bin/bash\n"))
# NOTE: test_prompt is assigned three times in a row; only the last assignment is the one passed
# to generate_code below. The earlier ones are kept as alternatives to switch to by hand.
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\n\n"
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\""
# NOTE(review): bash spells this keyword `elif`; the `elseif` in the two prompts below may be unintentional - confirm.
test_prompt = "if [ \"${IS_MACOS}\" = \"yes\" ]; then\n  echo \"this is a mac, details: $(uname)\"\nelseif [ \"${IS_LINUX}\" = \"yes\" ]; then\n  echo \"this is linux\"\nfi"

# This does very well with seq_len=50 (scenario 04)...
test_prompt = "if [ \"${IS_MACOS}\" = \"yes\" ]; then\n  echo \"mac\"\nelseif [ \"${IS_LINUX}\" = \"yes\" ]; then\n  echo \"linux\"\nelse\n  "
# I am excited to figure out how to pad tokens in eval (see chatgpt thread) s/b to basically ignore the pad tokens in the model and add that after training should work fine

# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuckfaceretarted" # too short
# wow in the case of me rambling swear words, it put more gibberish after it like random words too! cool... versus an `fi` to close out
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuck'\n"  # HERE b/c it was closed it started to add diff logic below.. wrong but interesting to generate what was complete logic blocks (partially correct just not across mutliple lines but who cares... it even got indentation and changed out what was happening in an if block!)
# test_prompt = "KUBECTL='docker exec hyperkube /hyperkube kubectl'\n\n#RUN_SKYDNS=\"yes\"\nRUN_SKYDNS=\"no\"\n# DNS_ARGUMENTS needs to be pa"
# if [ "${RUN_SKYDNS}" = "yes" ]; then
# 	DNS_ARGUMENTS="--cluster-dns=10.0.0.10 --cluster-domain=cluster.local"
# else
# 	DNS_ARGUMENTS=""
# fi
# "
from colorama import Fore, Style

# Run greedy generation on the selected prompt and show the result as one continuous text:
# the prompt context in green, immediately followed by the generated continuation in red.
original, generated = generate_code(model, tokenizer, test_prompt)
print("## all:")
print(f"{Fore.GREEN}{original}{Style.RESET_ALL}{Fore.RED}{generated}{Style.RESET_ALL}")
print()
# TODO strip new lines? or can I collapse down to one with a count?
