In [None]:
import os
from datasets import load_dataset
from pprint import pprint

# Fail fast when no Hugging Face token is configured (the dataset below is gated).
if not os.environ.get("HF_TOKEN"):
    # FYI on macOS: export HF_TOKEN=$(pbpaste)
    raise ValueError("You need to set HF_TOKEN environment variable. Get one from huggingface.co")

# Load the shell/bash subset
#   FYI: https://huggingface.co/datasets/bigcode/the-stack/tree/main/data/shell (gated: you must give an email and accept the terms)
full_dataset = load_dataset("bigcode/the-stack", split="train", data_dir="data/shell")  # , lang=["bash"])
# data_dir data/shell has 11 files, not a bad size (about 4GB IIRC)

# Save locally in a different format if needed; I am going to stick with parquet
# full_dataset.to_csv("shell_scripts.csv")
# print(full_dataset.column_names)
# Number of records taken from the front of the dataset; also part of the scenario directory name below.
num_records = 100
subset = full_dataset.take(num_records)
print("shape", subset.shape)
# NOTE: the label says "size(bytes)" but the value printed is nbytes / 1024 / 1024, i.e. MB.
print(f"  records({len(subset)}): size(bytes) comma delimited: {subset.data.nbytes/1024/1024:,.4f} MB")

# Column names before later cells add derived columns via subset.map(...).
original_columns = subset.column_names

scenario_path = f"tmp/04-more-{num_records}"
# Ensure the scenario dir exists, but don't wipe it if it already exists; later cells wipe/replace selected
# sub-paths only (i.e. model checkpoints are preserved).
os.makedirs(scenario_path, exist_ok=True)

In [2]:
# Write the tokenizer training corpus: the "content" column of every record, one after the other.
# Overwriting is intentional because the file's size changes with `num_records`
# (forgot to do that for the 10-record model runs, w00ps).
corpus_file = f"{scenario_path}/corpus"
# Explicit UTF-8 so the bytes on disk do not depend on the platform's default encoding
# (scripts may contain non-ASCII text, which a non-UTF-8 default would fail to encode).
with open(corpus_file, "w", encoding="utf-8") as f:
    for example in subset["content"]:  # Adjust "content" to match your dataset key
        f.write(example + "\n")

In [None]:
from tokenizers import ByteLevelBPETokenizer

import shutil
import subprocess

# FYI the saved tokenizer files are not reused across runs (yet); training and saving takes very
# little time, so the tokenizer is simply rebuilt every time.
# Recreate the tokenizer model directory from scratch.
tokenizer_path = f"{scenario_path}/tokenizer"
if os.path.exists(tokenizer_path):
    if shutil.which("trash"):
        # Keep the recoverable `trash` CLI when it is installed. Passing an argument list avoids shell
        # word-splitting of the path, and check=True surfaces a failed removal right here instead of
        # as a FileExistsError from os.makedirs below.
        subprocess.run(["trash", tokenizer_path], check=True)
    else:
        # No `trash` command on this machine: the directory only holds regenerated tokenizer files.
        shutil.rmtree(tokenizer_path)
os.makedirs(tokenizer_path)

# Train the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[corpus_file], vocab_size=8000, min_frequency=2)  # PRN adjust vocab_size/min_frequency?
tokenizer.save_model(tokenizer_path)

# Load the tokenizer back from the files that were just saved
tokenizer = ByteLevelBPETokenizer(tokenizer_path + "/vocab.json", tokenizer_path + "/merges.txt")

# Add a "tokens" column holding the token ids of each record's "content".
subset = subset.map(lambda x: {"tokens": tokenizer.encode(x["content"]).ids})


In [None]:
def create_training_pairs(tokens, seq_len):
    """Return every contiguous window of ``seq_len + 1`` tokens, sliding by one token.

    Args:
        tokens: Sliceable sequence of token ids.
        seq_len: Number of input tokens per window; each window holds one extra
            token so it can later be split into an input and a shifted target.

    Returns:
        A list with ``len(tokens) - seq_len`` windows (empty when ``tokens`` has
        ``seq_len`` items or fewer).
    """
    window_size = seq_len + 1
    sequences = []
    for start in range(len(tokens) - seq_len):
        sequences.append(tokens[start:start + window_size])
    return sequences


# Window length in tokens; the same value is passed to the model as `seq_len` when it is built.
seq_len = 50
# Adds a "sequences" column: for each record, every overlapping window of seq_len + 1 tokens
# (stride 1), so each window can later be split into an input and a one-token-shifted target.
subset = subset.map(lambda x: {"sequences": create_training_pairs(x["tokens"], seq_len)})

In [None]:
def split_input_target(sequence):
    """Split one window into ``(input, target)``, the target being the input shifted by one.

    The input drops the last element and the target drops the first, so both
    have ``len(sequence) - 1`` items.
    """
    model_input = sequence[:-1]
    shifted_target = sequence[1:]
    return model_input, shifted_target


def mapper(all):
    """Turn a record's "sequences" into parallel "x_input" / "y_target" lists.

    Args:
        all: One dataset record (mapping) with a "sequences" entry, a list of
            token windows. (The parameter name shadows the ``all`` builtin; it is
            kept as is for compatibility.)

    Returns:
        ``{"x_input": [...], "y_target": [...]}`` with one entry per window, in
        the same order as the windows.
    """
    x_input, y_target = [], []
    for sequence in all["sequences"]:
        model_input, shifted_target = split_input_target(sequence)
        x_input.append(model_input)
        y_target.append(shifted_target)
    return {"x_input": x_input, "y_target": y_target}


# Adds "x_input" / "y_target" columns: per record, the input windows and their one-token-shifted targets.
subset = subset.map(mapper)

In [6]:
import torch
import torch.nn as nn


class CodeCompletionTransformer(nn.Module):
    """Next-token model: token embedding + learned positions -> ``nn.Transformer`` -> vocabulary logits.

    Notes:
        * ``nn.Transformer`` is an encoder-decoder stack, not an encoder-only model. ``num_layers``
          is passed as its third positional argument, i.e. the number of *encoder* layers; the
          decoder keeps PyTorch's default depth. The module layout is deliberately left unchanged
          so the parameter names of existing checkpoints still match.
        * The same sequence is fed as source and target. Causal masks are applied to encoder
          self-attention, decoder self-attention and cross-attention so that position ``i`` never
          sees tokens after ``i``. Without them the model can read the very tokens it is trained
          to predict (the targets are the inputs shifted by one). Weights trained before the masks
          were added should be retrained.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of the output logits.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers (see note above).
        seq_len: Maximum sequence length the learned positional encoding supports.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional encoding, initialised to zeros, broadcast over the batch dimension.
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, length, vocab_size)`` for token ids ``x`` of shape ``(batch, length)``.

        Raises:
            ValueError: If ``length`` exceeds the ``seq_len`` the model was built with.
        """
        length = x.size(1)
        max_length = self.positional_encoding.size(1)
        if length > max_length:
            raise ValueError(f"input length {length} exceeds the model's seq_len {max_length}")

        # Slice the positions so inputs shorter than seq_len are accepted as well.
        x = self.embedding(x) + self.positional_encoding[:, :length]

        # Additive mask: -inf strictly above the diagonal blocks attention to later positions.
        causal_mask = torch.triu(
            torch.full((length, length), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.transformer(
            x,
            x,
            src_mask=causal_mask,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.fc(x)


# Initialize
vocab_size = 8000  # model embedding/output size; same value the tokenizer is trained with
# Prefer Apple's Metal (MPS) backend, but fall back to the CPU so the notebook still runs on machines
# without it. The name `mps_device` is kept because other cells reference it.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


def initialize_model() -> CodeCompletionTransformer:
    """Build a fresh, untrained ``CodeCompletionTransformer`` and move it to ``mps_device``.

    Uses the module-level ``vocab_size`` and ``seq_len`` together with fixed
    hyperparameters (d_model=128, nhead=4, num_layers=2).
    """
    hyperparameters = dict(vocab_size=vocab_size, d_model=128, nhead=4, num_layers=2, seq_len=seq_len)
    return CodeCompletionTransformer(**hyperparameters).to(mps_device)


# Module-level model instance used by the training, checkpoint and generation cells.
model = initialize_model()

In [8]:
# *** ALTERNATE (less complexity, should train faster and maybe be useful)
class AlternateCodeCompletionLSTM(nn.Module):
    """Simpler alternative model: token embedding -> single-layer LSTM -> vocabulary logits.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of the output logits.
        embed_size: Width of the token embeddings fed to the LSTM.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``(batch, length)`` to logits ``(batch, length, vocab_size)``."""
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; the final (h, c) state is discarded.
        hidden_states, _final_state = self.lstm(embedded)
        return self.fc(hidden_states)


# Initialize
# model = AlternateCodeCompletionLSTM(vocab_size=8000, embed_size=128, hidden_size=256)


In [None]:
from time import sleep
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn.functional as F


# DataLoader
class CodeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each input window with its target window.

    Args:
        inputs: Indexable collection of input windows (a tensor or nested lists).
        targets: Indexable collection of target windows, aligned with ``inputs``.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from ``inputs``."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input, target)`` for sample ``idx`` as tensors.

        ``torch.as_tensor`` hands back rows that already are tensors unchanged (no copy and no
        copy-construct ``UserWarning``, unlike ``torch.tensor``) and still converts plain lists.
        """
        return torch.as_tensor(self.inputs[idx]), torch.as_tensor(self.targets[idx])


# Flatten the per-record window lists into one tensor each and move them to the device before the
# training loop starts (should avoid the small overhead of calling .to(mps_device) per batch, even on
# a Mac with unified memory, IIUC).
inputs = torch.tensor([item for sublist in subset["x_input"] for item in sublist]).to(mps_device)
targets = torch.tensor([item for sublist in subset["y_target"] for item in sublist]).to(mps_device)
if len(inputs) == 0:
    # Happens when every record has seq_len tokens or fewer, so no window could be built.
    raise ValueError("no training sequences were produced; every record is too short for seq_len")

train_loader = DataLoader(CodeDataset(inputs, targets), batch_size=32, shuffle=True)

# Optimizer and loss
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
# generate_code() switches the model to eval mode; make sure dropout is active again when this
# cell is re-run after an evaluation.
model.train()
for epoch in range(1000):  # Adjust epochs as needed
    # Distinct batch names so the full `inputs` / `targets` tensors above are not overwritten.
    for batch_inputs, batch_targets in train_loader:
        # pause if pause file is present? or have a stop and resume mechanism
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # Flatten (batch, length, vocab) -> (batch * length, vocab); reshape also handles
        # non-contiguous tensors, which .view rejects.
        loss = loss_fn(outputs.reshape(-1, vocab_size), batch_targets.reshape(-1))
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch (not an epoch average).
    print(f"Iteration {epoch} loss: {loss.item()}")
    # sleep(30)  # with mps, CPU usage stays at ~115% (instead of 700%), so I am less worried about the fan now

# # wow, the loss on 10 records after only 10 epochs is already almost as low as on 1 record after 40 epochs...
# # 15m per 10 epochs with 10 records... so let's do 100 epochs while I sleep?

In [8]:
# Save the model weights (state_dict only) under the scenario's checkpoints directory.
checkpoint_dir = f"{scenario_path}/checkpoints/"
# os.path.join does not add a second separator after the trailing "/" of checkpoint_dir,
# so the path is ".../checkpoints/01-X.pth" rather than ".../checkpoints//01-X.pth".
checkpoint_path = os.path.join(checkpoint_dir, "01-X.pth")
os.makedirs(checkpoint_dir, exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)


def load_model():
    """Build a fresh model and load the weights saved at ``checkpoint_path`` into it.

    Returns:
        A new ``CodeCompletionTransformer`` (from ``initialize_model``) with the saved state_dict loaded.
    """
    new_model = initialize_model()
    # map_location puts the stored tensors on the device the new model lives on, so loading does
    # not depend on the device the checkpoint was written from.
    state_dict = torch.load(checkpoint_path, map_location=mps_device)
    new_model.load_state_dict(state_dict)
    return new_model


# model = load_model()

In [None]:
# *** EVALUATE


def generate_code(model, tokenizer, prompt, max_len=50):
    """Greedily generate up to ``max_len`` tokens that continue ``prompt``.

    At every step the model is fed the last ``max_len`` tokens (prompt plus what was generated so
    far) and the highest-scoring token at the final position is appended.

    Args:
        model: Module mapping a ``(1, max_len)`` tensor of token ids to ``(1, max_len, vocab)`` logits.
        tokenizer: Object providing ``encode(text).ids``, ``decode(ids)`` and ``token_to_id(token)``.
        prompt: Text to continue; must encode to at least ``max_len`` tokens.
        max_len: Size of the window fed to the model and maximum number of generated tokens.

    Returns:
        ``(decoded_prompt, decoded_generated)`` as two strings.

    Raises:
        ValueError: If the prompt encodes to fewer than ``max_len`` tokens.
    """
    tokens = tokenizer.encode(prompt).ids  # Convert prompt to tokens
    if len(tokens) < max_len:
        # Left-padding short prompts with newline tokens was tried once (on a minuscule dataset) and
        # did not "work", so short prompts are rejected instead of padded.
        raise ValueError("prompt too short")

    print("tokens:", tokens)
    generated_tokens = []
    original_tokens = tokens.copy()

    # Run on whatever device the model's parameters live on (default device if it has none).
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else None

    # Looked up once; a missing "<END>" token (None) simply never stops generation early.
    end_token_id = tokenizer.token_to_id("<END>")

    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for inference; this avoids building autograd graphs every step.
        with torch.no_grad():
            for _ in range(max_len):
                last_max_len_tokens = tokens[-max_len:]
                input_tensor = torch.tensor(last_max_len_tokens, device=device).unsqueeze(0)  # Add batch dimension
                output = model(input_tensor)  # Predict next token
                next_token = output[0, -1].argmax(-1).item()  # Get the highest scoring token
                tokens.append(next_token)  # Add the next token
                generated_tokens.append(next_token)
                if end_token_id is not None and next_token == end_token_id:  # Stop at <END> token
                    break
    finally:
        # Restore the caller's train/eval mode so a later training run is not silently done in eval mode.
        model.train(was_training)
    return tokenizer.decode(original_tokens), tokenizer.decode(generated_tokens)


# TODO padding issue for short prompts: fix it in the tokenizer and retrain? Or just pad with bogus chars?
# print(generate_code(model, tokenizer, "#!/bin/bash\n"))
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\nfi\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\""
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	echo 1\n\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuckfaceretarted" # too short
# wow in the case of me rambling swear words, it put more gibberish after it like random words too! cool... versus an `fi` to close out
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuck'\n"  # HERE b/c it was closed it started to add diff logic below.. wrong but interesting to generate what was complete logic blocks (partially correct just not across mutliple lines but who cares... it even got indentation and changed out what was happening in an if block!)
# test_prompt = "KUBECTL='docker exec hyperkube /hyperkube kubectl'\n\n#RUN_SKYDNS=\"yes\"\nRUN_SKYDNS=\"no\"\n# DNS_ARGUMENTS needs to be pa"
# if [ "${RUN_SKYDNS}" = "yes" ]; then
# 	DNS_ARGUMENTS="--cluster-dns=10.0.0.10 --cluster-domain=cluster.local"
# else
# 	DNS_ARGUMENTS=""
# fi
# "
from colorama import Fore, Style

# Generate a continuation for the selected prompt and show it: prompt in green, continuation in red.
original, generated = generate_code(model, tokenizer, test_prompt)
print(
    "## all:",
    f"{Fore.GREEN}{original}{Style.RESET_ALL}{Fore.RED}{generated}{Style.RESET_ALL}",
    "",  # trailing blank line
    sep="\n",
)
# TODO strip new lines? or can I collapse down to one with a count?
