In [None]:
import os
from datasets import load_dataset
from pprint import pprint

if not os.environ.get("HF_TOKEN"):
    # FYI export HF_TOKEN=$(pbpaste)
    raise ValueError("You need to set HF_TOKEN environment variable. Get one from huggingface.co")

# Load the shell/bash subset
#   FYI:   https://huggingface.co/datasets/bigcode/the-stack/tree/main/data/shell (FYI gated, must give email and approve)
full_dataset = load_dataset( "bigcode/the-stack", split="train", data_dir="data/shell")  # , lang=["bash"])
# data_dir data/shell has 11 files, not bad size (about 4GB IIEC)
# print(full_dataset.shape)
# print(f"  full: size(bytes) comma delimited: {full_dataset.size_in_bytes/1024/1024:,.0f} MB")

first_x = full_dataset.take(35000)
# print(first_x.shape)
# print(f"  first-{len(first_x)}: size(bytes) comma delimited: {first_x.data.nbytes/1024/1024:,.4f} MB")

# Save locally in diff format if needed, I am going to stick with parquet
# full_dataset.to_csv("shell_scripts.csv")
# print(full_dataset.column_names)
subset = full_dataset.select(range(1))
print("shape", subset.shape)

original_columns = subset.column_names

In [None]:
import torch
print(torch.backends.mps.is_available())  # Should return True


In [4]:
# build corpus
scenario_path = "tmp/one-record/original"
if not os.path.exists(scenario_path):
    os.makedirs(scenario_path, exist_ok=True)

corpus_file = f"{scenario_path}/corpus"
if not os.path.exists(corpus_file):
    with open(corpus_file, "w") as f:
        for example in subset["content"]:  # Adjust "content" to match your dataset key
            f.write(example + "\n")

In [None]:
from tokenizers import ByteLevelBPETokenizer

tokenizer_path = f"{scenario_path}/tokenizer"
if not os.path.exists(tokenizer_path):
    os.makedirs(tokenizer_path)

# Train the tokenizer
if not os.path.exists(tokenizer_path + "/vocab.json"):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_file], vocab_size=8000, min_frequency=2)  # PRN adjust vocab_size/min_frequency?
    tokenizer.save_model(tokenizer_path)

# load the tokenizer
tokenizer = ByteLevelBPETokenizer(tokenizer_path + "/vocab.json", tokenizer_path + "/merges.txt")

# tokenize (FYI .map is immutable, a new dataset is returned with the added column)
print("before tokenize dataset size: ", subset.shape)
subset = subset.map(lambda x: {"tokens": tokenizer.encode(x["content"]).ids})
print("after tokenize dataset size: ", subset.shape)
print()
print(subset["tokens"][0])  # Example tokenized output
print()
#
# # VIEW SOME TOKENS:
# # show each token for first 10:
# for i in range(100):
#     print(subset_tokenizd["tokens"][0][i], tokenizer.decode([subset_tokenizd["tokens"][0][i]]))


In [6]:
def create_training_pairs(tokens, seq_len):
    # Split into sequences of length seq_len + 1
    sequences = [tokens[i:i + seq_len + 1] for i in range(len(tokens) - seq_len)]
    return sequences

seq_len = 50
subset = subset.map(lambda x: {"sequences": create_training_pairs(x["tokens"], seq_len)})


In [None]:
# INSPECTING SUBSET with new columns:
def print_subset():
    print(subset.shape)
    for name in subset.column_names:
        print(f"{name}: {subset[name]}")
        print(f"  {len(subset[name])}")
    print()


def print_subset_new_columns():
    for name in subset.column_names:
        if name in original_columns:
            continue
        print(f"{name}: {subset[name]}")
        first_record = subset[name][0]
        if isinstance(first_record, list):
            print(f"  {first_record}")
            print(f"  {len(first_record)} records")
        # if list has lists:
        first_element_of_first_record = first_record[0]
        if isinstance(first_element_of_first_record, list):
            print(f"    {first_element_of_first_record}")
            print(f"    {len(first_element_of_first_record)} items")
        

    print()


print_subset()

print()
print("# SEQUENCES:")
print()
for i in range(3):
    # each record (in original dataset) now has a list of sequences (so, sequences is not a scalar value like content)
    print("## sequence " + str(i))
    print(subset["sequences"][0][i])
    print(tokenizer.decode(subset["sequences"][0][i]))
    print()

In [8]:
# Separate input and target
def split_input_target(sequence):
    return sequence[:-1], sequence[1:]

# wrong:
# def mapper(x):
#     what = x['sequences']
#     split = split_input_target(what)
#     print(f"* {len(what)} -> {len(split[0])} {len(split[1])}")
#     return {"x_input": split[0], "y_target": split[1]}

def mapper(all):
    sequences = all["sequences"]
    splits = [split_input_target(sequence) for sequence in sequences]
    x_input = [split[0] for split in splits]
    y_target =  [split[1] for split in splits]
    return {"x_input": x_input, "y_target": y_target}
subset = subset.map(mapper)


In [None]:
# print_subset_new_columns()
a = [1,2,3,4]
print(a[:-1])
print(a[1:])
print(split_input_target(a))

print_subset_new_columns()

print(subset["content"][0])

In [10]:
import torch
import torch.nn as nn


class CodeCompletionTransformer(nn.Module):

    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding
        x = self.transformer(x, x)  # Encoder-only transformer
        return self.fc(x)


# Initialize
vocab_size = 8000
# mps_device = torch.device("mps")
model = CodeCompletionTransformer(vocab_size=vocab_size, d_model=128, nhead=4, num_layers=2, seq_len=seq_len)
# model.to(mps_device)


In [11]:
# *** ALTERNATE (less complexity, should train faster and maybe be useful)
class AlternateCodeCompletionLSTM(nn.Module):

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        return self.fc(output)


# Initialize
# model = AlternateCodeCompletionLSTM(vocab_size=8000, embed_size=128, hidden_size=256)


In [None]:
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn.functional as F

# DataLoader
class CodeDataset(torch.utils.data.Dataset):

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

# Prepare data
# inputs = subset["x_input"]
# targets = subset["y_target"]
#inputs = subset["x_input"][0]
#targets = subset["y_target"][0]
inputs = [ item for sublist in subset["x_input"] for item in sublist]
targets = [ item for sublist in subset["y_target"] for item in sublist]
train_loader = DataLoader(CodeDataset(inputs, targets), batch_size=32, shuffle=True)

# Optimizer and loss
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(30):  # Adjust epochs as needed
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
# 40 epochs on first record sequences only... got to 11/17% loss, IIAC because data set is small, its easy to overfit it?
# 50 epochs => 0.029 down to 0.005 loss... and it gives reasonable completions for the one example, well formatted shell script code

Epoch 1, Loss: 0.3428381085395813
Epoch 2, Loss: 0.2325245440006256
Epoch 3, Loss: 0.18840189278125763
Epoch 4, Loss: 0.23613181710243225
Epoch 5, Loss: 0.15484541654586792
Epoch 6, Loss: 0.12867139279842377
Epoch 7, Loss: 0.10232086479663849
Epoch 8, Loss: 0.12792468070983887
Epoch 9, Loss: 0.08423150330781937
Epoch 10, Loss: 0.09120484441518784
Epoch 11, Loss: 0.10631529241800308
Epoch 12, Loss: 0.05978334695100784
Epoch 13, Loss: 0.07507678866386414
Epoch 14, Loss: 0.04323822632431984
Epoch 15, Loss: 0.08547110110521317
Epoch 16, Loss: 0.07593853026628494
Epoch 17, Loss: 0.08188261836767197
Epoch 18, Loss: 0.032942067831754684
Epoch 19, Loss: 0.05927233025431633
Epoch 20, Loss: 0.061810266226530075
Epoch 21, Loss: 0.029853317886590958
Epoch 22, Loss: 0.02651490829885006
Epoch 23, Loss: 0.026656942442059517
Epoch 24, Loss: 0.024634717032313347
Epoch 25, Loss: 0.04473059996962547
Epoch 26, Loss: 0.014352821744978428
Epoch 27, Loss: 0.02192622423171997
Epoch 28, Loss: 0.009389908052980

In [22]:
# save model:
torch.save(model.state_dict(), f"{scenario_path}/checkpoint001-50epochs.pth")

def load_model():
    model = CodeCompletionTransformer(vocab_size=vocab_size, d_model=128, nhead=4, num_layers=2, seq_len=seq_len)
    # Load the saved state_dict into the model
    model.load_state_dict(torch.load(f"{scenario_path}/checkpoint001.pth"))
    #model.to(mps_device)

# load_model()

In [32]:



def generate_code(model, tokenizer, prompt, max_len=50):
    model.eval()
    tokens = tokenizer.encode(prompt).ids  # Convert prompt to tokens
    if len(tokens) < max_len:
        raise Exception("prompt too short")
        # pad didn't "work" but I only tested it one time so yeah... and on a miniscule dataset :)
        padding = tokenizer.encode("\n" * (max_len - len(tokens))).ids
        padding.extend(tokens)
        tokens = padding

    print("tokens:", tokens)
    generated_tokens = []
    tokens = tokens[-max_len:]  # truncate to max_len
    original_decoded = tokenizer.decode(tokens)
    for _ in range(max_len):
        last_max_len_tokens = tokens[-max_len:]
        input_tensor = torch.tensor(last_max_len_tokens).unsqueeze(0)  # Add batch dimension
        output = model(input_tensor)  #.contiguous().to(mps_device))  # Predict next token
        next_token = output[0, -1].argmax(-1).item()  # Get the highest probability token
        tokens.append(next_token)  # Add the next token
        generated_tokens.append(next_token)
        if next_token == tokenizer.token_to_id("<END>"):  # Stop at <END> token
            break
    return original_decoded, tokenizer.decode(generated_tokens)


# TODO padding issue, NEED TO FIX in tokenizer? and? retrain? or? just pad bogus chars?
# print(generate_code(model, tokenizer, "#!/bin/bash\n"))
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\nfi\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\""
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	echo 1\n\n"
test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuckfaceretarted"  # too short
# wow in the case of me rambling swear words, it put more gibberish after it like random words too! cool... versus an `fi` to close out
# test_prompt = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n   echo 'suck a giant fucking dick you asslicking cutwaffle pumperkiffenslobistrapistfuck'\n"  # HERE b/c it was closed it started to add diff logic below.. wrong but interesting to generate what was complete logic blocks (partially correct just not across mutliple lines but who cares... it even got indentation and changed out what was happening in an if block!)
test_prompt="KUBECTL='docker exec hyperkube /hyperkube kubectl'\n\n#RUN_SKYDNS=\"yes\"\nRUN_SKYDNS=\"no\"\n# DNS_ARGUMENTS needs to be p"
# if [ "${RUN_SKYDNS}" = "yes" ]; then
# 	DNS_ARGUMENTS="--cluster-dns=10.0.0.10 --cluster-domain=cluster.local"
# else
# 	DNS_ARGUMENTS=""
# fi
# "
from colorama import Fore, Style

original, generated = generate_code(model, tokenizer, test_prompt)
print("## all:")
print(f"{Fore.GREEN}{original}{Style.RESET_ALL}{Fore.RED}{generated}{Style.RESET_ALL}")
print()
# TODO strip new lines? or can I collapse down to one with a count?


tokens: [365, 28, 6, 317, 220, 68, 87, 68, 66, 220, 431, 220, 14, 431, 404, 66, 83, 75, 6, 198, 198, 2, 313, 62, 314, 296, 340, 1, 198, 313, 62, 314, 296, 285, 1, 198, 2, 220, 271, 62, 364, 220, 392, 68, 383, 415, 403, 68, 220, 79]
## all:
[32mKUBECTL='docker exec hyperkube /hyperkube kubectl'

#RUN_SKYDNS="yes"
RUN_SKYDNS="no"
# DNS_ARGUMENTS needs to be p[0m[31massed passed Kubernetes is setup.
if [ "${RUN_SKYDNS}" = "yes" ]; then
	DNS_ARGUMENTS="--cluster-dns=10.0.0.10 --[0m

