In [None]:
import os
from datasets import load_dataset
from pprint import pprint

# Fail fast when no Hugging Face token is configured (the dataset below is gated).
if not os.environ.get("HF_TOKEN"):
    # FYI export HF_TOKEN=$(pbpaste)
    raise ValueError("You need to set HF_TOKEN environment variable. Get one from huggingface.co")

# Load the shell/bash subset
#   FYI:   https://huggingface.co/datasets/bigcode/the-stack/tree/main/data/shell (FYI gated, must give email and approve)
full_dataset = load_dataset( "bigcode/the-stack", split="train", data_dir="data/shell")  # , lang=["bash"])
# data_dir data/shell has 11 files, not bad size (about 4GB IIRC)

# Save locally in a different format if needed; I am going to stick with parquet
# full_dataset.to_csv("shell_scripts.csv")
# print(full_dataset.column_names)
# Keep only the first record (range(1)); every later cell works on `subset`.
subset = full_dataset.select(range(1))
print("shape", subset.shape)

# Remember the dataset's own columns so that columns added by later cells can be told apart
# (used by print_subset_new_columns).
original_columns = subset.column_names

In [6]:
# build corpus
# exist_ok=True makes the directory creation idempotent (no separate exists() check needed).
os.makedirs("tmp", exist_ok=True)

corpus_file = "tmp/shell_scripts_corpus.sh"
# The corpus file acts as a cache: it is only written when missing, so delete it to
# rebuild the corpus after changing `subset`.
if not os.path.exists(corpus_file):
    # Explicit UTF-8: without it open() uses the platform's locale encoding, which can raise
    # UnicodeEncodeError on scripts containing non-ASCII characters.
    with open(corpus_file, "w", encoding="utf-8") as f:
        for example in subset["content"]:  # Adjust "content" to match your dataset key
            f.write(example + "\n")

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Directory that holds the trained tokenizer files (vocab.json / merges.txt)
tokenizer_path = "tmp/trained-tokenizer"
if not os.path.exists(tokenizer_path):
    os.makedirs(tokenizer_path)

# Train the tokenizer
# Training is skipped when a saved vocab already exists, so delete the directory to retrain
# after the corpus changes.
if not os.path.exists(tokenizer_path + "/vocab.json"):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_file], vocab_size=8000, min_frequency=2)  # PRN adjust vocab_size/min_frequency?
    tokenizer.save_model(tokenizer_path)

# load the tokenizer
# Always reload from the saved files, whether or not training just ran above.
tokenizer = ByteLevelBPETokenizer(tokenizer_path + "/vocab.json", tokenizer_path + "/merges.txt")

# tokenize (FYI .map is immutable, a new dataset is returned with the added column)
print("before tokenize dataset size: ", subset.shape)
# Adds a "tokens" column holding the token ids of each record's "content"
subset = subset.map(lambda x: {"tokens": tokenizer.encode(x["content"]).ids})
print("after tokenize dataset size: ", subset.shape)
print()
print(subset["tokens"][0])  # Example tokenized output
print()
#
# # VIEW SOME TOKENS:
# # show each of the first 100 tokens of the first record next to its decoded text:
# for i in range(100):
#     print(subset["tokens"][0][i], tokenizer.decode([subset["tokens"][0][i]]))


In [8]:
def create_training_pairs(tokens, seq_len):
    """Return every overlapping window of ``seq_len + 1`` consecutive tokens.

    Windows start at each index from 0 up to ``len(tokens) - seq_len - 1`` (stride 1).
    The extra token in each window lets a later step split it into an input and a
    one-step-shifted target. When ``tokens`` has ``seq_len`` items or fewer, the
    result is an empty list.
    """
    window_size = seq_len + 1
    windows = []
    for start in range(len(tokens) - seq_len):
        windows.append(tokens[start:start + window_size])
    return windows

# Model context length in tokens; create_training_pairs builds windows of seq_len + 1 tokens
seq_len = 50
# Adds a "sequences" column: for every record, the list of overlapping token windows
subset = subset.map(lambda x: {"sequences": create_training_pairs(x["tokens"], seq_len)})


In [None]:
# INSPECTING SUBSET with new columns:
def print_subset():
    """Print the shape of the global ``subset``, then every column's values and length.

    Output is followed by one blank line.
    """
    print(subset.shape)
    for column in subset.column_names:
        # One print call, two lines: "<name>: <values>" then the indented row count
        print(f"{column}: {subset[column]}", f"  {len(subset[column])}", sep="\n")
    print()


def print_subset_new_columns():
    """Print only the columns of the global ``subset`` that are not in ``original_columns``.

    For each such column the full column is printed. If its first record is a list,
    that record and its length are printed as well; if that list is non-empty and its
    first element is itself a list, that element and its length are printed too.
    Columns that are empty, or whose first record is not a list (or is an empty list),
    are handled without raising. Output is followed by one blank line.
    """
    for name in subset.column_names:
        if name in original_columns:
            continue
        column = subset[name]
        print(f"{name}: {column}")
        if len(column) == 0:
            # Nothing to drill into
            continue
        first_record = column[0]
        if not isinstance(first_record, list):
            # Scalar column: indexing into the record would fail or be meaningless
            continue
        print(f"  {first_record}")
        print(f"  {len(first_record)} records")
        # Only look one level deeper when there is an element to look at
        if first_record and isinstance(first_record[0], list):
            first_element_of_first_record = first_record[0]
            print(f"    {first_element_of_first_record}")
            print(f"    {len(first_element_of_first_record)} items")

    print()


print_subset()

print()
print("# SEQUENCES:")
print()
# each record (in original dataset) now has a list of sequences (so, sequences is not a scalar value like content)
# Look the first record's sequences up once instead of repeating the column lookup on every iteration.
first_record_sequences = subset["sequences"][0]
# Slicing (rather than indexing 0..2) avoids an IndexError when the record is too short
# to yield three sequences.
for i, sequence in enumerate(first_record_sequences[:3]):
    print("## sequence " + str(i))
    print(sequence)
    print(tokenizer.decode(sequence))
    print()

In [None]:
# Separate input and target
def split_input_target(sequence):
    """Split one sequence into an ``(input, target)`` pair shifted by one position.

    The input is everything except the last element; the target is everything
    except the first element, so ``target[i]`` is the element following ``input[i]``.
    """
    model_input = sequence[:-1]
    shifted_target = sequence[1:]
    return model_input, shifted_target

# wrong:
# def mapper(x):
#     what = x['sequences']
#     split = split_input_target(what)
#     print(f"* {len(what)} -> {len(split[0])} {len(split[1])}")
#     return {"x_input": split[0], "y_target": split[1]}

def mapper(all):
    """Build the ``x_input`` / ``y_target`` lists for one record.

    ``all`` is one record; its ``"sequences"`` entry is a list of sequences. Each
    sequence is split with ``split_input_target`` and the halves are collected, in
    order, into two parallel lists returned under the keys ``"x_input"`` and
    ``"y_target"``.
    """
    x_input = []
    y_target = []
    for sequence in all["sequences"]:
        source, shifted = split_input_target(sequence)
        x_input.append(source)
        y_target.append(shifted)
    return {"x_input": x_input, "y_target": y_target}
# Adds "x_input" / "y_target" columns (per record: one entry per sequence); rebinds `subset` to the result
subset = subset.map(mapper)


In [None]:
# print_subset_new_columns()
# Sanity check of the slicing used by split_input_target, on a tiny list
a = [1,2,3,4]
print(a[:-1])  # all but the last element
print(a[1:])  # all but the first element
print(split_input_target(a))

print_subset_new_columns()

# Raw "content" of the first record
print(subset["content"][0])

In [17]:
import torch
import torch.nn as nn


class CodeCompletionTransformer(nn.Module):
    """Next-token prediction model built on ``nn.Transformer``.

    Token ids are embedded, a learned positional parameter is added, the result is
    passed through ``nn.Transformer`` as both source and target under causal masks,
    and a linear layer maps each position to vocabulary logits.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: passed as ``nn.Transformer``'s third positional argument.
        seq_len: maximum supported sequence length (size of the positional parameter).
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional parameter, initialized to zeros: one vector per position
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, d_model))
        # NOTE(review): the third positional argument is num_encoder_layers only, so the
        # decoder keeps nn.Transformer's default depth. Left unchanged on purpose so that
        # state_dicts saved from this class keep loading.
        self.transformer = nn.Transformer(d_model, nhead, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, length, vocab_size)`` for token ids ``x``.

        Args:
            x: integer tensor of shape ``(batch, length)`` with ``length <= seq_len``.

        Raises:
            ValueError: if ``length`` exceeds the ``seq_len`` the model was built with.
        """
        length = x.size(1)
        max_length = self.positional_encoding.size(1)
        if length > max_length:
            raise ValueError(f"sequence length {length} exceeds the model's seq_len {max_length}")
        # Slice the positional parameter so inputs shorter than seq_len (e.g. short prompts) work
        x = self.embedding(x) + self.positional_encoding[:, :length]
        # Causal mask: -inf above the diagonal so position i only attends to positions <= i.
        # Without it every position sees the tokens it is being trained to predict.
        causal_mask = torch.triu(
            torch.full((length, length), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        # Same tensor is used as source and target; all three attention paths are masked
        x = self.transformer(x, x, src_mask=causal_mask, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.fc(x)


# Initialize
# vocab_size mirrors the vocab_size=8000 the tokenizer is trained with; keep the two in sync
vocab_size = 8000
# seq_len (defined in the sequence-building cell) sizes the model's positional parameter
model = CodeCompletionTransformer(vocab_size=vocab_size, d_model=128, nhead=4, num_layers=2, seq_len=seq_len)


In [20]:
# *** ALTERNATE (less complexity, should train faster and maybe be useful)
class AlternateCodeCompletionLSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-position logits; the LSTM's final state is discarded."""
        embedded = self.embedding(x)
        hidden_states = self.lstm(embedded)[0]
        logits = self.fc(hidden_states)
        return logits


# Initialize
# model = AlternateCodeCompletionLSTM(vocab_size=8000, embed_size=128, hidden_size=256)


In [None]:
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn.functional as F

# DataLoader
class CodeDataset(torch.utils.data.Dataset):
    """Pairs up parallel ``inputs`` / ``targets`` collections as tensors, item by item."""

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # Size is taken from the inputs only
        return len(self.inputs)

    def __getitem__(self, idx):
        source = torch.tensor(self.inputs[idx])
        label = torch.tensor(self.targets[idx])
        return source, label

# Prepare data
# Each record holds a list of sequences; flatten so that one dataset item == one sequence.
# (Earlier experiments used only the first record: subset["x_input"][0] / subset["y_target"][0].)
inputs = [ item for sublist in subset["x_input"] for item in sublist]
targets = [ item for sublist in subset["y_target"] for item in sublist]
if not inputs:
    # Happens when no record is long enough to yield a full sequence; fail with a clear message
    raise ValueError("No training sequences available: `subset` produced no x_input sequences")
train_loader = DataLoader(CodeDataset(inputs, targets), batch_size=32, shuffle=True)

# Optimizer and loss
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
# Make sure dropout etc. are in training mode (generate_code switches the model to eval mode).
model.train()
for epoch in range(10):  # Adjust epochs as needed
    epoch_loss = 0.0
    batch_count = 0
    # Distinct batch names so the `inputs` / `targets` lists above are not overwritten
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab); reshape also handles non-contiguous tensors
        loss = loss_fn(outputs.reshape(-1, vocab_size), batch_targets.reshape(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        batch_count += 1
    # Report the mean loss over the epoch rather than only the last batch's loss
    print(f"Epoch {epoch+1}, Loss: {epoch_loss / max(batch_count, 1)}")
# 40 epochs on the first record's sequences only... got to 11/17% loss; IIAC because the dataset is small, it's easy to overfit it?

In [None]:
# save model:
# torch.save(model.state_dict(), "tmp/model-40ish-epocs-first-record-only.pth")


def load_model(path="tmp/model-40ish-epocs-first-record-only.pth"):
    """Build a CodeCompletionTransformer and load saved weights into it.

    Args:
        path: file containing a state_dict saved with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The model with the loaded weights (previously the model was built and then discarded).
    """
    loaded = CodeCompletionTransformer(vocab_size=vocab_size, d_model=128, nhead=4, num_layers=2, seq_len=seq_len)
    # NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
    # Load the saved state_dict into the model
    loaded.load_state_dict(torch.load(path))
    return loaded

# Rebind the notebook-level `model` so later cells actually use the loaded weights
model = load_model()

In [None]:
# *** EVALUATE


def generate_code(model, tokenizer, tokens, max_len=50):
    """Greedily generate up to ``max_len`` tokens continuing ``tokens`` and return them decoded.

    The context is a sliding window with the same length as ``tokens``: after each step
    the predicted token is appended and the oldest token is dropped.

    Args:
        model: callable mapping a ``(1, length)`` id tensor to ``(1, length, vocab)`` scores;
            it is switched to eval mode.
        tokenizer: object providing ``token_to_id`` and ``decode``.
        tokens: prompt token ids; the caller's list is not modified.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded text of the generated tokens only (the prompt is not included).
    """
    model.eval()
    # Work on a copy so the caller's prompt list is left untouched
    window = list(tokens)
    # Looked up once; if the tokenizer has no "<END>" token this presumably is None and never
    # matches, so generation simply runs for max_len steps.
    end_token_id = tokenizer.token_to_id("<END>")
    generated_tokens = []
    # Inference only: no need to track gradients
    with torch.no_grad():
        for _ in range(max_len):
            input_tensor = torch.tensor(window).unsqueeze(0)  # Add batch dimension
            output = model(input_tensor)  # Predict next token
            next_token = output[0, -1].argmax(-1).item()  # Get the highest probability token
            generated_tokens.append(next_token)
            # drop first token for next iteration # TODO change it up so I only take last 50 chars on each loop... so I get one final chunk with prompt and generated code
            window = window[1:] + [next_token]
            if next_token == end_token_id:  # Stop at <END> token
                break
    return tokenizer.decode(generated_tokens)


# TODO padding issue, NEED TO FIX in tokenizer? and? retrain? or? just pad bogus chars?
# print(generate_code(model, tokenizer, "#!/bin/bash\n"))
# Alternative prompt (the complete if/else/fi block); kept as a comment because it was
# immediately overwritten by the assignment below:
# test = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\"\nfi\n"
test = "if [ \"${RUN_SKYDNS}\" = \"yes\" ]; then\n	DNS_ARGUMENTS=\"--cluster-dns=10.0.0.10 --cluster-domain=cluster.local\"\nelse\n	DNS_ARGUMENTS=\"\""
# test="KUBECTL='docker exec hyperkube /hyperkube kubectl'\n\n#RUN_SKYDNS=\"yes\"\nRUN_SKYDNS=\"no\"\n# DNS_ARGUMENTS needs to be pa"
# if [ "${RUN_SKYDNS}" = "yes" ]; then
# 	DNS_ARGUMENTS="--cluster-dns=10.0.0.10 --cluster-domain=cluster.local"
# else
# 	DNS_ARGUMENTS=""
# fi
# "
tokens = tokenizer.encode(test).ids  # Convert prompt to tokens

# Keep the LAST seq_len tokens: the completion has to continue from the end of the prompt,
# so when the prompt is too long it is the oldest tokens that must be dropped.
# seq_len is the context length the model was built with (previously a hard-coded 50).
if len(tokens) > seq_len:
    tokens = tokens[-seq_len:]
print("prompt: " + tokenizer.decode(tokens))
print("## generated: \n" + generate_code(model, tokenizer, tokens))