In [1]:
# vim_command_trainer.ipynb

# This is a Jupyter notebook outline for training a CodeT5-based encoder-decoder model
# to output raw Vim command strings from before/after code examples.

# You can copy this into a `.ipynb` file or use it in Colab/Jupyter directly.

# ---------------------------------------------
# 1. Install Dependencies
# ---------------------------------------------
!pip install transformers datasets tokenizers accelerate

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting accelerate
  Downloading accelerate-1.8.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarr

In [3]:
# ---------------------------------------------
# 2. Load Pretrained Encoder and Tokenizers
# ---------------------------------------------
from transformers import AutoTokenizer, AutoModel, PreTrainedTokenizerFast
from tokenizers import ByteLevelBPETokenizer
import torch
from torch import nn

# Load CodeT5 encoder
ENCODER_NAME = "Salesforce/codet5-small"
encoder_tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
encoder = AutoModel.from_pretrained(ENCODER_NAME)
encoder.requires_grad_(False)  # Freeze encoder

# Load custom Vim command decoder tokenizer (trained beforehand)
decoder_tokenizer = PreTrainedTokenizerFast.from_pretrained("vim_bpe_tokenizer")



OSError: vim_bpe_tokenizer is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# ---------------------------------------------
# 3. Define Decoder Model
# ---------------------------------------------
class SimpleDecoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_size, nhead=8), num_layers=2
        )
        self.output_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, tgt_ids, memory):
        tgt_emb = self.embedding(tgt_ids)
        out = self.transformer(tgt=tgt_emb, memory=memory)
        return self.output_head(out)



In [None]:
# ---------------------------------------------
# 4. Load and Prepare Dataset
# ---------------------------------------------
from torch.utils.data import Dataset, DataLoader
import json

class VimDataset(Dataset):
    def __init__(self, data_path, enc_tok, dec_tok, max_length=256):
        self.samples = [json.loads(line) for line in open(data_path)]
        self.enc_tok = enc_tok
        self.dec_tok = dec_tok
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        enc = self.enc_tok(item['input'], return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        dec = self.dec_tok(item['output'], return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        return {
            'input_ids': enc.input_ids.squeeze(0),
            'attention_mask': enc.attention_mask.squeeze(0),
            'labels': dec.input_ids.squeeze(0),
        }

# Example usage
# dataset = VimDataset("vimgolf_data.jsonl", encoder_tokenizer, decoder_tokenizer)
# dataloader = DataLoader(dataset, batch_size=8, shuffle=True)



In [None]:
# ---------------------------------------------
# 5. Training Loop
# ---------------------------------------------
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
decoder = SimpleDecoder(hidden_size=768, vocab_size=decoder_tokenizer.vocab_size).to(device)
optimizer = torch.optim.AdamW(decoder.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=decoder_tokenizer.pad_token_id)

# for epoch in range(3):
#     for batch in dataloader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         with torch.no_grad():
#             memory = encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

#         outputs = decoder(labels[:, :-1], memory)
#         loss = loss_fn(outputs.view(-1, decoder_tokenizer.vocab_size), labels[:, 1:].reshape(-1))

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")



In [None]:
# ---------------------------------------------
# 6. Inference Example with Prefix
# ---------------------------------------------
# prefix = "50G23l"
# prefix_ids = decoder_tokenizer(prefix, return_tensors='pt').input_ids.to(device)
# encoder_inputs = encoder_tokenizer("BEFORE: ... AFTER: ...", return_tensors="pt").to(device)

# output = decoder.generate(
#     **encoder_inputs,
#     decoder_input_ids=prefix_ids,
#     max_new_tokens=50
# )

# decoded = decoder_tokenizer.decode(output[0], skip_special_tokens=True)
# print("Generated Vim command:", decoded)

# ---------------------------------------------
# This notebook gives you a full training and inference scaffold. You’ll just need to:
# - Prepare the `vimgolf_data.jsonl` file
# - Train a BPE tokenizer with `tokenizers` and save it to `vim_bpe_tokenizer/`
# - Run training loop and inference as needed