In [2]:
from __future__ import annotations
import os
from pathlib import Path
from typing import Tuple, Literal, Optional

import datasets
from datasets import Dataset       
from transformers import (
    AutoTokenizer,
    PreTrainedTokenizerBase,
)
import torch
from torch.utils.data import DataLoader         


In [3]:

DEFAULT_LANGUAGE: Literal[
    "go", "java", "javascript", "php", "python", "ruby"
] = "python"

HF_DATASET_NAME = "code_search_net"
DEFAULT_TOKENIZER = "microsoft/phi-2"

In [4]:
def load_codesearchnet(
    language: str = DEFAULT_LANGUAGE,
    split: str = "train"
) -> Dataset:
    """
    Load the CodeSearchNet split for `language`.
    
    """

    ds = datasets.load_dataset(
        HF_DATASET_NAME,
        language,
        split=split,
        trust_remote_code=True)

    return ds


In [18]:
def tokenize_dataset(
    ds: Dataset,
    tokenizer: PreTrainedTokenizerBase,
    text_field: str = "func_code_string",
    max_length: int = 512,
) -> Dataset:
    """
    Tokenise `text_field` and keep only the tokens.
    """
    def _tokenise(batch):
        tokenized = tokenizer(
            batch[text_field],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenised = ds.map(
        _tokenise,
        batched=True,
        remove_columns=ds.column_names)
    
    tokenised.set_format(type="torch",columns=["input_ids", "attention_mask", "labels"])    
    return tokenised


In [7]:

def build_dataloader(
    tokenised_ds: Dataset,
    batch_size: int = 4,
    shuffle: bool = True,
) -> DataLoader:
    """
    Wrap the Hugging Face `Dataset` into a PyTorch `DataLoader`.
    """
    return DataLoader(
        tokenised_ds,
        batch_size=batch_size,
        shuffle=shuffle and not tokenised_ds.streaming,
        collate_fn=lambda x: {
            k: torch.stack([d[k] for d in x]) for k in x[0]
        },
    )


In [8]:
def get_training_dataloader(
    language: str = DEFAULT_LANGUAGE,
    split: str = "train",
    cache_dir: str | None = None,
    tokenizer_name: str = DEFAULT_TOKENIZER,
    batch_size: int = 4,
    streaming: bool = False,
) -> Tuple[DataLoader, PreTrainedTokenizerBase]:
    """
    End-to-end helper: load -> tokenise -> return DataLoader & tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=True,
    )

    ds = load_codesearchnet(
        language=language,
        split=split,
        cache_dir=cache_dir,
        streaming=streaming,
    )
    tokenised = tokenize_dataset(ds, tokenizer)
    loader = build_dataloader(tokenised, batch_size=batch_size)
    return loader, tokenizer


In [5]:
train_data = load_codesearchnet(DEFAULT_LANGUAGE, "train")
test_data = load_codesearchnet(DEFAULT_LANGUAGE, "test")

In [6]:
train_data

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [7]:
cuda_available = torch.cuda.is_available()

device = torch.device("cuda:0" if cuda_available else "cpu")

In [8]:
torch.cuda.is_available()

True

In [19]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

tokenised_train_data = tokenize_dataset(train_data, tokenizer)
tokenised_test_data = tokenize_dataset(test_data, tokenizer)

Map: 100%|██████████████████████████████████████████████████████| 412178/412178 [17:04<00:00, 402.29 examples/s]
Map: 100%|████████████████████████████████████████████████████████| 22176/22176 [00:56<00:00, 395.20 examples/s]


In [23]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2_baseline",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    eval_strategy="epoch",
    logging_steps=50,
    save_strategy="epoch",
    remove_unused_columns=False,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_train_data,
    eval_dataset=tokenised_test_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [24]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 