In [1]:
# Path of the CSV file read with pandas; rows are expected to carry a "code" column.
data_dir = "data/code_data.csv"
# Directory the freshly initialised model is written to and later loaded from.
save_dir = "model/voltscript/gpt2-large-hf"
# Hub identifier of the pretrained checkpoint whose tokenizer and config are reused.
base_model = "openai-community/gpt2-large"
# Directory the retrained tokenizer is saved to and loaded from.
tok_dir = "tokenizer/voltscript-bpe"
# NOTE(review): n_examples is not referenced anywhere in the visible notebook - confirm it is still needed.
n_examples = 60
# Seed handed to set_seed() when the accelerator is initialised.
seed = 1

In [2]:
"""
Dataset prep
"""
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    """Iterable dataset yielding fixed-length tensors of token ids.

    The ``"code"`` field of each example in ``dataset`` is collected into a
    character buffer. Once the buffer holds at least
    ``seq_length * chars_per_token * num_of_sequences`` characters (or the
    dataset is exhausted) it is tokenized in one call, the examples are joined
    into a single token stream with ``tokenizer.bos_token_id`` appended after
    each one, and the stream is cut into chunks of exactly ``seq_length``
    tokens. A trailing chunk shorter than ``seq_length`` is dropped.

    Args:
        tokenizer: Callable returning a mapping with an ``"input_ids"`` list
            for a list of strings; must expose ``bos_token_id``.
        dataset: Iterable of examples indexable by ``"code"``.
        infinite: When true, restart ``dataset`` each time it is exhausted.
        seq_length: Number of token ids in every yielded tensor.
        num_of_sequences: Factor used to size the character buffer.
        chars_per_token: Factor used to size the character buffer.
    """

    def __init__(
        self, tokenizer, dataset, infinite=False, seq_length=512, num_of_sequences=512, chars_per_token=3.6
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.epoch = 0
        self.infinite = infinite

    def __iter__(self):
        """Yield ``torch`` tensors of ``seq_length`` token ids each."""
        iterator = iter(self.dataset)
        more_examples = True
        # Tracks whether the current pass over the dataset produced any text,
        # so that infinite mode cannot spin forever on an empty dataset.
        pass_has_text = False
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.input_characters:
                try:
                    text = next(iterator)["code"]
                except StopIteration:
                    if self.infinite and pass_has_text:
                        iterator = iter(self.dataset)
                        pass_has_text = False
                        self.epoch += 1
                        print(f"Dataset epoch: {self.epoch}")
                        continue
                    more_examples = False
                    break
                except TypeError:
                    # Example is not subscriptable; skip it (best effort).
                    continue
                if not isinstance(text, str):
                    # Non-text values (e.g. None/NaN) must never reach the
                    # tokenizer, so they are skipped before being buffered.
                    continue
                buffer.append(text)
                buffer_len += len(text)
                if text:
                    pass_has_text = True
            if not buffer:
                # Nothing collected: do not call the tokenizer on an empty batch.
                continue
            tokenized_inputs = self.tokenizer(buffer, truncation=True)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [None]:
import pandas as pd
import torch
from accelerate import Accelerator, DistributedType
from datasets import load_dataset, Dataset
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
from transformers import HfArgumentParser, set_seed
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


def batch_iterator(n_examples, batch_size=1, iter_dataset=None):
    """Yield batches of ``"code"`` strings for tokenizer training.

    Args:
        n_examples: Upper bound on the number of examples to consume; used to
            size the progress bar.
        batch_size: Number of examples per yielded list.
        iter_dataset: Iterable of examples indexable by ``"code"``. When
            omitted, a module-level variable named ``iter_dataset`` is used.

    Yields:
        Lists of up to ``batch_size`` strings. A final partial batch is
        yielded rather than discarded; iteration stops once the source is
        exhausted.

    Raises:
        NameError: If no iterator is passed and no module-level
            ``iter_dataset`` exists.
    """
    from itertools import islice

    source = iter_dataset if iter_dataset is not None else globals().get("iter_dataset")
    if source is None:
        raise NameError(
            "batch_iterator needs an iterator: pass iter_dataset=... or define a global iter_dataset"
        )
    # iter() is a no-op on an iterator and keeps islice from restarting a list.
    source = iter(source)
    for _ in tqdm(range(0, n_examples, batch_size)):
        batch = [example["code"] for example in islice(source, batch_size)]
        if not batch:
            return
        yield batch


def build_tokenizer(data_dir):
    """Train a new tokenizer on the training split and save it to ``tok_dir``.

    The tokenizer of ``base_model`` is used as the template: the new tokenizer
    keeps its vocabulary size and is seeded with the GPT-2 byte-level
    alphabet. The CSV at ``data_dir`` is read, rows with missing values are
    dropped, and the last 20% of rows are held out; only the remaining rows'
    ``"code"`` column is used for training.

    Args:
        data_dir: Path of the CSV file to read.

    Returns:
        The newly trained tokenizer (also written to ``tok_dir``).
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    base_vocab = list(bytes_to_unicode().values())

    # Load dataset
    df = pd.read_csv(data_dir).dropna()
    # Compute the split index explicitly: slicing with ``[:-n]`` returns an
    # empty frame when n == 0 (fewer than five rows).
    n_valid = int(len(df) * 0.2)
    train_texts = df.iloc[: len(df) - n_valid]["code"].tolist()

    # One-example batches, fed straight from the local training split so the
    # trainer does not rely on any module-level iterator.
    corpus = ([text] for text in tqdm(train_texts))

    # Training and saving
    new_tokenizer = tokenizer.train_new_from_iterator(
        corpus,
        vocab_size=tokenizer.vocab_size,
        initial_alphabet=base_vocab,
    )
    new_tokenizer.save_pretrained(tok_dir, push_to_hub=False)
    return new_tokenizer


def load_tokenizer():
    """Load the trained tokenizer and write a freshly initialised model.

    The tokenizer is read from ``tok_dir``. A model is then created from the
    ``base_model`` configuration (random weights, not pretrained ones) with
    its vocabulary resized to the tokenizer, and saved to ``save_dir``
    together with the tokenizer so both can be loaded from that directory.

    Returns:
        The tokenizer loaded from ``tok_dir``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tok_dir)

    # Config: "scale_attn_by_inverse_layer_idx" and "reorder_and_upcast_attn"
    # are Mistral stability tweaks
    config_kwargs = {
        "vocab_size": len(tokenizer),
        "scale_attn_by_inverse_layer_idx": True,
        "reorder_and_upcast_attn": True,
    }

    # Load model config (GPT-2)
    config = AutoConfig.from_pretrained(base_model, **config_kwargs)

    # Initialize new model with config
    model = AutoModelForCausalLM.from_config(config)

    # Save model locally (push_to_hub=False: nothing is uploaded)
    model.save_pretrained(save_dir, push_to_hub=False)
    # The tokenizer is later loaded from save_dir as well, so store it there too.
    tokenizer.save_pretrained(save_dir, push_to_hub=False)
    return tokenizer


def init_accelerator():
    """Create the ``Accelerator`` and seed the random number generators.

    Returns:
        The new ``Accelerator`` instance. ``set_seed`` is called with the
        module-level ``seed`` before returning.
    """
    accelerator = Accelerator()
    set_seed(seed)
    return accelerator


def create_dataloaders(df=None, tokenizer=None):
    """Build the training and validation dataloaders.

    The first 80% of rows form the training split (shuffled with the
    module-level ``seed``); the last 20% form the validation split. Both are
    wrapped in ``ConstantLengthDataset`` with 512-token sequences and served
    in batches of 4. The training dataset restarts when exhausted; the
    validation dataset does not.

    Args:
        df: DataFrame with a ``"code"`` column. When omitted, the CSV at
            ``data_dir`` is read and rows with missing values are dropped.
        tokenizer: Tokenizer used to encode the text. When omitted, it is
            loaded from ``tok_dir``.

    Returns:
        ``(train_dataloader, eval_dataloader)``.
    """
    if df is None:
        df = pd.read_csv(data_dir).dropna()
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(tok_dir)

    # Explicit split index: ``df[:-n]`` is empty (and ``df[-n:]`` the whole
    # frame) when n == 0, which would swap the two splits for tiny inputs.
    n_valid = int(len(df) * 0.2)
    split = len(df) - n_valid
    train_data = Dataset.from_pandas(df.iloc[:split]).shuffle(seed=seed)
    valid_data = Dataset.from_pandas(df.iloc[split:])

    train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, seq_length=512)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, seq_length=512)

    train_dataloader = DataLoader(train_dataset, batch_size=4)
    eval_dataloader = DataLoader(valid_dataset, batch_size=4)
    return train_dataloader, eval_dataloader

# Build both loaders once at cell level; later cells read these two names.
train_dataloader, eval_dataloader = create_dataloaders()


def load_base(save_dir, gradient_checkpointing):
    """Read the causal-LM model and its tokenizer from ``save_dir``.

    Args:
        save_dir: Directory (or identifier) passed to ``from_pretrained``.
        gradient_checkpointing: When truthy, gradient checkpointing is
            enabled on the loaded model.

    Returns:
        ``(model, tokenizer)``.
    """
    base = AutoModelForCausalLM.from_pretrained(save_dir)
    if gradient_checkpointing:
        base.gradient_checkpointing_enable()
    return base, AutoTokenizer.from_pretrained(save_dir)
    

In [None]:
from torch.utils.data import IterableDataset


class ConstantLengthDataset(IterableDataset):
    """Iterable dataset yielding fixed-length tensors of token ids.

    The ``"code"`` field of each example in ``dataset`` is collected into a
    character buffer. Once the buffer holds at least
    ``seq_length * chars_per_token * num_of_sequences`` characters (or the
    dataset is exhausted) it is tokenized in one call, the examples are joined
    into a single token stream with ``tokenizer.bos_token_id`` appended after
    each one, and the stream is cut into chunks of exactly ``seq_length``
    tokens. A trailing chunk shorter than ``seq_length`` is dropped.

    NOTE(review): this class is defined twice in the notebook; this later
    definition is the one in effect once its cell has run.

    Args:
        tokenizer: Callable returning a mapping with an ``"input_ids"`` list
            for a list of strings; must expose ``bos_token_id``.
        dataset: Iterable of examples indexable by ``"code"``.
        infinite: When true, restart ``dataset`` each time it is exhausted.
        seq_length: Number of token ids in every yielded tensor.
        num_of_sequences: Factor used to size the character buffer.
        chars_per_token: Factor used to size the character buffer.
    """

    def __init__(
        self, tokenizer, dataset, infinite=False, seq_length=512, num_of_sequences=512, chars_per_token=3.6
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.epoch = 0
        self.infinite = infinite

    def __iter__(self):
        """Yield ``torch`` tensors of ``seq_length`` token ids each."""
        iterator = iter(self.dataset)
        more_examples = True
        # Tracks whether the current pass over the dataset produced any text,
        # so that infinite mode cannot spin forever on an empty dataset.
        pass_has_text = False
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.input_characters:
                try:
                    text = next(iterator)["code"]
                except StopIteration:
                    if self.infinite and pass_has_text:
                        iterator = iter(self.dataset)
                        pass_has_text = False
                        self.epoch += 1
                        print(f"Dataset epoch: {self.epoch}")
                        continue
                    more_examples = False
                    break
                except TypeError:
                    # Example is not subscriptable; skip it (best effort).
                    continue
                if not isinstance(text, str):
                    # Non-text values (e.g. None/NaN) must never reach the
                    # tokenizer, so they are skipped before being buffered.
                    continue
                buffer.append(text)
                buffer_len += len(text)
                if text:
                    pass_has_text = True
            if not buffer:
                # Nothing collected: do not call the tokenizer on an empty batch.
                continue
            tokenized_inputs = self.tokenizer(buffer, truncation=True)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)


In [None]:
from torch.utils.data.dataloader import DataLoader

def create_dataloaders(df=None, tokenizer=None):
    """Build the training and validation dataloaders.

    The first 80% of rows form the training split (shuffled with the
    module-level ``seed``); the last 20% form the validation split. Both are
    wrapped in ``ConstantLengthDataset`` with 512-token sequences and served
    in batches of 4. The training dataset restarts when exhausted; the
    validation dataset does not.

    NOTE(review): this function is defined twice in the notebook; this later
    definition is the one in effect once its cell has run.

    Args:
        df: DataFrame with a ``"code"`` column. When omitted, the CSV at
            ``data_dir`` is read and rows with missing values are dropped.
        tokenizer: Tokenizer used to encode the text. When omitted, it is
            loaded from ``tok_dir``.

    Returns:
        ``(train_dataloader, eval_dataloader)``.
    """
    if df is None:
        df = pd.read_csv(data_dir).dropna()
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(tok_dir)

    # Explicit split index: ``df[:-n]`` is empty (and ``df[-n:]`` the whole
    # frame) when n == 0, which would swap the two splits for tiny inputs.
    n_valid = int(len(df) * 0.2)
    split = len(df) - n_valid
    train_data = Dataset.from_pandas(df.iloc[:split]).shuffle(seed=seed)
    valid_data = Dataset.from_pandas(df.iloc[split:])

    train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, seq_length=512)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, seq_length=512)

    train_dataloader = DataLoader(train_dataset, batch_size=4)
    eval_dataloader = DataLoader(valid_dataset, batch_size=4)
    return train_dataloader, eval_dataloader

# NOTE(review): an earlier cell contains this same call; running this one rebinds both loader names.
train_dataloader, eval_dataloader = create_dataloaders()


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler


def get_grouped_params(
    model,
    no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
    weight_decay=0.1,
):
    """Split model parameters into weight-decayed and non-decayed groups.

    A parameter goes into the non-decayed group when any entry of
    ``no_decay`` occurs as a substring of its name. The default list covers
    biases and layer-norm weights; GPT-2 names its layer norms ``ln_1``,
    ``ln_2`` and ``ln_f``, so those are listed alongside ``LayerNorm.weight``.

    Args:
        model: Object exposing ``named_parameters()`` yielding
            ``(name, parameter)`` pairs.
        no_decay: Name fragments identifying parameters exempt from decay.
        weight_decay: Decay applied to the first group.

    Returns:
        A two-element list of optimizer parameter groups: decayed parameters
        first, then parameters with ``weight_decay`` 0.0.
    """
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(fragment in name for fragment in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

# The optimizer needs a model instance, and no earlier cell binds `model`,
# so it is loaded here from the directory the initialised model was saved to.
# NOTE(review): gradient checkpointing is left off - confirm whether the
# training run expects model.gradient_checkpointing_enable().
model = AutoModelForCausalLM.from_pretrained(save_dir)

optimizer = AdamW(get_grouped_params(model), lr=0.005)
# NOTE(review): the training loop stops after 100 optimizer steps, which is
# the same as num_warmup_steps, so the cosine phase is never reached - confirm
# the intended schedule.
lr_scheduler = get_scheduler(name='cosine', optimizer=optimizer,
                             num_warmup_steps=100,
                             num_training_steps=500,)


In [None]:
# No earlier cell binds `accelerator`; create it (this also applies the seed)
# before wrapping the training objects.
accelerator = init_accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

In [None]:
def evaluate(max_eval_steps=-1):
    """Compute mean loss and perplexity over ``eval_dataloader``.

    Uses the module-level ``model``, ``eval_dataloader`` and ``accelerator``.
    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards.

    Args:
        max_eval_steps: When positive, stop once the batch index reaches this
            value. The default (-1) evaluates every batch.

    Returns:
        ``(loss, perplexity)`` as Python floats, where perplexity is
        ``exp(loss)``. Both are NaN when the dataloader yields no batch.
    """
    import warnings

    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        # gather() concatenates along dim 0, so the scalar loss is made 1-D.
        losses.append(accelerator.gather(outputs.loss.unsqueeze(0)))
        if max_eval_steps > 0 and step >= max_eval_steps:
            break
    if not losses:
        # torch.cat() raises on an empty list; report the condition instead.
        warnings.warn("evaluate(): eval_dataloader produced no batches")
        return float("nan"), float("nan")
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf instead of raising, so no guard is needed.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

In [None]:
import torch 


# Training loop: gradients are accumulated over GRAD_ACCUM_STEPS batches per
# optimizer step; every EVAL_EVERY batches the model is evaluated and saved;
# training stops after MAX_TRAIN_STEPS optimizer steps.
GRAD_ACCUM_STEPS = 16
EVAL_EVERY = 100
MAX_TRAIN_STEPS = 100

model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch, use_cache=False).loss
    # Scale so the accumulated gradient is the mean over the accumulation window.
    loss = loss / GRAD_ACCUM_STEPS
    accelerator.backward(loss)
    if step % GRAD_ACCUM_STEPS == 0:
        accelerator.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % EVAL_EVERY == 0:
        eval_loss, perplexity = evaluate()
        if accelerator.is_main_process:
            print(f"step {step}: eval_loss={eval_loss:.4f} perplexity={perplexity:.2f}")
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        # `model_dir` was never defined; checkpoints go to the configured
        # save_dir. NOTE(review): this overwrites the model stored there -
        # confirm that is the intended checkpoint location.
        unwrapped_model.save_pretrained(save_dir, save_function=accelerator.save)
        # evaluate() leaves the model in eval mode.
        model.train()
    if completed_steps >= MAX_TRAIN_STEPS:
        break