In [27]:
import numpy as np
import datasets
import os 
from sklearn.model_selection import train_test_split

os.environ["HF_CACHE_DIR"] = "/data_hdd_16t/khanhtran/LLM/pretraining_GPT2"

c4 = datasets.load_dataset('datablations/c4-filter-small', split='train')
split = c4.train_test_split(test_size=0.1,)
test_ds = split['test']
train_ds = split['train']
split = train_ds.train_test_split(test_size=0.1,)
train_ds = split['train']
val_ds = split['test']

In [28]:
len(train_ds)

81000

In [25]:
train_ds, test_ds = train_ds['text'], test_ds['text']

In [29]:
train_ds

Dataset({
    features: ['text', 'timestamp', 'url', 'meta', 'text_length', 'domain', 'perplexity', 'dup_ratio', 'pairs', 'repetitions', 'cluster'],
    num_rows: 81000
})

In [30]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=60000, 
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
tokenizer.train_from_iterator(train_ds, trainer=trainer)
tokenizer.save("gpt2_tokenizer.json")






In [31]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="gpt2_tokenizer.json"
)
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<s>", 
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "mask_token": "<mask>"
})




0

In [38]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        truncation=True, 
        max_length=512, 
        padding="max_length"
    )

tokenized_train_ds = train_ds.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["text"], 
    num_proc=10
)
tokenized_test_ds = test_ds.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["text"], 
    num_proc=10
)
tokenized_val_ds = val_ds.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["text"], 
    num_proc=10
)

In [39]:
from itertools import chain
from datasets import Dataset

block_size = 128

def manual_group(dataset):
    input_ids = list(chain(*dataset["input_ids"]))
    attention_mask = list(chain(*dataset["attention_mask"]))

    total_length = (len(input_ids) // block_size) * block_size
    result = {
        "input_ids": [input_ids[i:i+block_size] for i in range(0, total_length, block_size)],
        "attention_mask": [attention_mask[i:i+block_size] for i in range(0, total_length, block_size)]
    }
    result["labels"] = result["input_ids"].copy()
    return Dataset.from_dict(result)

lm_train_ds = manual_group(tokenized_train_ds)
lm_val_ds = manual_group(tokenized_val_ds)
lm_test_ds = manual_group(tokenized_test_ds)

In [40]:
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=tokenizer.vocab_size, 
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

In [41]:
model = GPT2LMHeadModel(config)

In [66]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)

training_args = TrainingArguments(
    output_dir="gpt2-finetuned-c4",
    logging_dir= "gpt2-finetuned-c4/logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    metric_for_best_model="eval_loss",
    evaluation_strategy="steps",
    greater_is_better= True,
    eval_steps=500,
    load_best_model_at_end=True,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train_ds,
    eval_dataset=lm_val_ds,
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [67]:
import torch

os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [68]:
trainer.train()



Step,Training Loss,Validation Loss
500,1.9716,1.869307




KeyboardInterrupt: 