In [1]:
# Code reference: https://huggingface.co/docs/transformers/tasks/language_modeling

# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

! pip install transformers[torch]
! pip install accelerate -U

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [2]:
import os
import pickle

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import DataCollatorForLanguageModeling

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
# NOTE(review): the corpus directory used later is given relative to the working
# directory ("drive/MyDrive/...") — presumably the notebook runs from /content; confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# dir_name = "drive/MyDrive/babylm_augment/"
dir_name = "drive/MyDrive/babylm_pretrain_corpus/"

# Collect the records of every pickled chunk (*.pkl) in `dir_name` into one flat list.
# The listing is sorted because os.listdir() returns entries in arbitrary order, which
# would otherwise make the order of the records differ from run to run.
babylm_list_dataset = []
for file_name in sorted(os.listdir(dir_name)):
    if not file_name.endswith(".pkl"):
        continue
    # os.path.join keeps the path valid whether or not dir_name ends with a separator.
    with open(os.path.join(dir_name, file_name), "rb") as f:
        print(f"Loading chunk {file_name}")
        # NOTE(security): pickle.load can execute arbitrary code while deserializing;
        # only load chunk files that you created yourself or otherwise trust.
        data_chunk = pickle.load(f)
    babylm_list_dataset.extend(data_chunk)
print(f"Length of babylm_dataset: {len(babylm_list_dataset)}")

In [18]:
# Wrap the loaded records in a Dataset and hold out 20% of them as the test split.
# The split is seeded so the train/test partition is the same on every run; the value
# mirrors SEED from the hyperparameter cell, which is only defined further down.
babylm_dataset = Dataset.from_list(babylm_list_dataset)
babylm_dataset = babylm_dataset.train_test_split(test_size=0.2, seed=12)
babylm_dataset

In [6]:
# def preprocess_function(examples):
    # return tokenizer([" ".join(x) for x in examples["augmented_text"]])

# Load the pretrained "gpt2" tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    """Tokenize a batch of texts, each wrapped in the tokenizer's BOS and EOS tokens.

    Args:
        examples: Batch mapping that provides the strings to tokenize under the
            "augmented_text" key.

    Returns:
        Whatever the tokenizer returns for the list of wrapped strings. No truncation,
        padding or max_length arguments are passed, so the tokenizer defaults apply.
    """
    wrapped_texts = []
    for text in examples["augmented_text"]:
        wrapped_texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
    return tokenizer(wrapped_texts)

# Number of tokens in each training block produced by group_texts.
block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and cut the result into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences. Must
            contain "input_ids". The length of the first column decides how many
            items are kept, so all columns are assumed to be equally long in
            total — TODO confirm against the tokenizer output.

    Returns:
        A dict with the same keys as ``examples`` plus "labels". Each value is a list
        of chunks of ``block_size`` items. When the concatenated length is at least
        ``block_size`` the trailing remainder is dropped; when it is shorter, the
        whole batch is returned as a single short chunk. "labels" is a shallow copy
        of the "input_ids" chunk list.
    """
    # Concatenate all texts. Flattening in one pass is linear in the number of tokens;
    # sum(list_of_lists, []) would re-copy the accumulated list at every step.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
# Tokenize every split in batches using 4 worker processes. The columns of the
# untokenized dataset are removed from the result.
raw_column_names = babylm_dataset["train"].column_names
tokenized_babylm = babylm_dataset.map(
    preprocess_function,
    remove_columns=raw_column_names,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/1393 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/349 [00:00<?, ? examples/s]

In [8]:
# Regroup the tokenized splits into fixed-size blocks with group_texts (batched, 4 processes).
# NOTE: this rebinds `babylm_dataset`, which until now held the untokenized text splits.
# Re-running the tokenization cell after this one would therefore operate on the grouped
# dataset; re-create the raw splits first when iterating on the preprocessing.
babylm_dataset = tokenized_babylm.map(group_texts, batched=True, num_proc=4)
babylm_dataset

Map (num_proc=4):   0%|          | 0/1393 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/349 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 931
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 233
    })
})

In [9]:
# Batch collator handed to the Trainer; mlm=False turns off the masked-language-modeling option.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# BabyLM Default Hyperparameters
#
# How the training cell consumes these values:
#   LR         -> learning_rate
#   PATIENCE   -> early_stopping_patience of the EarlyStoppingCallback
#   BSZ        -> per-device batch size for both training and evaluation
#   EVAL_EVERY -> eval_steps and save_steps (evaluation and checkpointing share one interval)
#   MAX_EPOCHS -> num_train_epochs
#   SEED       -> intended as the random seed of the run

LR = 5e-5            # default: 5e-5
PATIENCE = 10        # default: 10
BSZ = 64             # default: 64
EVAL_EVERY = 200     # default: 200
MAX_EPOCHS = 10      # default: 10
SEED = 12            # default: 12

In [16]:
# Training arguments
# Evaluation and checkpointing use the same step interval (EVAL_EVERY), and the best
# checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="gpt2_babylm_clm",
    evaluation_strategy="steps",
    learning_rate=LR,
    per_device_train_batch_size=BSZ,
    per_device_eval_batch_size=BSZ,
    num_train_epochs=MAX_EPOCHS,
    load_best_model_at_end=True,
    eval_steps=EVAL_EVERY,
    save_steps=EVAL_EVERY,
    # SEED is declared with the other hyperparameters; pass it explicitly so the run
    # does not silently fall back to the library's default seed.
    seed=SEED,
)

# Start from the pretrained "gpt2" causal language model.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Early-stopping callback configured with a patience of PATIENCE.
early_stopping = EarlyStoppingCallback(early_stopping_patience=PATIENCE)

# Trainer: trains on the "train" split and evaluates on the "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=babylm_dataset["train"],
    eval_dataset=babylm_dataset["test"],
    callbacks=[early_stopping],
)

In [19]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
import math

# Evaluate on the Trainer's eval dataset and report perplexity as exp(eval_loss).
# NOTE(review): assumes eval_loss is the mean per-token cross-entropy — confirm.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 5.24


In [None]:
# Attach the tokenizer to the trainer, then upload to the Hugging Face Hub.
# NOTE(review): presumably the tokenizer is attached so its files are uploaded with the
# model — confirm. push_to_hub() needs Hub credentials, and the notebook_login() cell
# near the top of this notebook is commented out; verify how authentication is provided.
trainer.tokenizer = tokenizer
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/chantomkit/distilgpt2_babylm_clm-model/commit/4d8cb4ccaca01753c6434e0f93e2cd7f6feb2f66', commit_message='End of training', commit_description='', oid='4d8cb4ccaca01753c6434e0f93e2cd7f6feb2f66', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
"""
Testing the finetuned model
"""

# Seed text that is tokenized and passed to model.generate() in the following cells.
prompt = "Jason is going to the grocery store"

In [None]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("chantomkit/gpt2_babylm_clm")
# Make sure a padding token exists (falls back to the end-of-sequence token).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Encode the prompt and place the ids on whatever device the model lives on, rather than
# hard-coding 'cuda', so the cell also runs on a CPU-only runtime.
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

In [None]:
# from transformers import AutoModelForCausalLM

# model = AutoModelForCausalLM.from_pretrained("chantomkit/distilgpt2_babylm_clm-model")
# Earlier sampling configuration, kept for reference:
# outputs = model.generate(inputs, max_new_tokens=500, do_sample=True, top_k=10, top_p=0.7, temperature=0.5, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)

# Sampling settings for the continuation of the prompt.
sampling_options = dict(
    max_new_tokens=128,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.6,         # adjust the temperature to encourage diversity
    repetition_penalty=1.2,  # penalize repeated tokens
    pad_token_id=tokenizer.pad_token_id,
)
outputs = model.generate(inputs, **sampling_options)

In [None]:
# Turn the generated token ids back into text, leaving out special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Jason is going to the grocery store and she\'s gonna buy some food. But then, something strange happens! She hears a knock at her door that says "Hello!" And it sounds like someone was trying on his clothes."\nAs they enter the kitchen, Emily realizes how much trouble this could be for them if their parents didn\'t know about him before he left home in May of last year…and what kind-hearted person would want anything from such an unexpected event? They\'re so confused right now because no one knows who or where Mr Johnson really lives anymore. So when Mrs. Smith comes back with news that there are still people missing out on important things (she']

In [None]:
# Write the tokenizer and the model side by side into ./model_folder.
tokenizer.save_pretrained("./model_folder")
model.save_pretrained("./model_folder")

In [None]:
import locale
# Override locale.getpreferredencoding so that it always reports "UTF-8".
# NOTE(review): presumably a workaround for Colab refusing to run shell commands under a
# non-UTF-8 locale — confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

# Switch to /content and archive the saved model folder as model_folder.zip.
%cd /content/
!zip -r model_folder.zip model_folder/

/content
updating: model_folder/ (stored 0%)
updating: model_folder/config.json (deflated 51%)
updating: model_folder/vocab.json (deflated 59%)
updating: model_folder/model.safetensors (deflated 7%)
updating: model_folder/tokenizer_config.json (deflated 54%)
updating: model_folder/special_tokens_map.json (deflated 60%)
updating: model_folder/merges.txt (deflated 53%)
updating: model_folder/tokenizer.json (deflated 72%)
updating: model_folder/generation_config.json (deflated 24%)
