In [13]:
# Import standard libraries
import time
import numpy as np
import pdb
import torch
import os

# Import third party libraries
import evaluate
import transformers
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from transformers import GPTJForCausalLM, AutoTokenizer
from transformers.data import DataCollatorForLanguageModeling


# Directory (despite the "_FILE" suffix) that receives the fine-tuned model files.
# It can be overridden with the GPTJ_FINE_TUNED_FILE environment variable so the
# notebook is not tied to one machine's home directory; the default is unchanged.
GPTJ_FINE_TUNED_FILE = os.environ.get(
    "GPTJ_FINE_TUNED_FILE",
    "/home/jupyter/project/fine_tuned_models/gpt-j-6B",
)

In [2]:
print("Loading model")

# Load the pretrained GPT-J checkpoint with its weights held in float16.
# NOTE(review): the weights stay in half precision for everything that follows;
# confirm that training directly on float16 weights is intended.
model = GPTJForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B",
    torch_dtype=torch.float16,
)

# Use the end-of-sequence id as the padding id in the model config.
eos_token_id = model.config.eos_token_id
model.config.pad_token_id = eos_token_id

Downloading (…)/float16/config.json: 100%|██████████| 836/836 [00:00<00:00, 194kB/s]
Downloading pytorch_model.bin: 100%|██████████| 12.1G/12.1G [00:46<00:00, 261MB/s]


In [18]:
print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
# Pad with the end-of-sequence token, mirroring the pad id set on the model config.
tokenizer.pad_token = tokenizer.eos_token

print("Loading dataset")
# WikiText-103 (configuration "wikitext-103-v1") from the Hugging Face hub.
current_dataset = load_dataset(
    "wikitext",
    "wikitext-103-v1",
)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook's tokenizer.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` is a mapping
    whose ``"text"`` value is passed to the tokenizer as-is.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for that text, with truncation enabled
        and the ``"max_length"`` padding strategy applied.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length")


print("Splitting and tokenizing dataset")

# Tokenize every split in batches, using one worker process per CPU reported by the OS.
# NOTE(review): presumably some rows have empty "text" and turn into padding-only
# examples under padding="max_length" - confirm whether they should be filtered out.
worker_count = os.cpu_count()
tokenized_datasets = current_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=worker_count,
)

# Keep only the first 100 validation rows so evaluation stays small.
validation_split = tokenized_datasets["validation"]
small_eval_dataset = validation_split.select(range(100))

Loading tokenizer
Loading dataset


Downloading builder script: 100%|██████████| 8.48k/8.48k [00:00<00:00, 10.6MB/s]
Downloading metadata: 100%|██████████| 6.84k/6.84k [00:00<00:00, 10.7MB/s]
Downloading readme: 100%|██████████| 9.62k/9.62k [00:00<00:00, 14.2MB/s]


Downloading and preparing dataset wikitext/wikitext-103-v1 to /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data: 100%|██████████| 190M/190M [00:08<00:00, 21.7MB/s] 
                                                                                           

Dataset wikitext downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 174.22it/s]


Splitting and tokenizing dataset


                                                                                      

In [19]:
print("Preparing training arguments")

# training_args.log_level is left at its default ("passive"), so the verbosity
# is raised to INFO on the transformers logger directly.
transformers.utils.logging.set_verbosity_info()

training_args = TrainingArguments(
    output_dir=GPTJ_FINE_TUNED_FILE,
    # One sequence per device per forward pass, 32 passes accumulated per optimizer step.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    # The causal-LM data collator stores the targets under "labels". Listing
    # "input_ids"/"attention_mask" here made the Trainer hand model *inputs* to
    # compute_metrics as if they were labels.
    label_names=["labels"],
    num_train_epochs=1,
    report_to=["wandb"],
    logging_steps=1,
    # NOTE(review): eval_steps is only honoured when the evaluation strategy is
    # "steps"; none is set here, so no periodic evaluation appears to be
    # scheduled - confirm whether that is intended before enabling it.
    eval_steps=5000,
    run_name="custom_training",
    no_cuda=False,
)

# Accuracy metric from the `evaluate` library; used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            raw logits with a trailing vocabulary axis or token ids that were
            already arg-maxed; ``labels`` holds the target token ids. Either
            element may be a tuple, in which case its first entry is used.

    Returns:
        The result of ``metric.compute`` on the flattened predictions and
        references.
    """
    predictions, labels = eval_pred

    # A tuple arrives when the model returns several outputs or when several
    # label columns are configured; the first entry is taken in both cases.
    # NOTE(review): assumes the first label column holds the target token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Reduce logits to token ids; predictions that already match the rank of
    # the labels are treated as token ids and left alone.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # The output at position i predicts the token at position i + 1, so drop
    # the last prediction and the first label before comparing.
    predictions = predictions[..., :-1].reshape(-1)
    labels = labels[..., 1:].reshape(-1)

    # -100 marks positions excluded from the loss (e.g. padding); skip them.
    # The metric itself expects flat sequences of integer ids.
    keep = labels != -100
    return metric.compute(predictions=predictions[keep], references=labels[keep])

# Collator for language modelling with masked-LM mode switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, data, collator and metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


Preparing training arguments


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 5.95MB/s]


In [23]:
# Exploratory cell: list every attribute name available on the tokenizer object.
# NOTE(review): looks like leftover inspection; nothing else visible here uses the result.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_build_conversation_input_ids',
 '_call_one',
 '_cls_token',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_mana

In [20]:
# NOTE(review): no visible cell calls trainer.train(), and the execution counts
# go straight from building the Trainer (19) to this cell (20), so this appears
# to save the model exactly as it was loaded - confirm training runs first.

# Write the model weights and config to training_args.output_dir.
trainer.save_model()

# The Trainer was built without a tokenizer, so save_model() does not write one.
# Save it alongside the model so the directory can be reloaded on its own.
tokenizer.save_pretrained(trainer.args.output_dir)

Saving model checkpoint to /home/jupyter/project/fine_tuned_models/gpt-j-6B
Configuration saved in /home/jupyter/project/fine_tuned_models/gpt-j-6B/config.json
Configuration saved in /home/jupyter/project/fine_tuned_models/gpt-j-6B/generation_config.json
The model is bigger than the maximum size per checkpoint (10GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /home/jupyter/project/fine_tuned_models/gpt-j-6B/pytorch_model.bin.index.json.
