In [26]:
import torch

In [27]:
# device = "cuda:1" if torch.cuda.is_available() else "cpu"

# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'


In [28]:
!pip install accelerate -U
!pip install -U datasets



In [29]:
import transformers
import datasets
from datasets import load_dataset

In [30]:
train_dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="train")

In [31]:
val_dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="validation")

In [32]:
test_dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="test")

In [33]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, AutoConfig

In [34]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [35]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
)

In [36]:
# config = AutoConfig.from_pretrained(
#     'gpt2',
#     vocab_size=len(tokenizer),
#     n_positions=1024,
#     n_ctx=1024,
#     n_embd=768,
#     n_layer=12,
#     n_head=12,
# )

In [37]:
model = GPT2LMHeadModel(config)

In [38]:
def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples["text"],
                                 truncation=True,
                                 max_length=128,
                                 return_overflowing_tokens=False,
                                 return_length=True)

    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()


    return tokenized_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)



In [39]:
tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'length', 'attention_mask', 'labels'],
    num_rows: 36718
})

In [40]:
def format_data(batch):
    all_tokens = sum(batch['input_ids'], [])

    num_full_chunks = len(all_tokens) // 128

    formatted_data = [all_tokens[i * 128: (i + 1) * 128] for i in range(num_full_chunks)]

    return {'input_ids': formatted_data}

formatted_train_dataset = tokenized_train_dataset.map(format_data, batched=True, remove_columns=tokenized_train_dataset.column_names)
formatted_val_dataset = tokenized_val_dataset.map(format_data, batched=True, remove_columns=tokenized_val_dataset.column_names)
formatted_test_dataset = tokenized_test_dataset.map(format_data, batched=True, remove_columns=tokenized_test_dataset.column_names)


In [41]:
from transformers import DataCollatorForLanguageModeling

In [42]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [43]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    learning_rate=1e-3,
    fp16=True,
    prediction_loss_only=True,
)

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_test_dataset,
    tokenizer=tokenizer,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [45]:
trainer.train()

Step,Training Loss
500,6.565
1000,5.2849


Checkpoint destination directory ./results/checkpoint-800 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1278, training_loss=5.671711857516628, metrics={'train_runtime': 106.1484, 'train_samples_per_second': 384.763, 'train_steps_per_second': 12.04, 'total_flos': 2667922292736000.0, 'train_loss': 5.671711857516628, 'epoch': 3.0})

In [46]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 235.90


In [21]:
new_model = "244-final"

trainer.save_model(new_model)

In [23]:
!pip install huggingface_hub
# #!huggingface-cli login



In [24]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.