In [1]:
from google.colab import drive
drive.mount('/content/drive')

root_dir = '/content/drive/MyDrive/cs224n/project'

%cd $root_dir

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/cs224n/project


In [2]:
!pip install transformers



In [3]:
import torch
import transformers
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizer, Trainer, TrainingArguments, AutoModelForCausalLM
from torch.utils.data.dataset import Dataset

In [7]:
model = AutoModelForCausalLM.from_pretrained('gpt2')

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.eos_token = '<|endoftext|>'
if tokenizer.pad_token == None:
    tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


class LBLDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, debug=False):
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines()]
        pairs = [x+"\n"+y for x,y in zip(lines[::2],lines[1::2])]
        batch_encoding = tokenizer(pairs, add_special_tokens=True, padding=True, truncation=True, max_length=100)
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)

train_dataset = LBLDataset(tokenizer=tokenizer, file_path='train_debug.txt')

evaluation_dataset = LBLDataset(tokenizer=tokenizer, file_path='dev_debug.txt')

training_args = TrainingArguments(
    output_dir="./result", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=2,  # batch size for evaluation
    eval_steps = 1, # Number of update steps between two evaluations.
    save_steps=10, # after # steps model is saved 
    warmup_steps=10,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=evaluation_dataset,
)

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {


In [8]:
trainer.train()

***** Running training *****
  Num examples = 25
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 39


Step,Training Loss


Saving model checkpoint to ./result/checkpoint-10
Configuration saved in ./result/checkpoint-10/config.json
Model weights saved in ./result/checkpoint-10/pytorch_model.bin
Saving model checkpoint to ./result/checkpoint-20
Configuration saved in ./result/checkpoint-20/config.json
Model weights saved in ./result/checkpoint-20/pytorch_model.bin
Saving model checkpoint to ./result/checkpoint-30
Configuration saved in ./result/checkpoint-30/config.json
Model weights saved in ./result/checkpoint-30/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=39, training_loss=3.6770254281850963, metrics={'train_runtime': 200.9257, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.194, 'total_flos': 2373062400000.0, 'train_loss': 3.6770254281850963, 'epoch': 3.0})