In [1]:
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

train = json.load(open('../data/train.json'))
val = json.load(open('../data/val.json'))
test = json.load(open('../data/test.json'))

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [4]:
def create_text_from_summary_and_dialogue(summary, dialogue):
    text = f"""
A partial summary of the conversation is:
{summary}

With the dialogue being:
{dialogue}
    """.strip()
    
    return text

def create_dataset_from_list(list):
    input_ids = []
    labels = []
    for item in list:
        text = create_text_from_summary_and_dialogue(item["summary"], item["dialogue"])
        ids = tokenizer(text, return_tensors='pt', max_length=512, padding="max_length", truncation=True).input_ids
        input_ids.append(ids)
        labels.append(ids)
    
    return Dataset.from_dict({"input_ids": input_ids, "labels": labels})

In [5]:
train_set = create_dataset_from_list(train)
val_set = create_dataset_from_list(val)
test_set = create_dataset_from_list(test)

In [6]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir='./',
  group_by_length=True,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=8,
  fp16=True,
  save_steps=200,
  eval_steps=200,
  logging_steps=200,
  learning_rate=3e-5,
  weight_decay=0.005,
  warmup_steps=600,
  save_total_limit=2,
  load_best_model_at_end=True,
)

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer=tokenizer,
)

Using amp half precision backend


In [9]:
trainer.train()

***** Running training *****
  Num examples = 14732
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 14736
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [34,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [34,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [34,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [34,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [34,0,0], thread: [68,0,0] Assertion `s

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`