In [1]:
import os
import numpy as np
import pandas as pd
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
import torch

In [2]:
from transformers import LEDTokenizer, LongformerTokenizer, LEDForConditionalGeneration
import torch
from transformers import TrainingArguments, Trainer

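### Option: Initialize Model (start from the pretrained `allenai/longformer-base-4096` weights; run this or "Load Model", not both)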

In [None]:
# Build the LED seq2seq model and its tokenizer from the public base
# checkpoint (alternative to loading a fine-tuned checkpoint from disk).
# NOTE(review): the checkpoint name is a Longformer checkpoint while the class
# is LED; presumably any weights that do not match are freshly initialised --
# confirm against the warnings printed by from_pretrained.
# NOTE(review): the two extra keyword arguments are presumably applied as
# config overrides. LEDConfig documents `max_decoder_position_embeddings` for
# the decoder limit -- verify that `max_position_embeddings` has the intended
# effect.
generate_model = LEDForConditionalGeneration.from_pretrained(
    "allenai/longformer-base-4096",
    max_encoder_position_embeddings=2048,
    max_position_embeddings=1536,
)
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [None]:
# Show the model's configuration object as the cell output (sanity check of
# the settings the model was actually built with).
generate_model.config

In [None]:
# Show the tokenizer's padding token id; the Dataset class in this notebook
# compares label ids against this value when building its labels.
tokenizer.pad_token_id

### Option: Load Model (continue from a saved fine-tuning checkpoint; run this or "Initialize Model", not both)

In [3]:
# Directory of a previously saved checkpoint to load the model weights from.
model_dir = "./output17/checkpoint-9000/"

# Model weights/config come from the local checkpoint directory ...
generate_model = LEDForConditionalGeneration.from_pretrained(model_dir) 
# ... while the tokenizer is loaded by name from the base checkpoint rather
# than from model_dir (presumably the checkpoint directory holds no tokenizer
# files -- verify).
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with masked target ids.

    Every target id equal to the padding id is replaced by -100 when the
    dataset is built. (-100 is the conventional "ignore" label value for
    PyTorch / Hugging Face seq2seq losses -- confirm against the model docs.)
    """

    def __init__(self, encodings, labels, pad_token_id=None):
        """Store the input encodings and build the masked label lists.

        Args:
            encodings: object exposing ``input_ids`` and ``attention_mask``,
                each indexable by example position.
            labels: object exposing ``input_ids``, an iterable of per-example
                sequences of target token ids.
            pad_token_id: id to mask out of the labels. When ``None`` (the
                default, matching the original behaviour) the id is read from
                the notebook-level ``tokenizer``.
        """
        if pad_token_id is None:
            # Backward-compatible fallback to the global tokenizer; read once
            # here instead of once per token.
            pad_token_id = tokenizer.pad_token_id

        self.input_ids = encodings.input_ids
        self.attention_mask = encodings.attention_mask
        # Replace padding positions so they can be excluded from the loss.
        self.labels = [
            [-100 if token_id == pad_token_id else token_id for token_id in label]
            for label in labels.input_ids
        ]

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of model inputs.

        ``input_ids`` and ``attention_mask`` are returned exactly as stored;
        ``labels`` is converted to a tensor on access.
        """
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Return the number of examples (length of ``input_ids``)."""
        return len(self.input_ids)

In [5]:
# Load the pre-built training and evaluation datasets from disk.
# NOTE(review): torch.load unpickles the file contents, so only load files
# from a trusted source. Presumably these are pickled instances of the
# Dataset class defined in this notebook; if so, that class must be defined
# before this cell runs -- verify.
# NOTE(review): recent PyTorch releases changed torch.load's `weights_only`
# default; confirm these files still load on the installed version.
train_dataset = torch.load("train_dataset_b.pth")
test_dataset = torch.load("test_dataset.pth")

In [6]:
# Training hyperparameters: batches of 3 per device for both training and
# evaluation, 2 epochs, checkpoints written under "output" with save_steps
# set to 6750.
args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=2,
    save_steps=6750,
)

# Bind the model, the hyperparameters and both datasets into one Trainer.
# NOTE(review): an eval_dataset is supplied but no evaluation schedule is set
# in `args`; presumably evaluation only runs when triggered explicitly --
# confirm against the TrainingArguments defaults.
trainer = Trainer(
    model=generate_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run the training loop with the Trainer configured above.
# NOTE(review): no `resume_from_checkpoint` is passed, so presumably the
# optimizer/scheduler state and step counter start fresh even when the model
# weights were loaded from an earlier checkpoint -- confirm this is intended.
trainer.train()

***** Running training *****
  Num examples = 47722
  Num Epochs = 2
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 31816


Step,Training Loss
500,2.0994
1000,2.1341
1500,2.0458
2000,2.0692
2500,2.0514
3000,2.0309
3500,2.0283
4000,2.0439
4500,2.0012
5000,2.0535


Saving model checkpoint to output18/checkpoint-6750
Configuration saved in output18/checkpoint-6750/config.json
Model weights saved in output18/checkpoint-6750/pytorch_model.bin
Saving model checkpoint to output18/checkpoint-13500
Configuration saved in output18/checkpoint-13500/config.json
Model weights saved in output18/checkpoint-13500/pytorch_model.bin
