In [1]:
from datasets import load_dataset

ds = load_dataset("nyu-mll/glue", "stsb", split="test")

In [5]:
import pandas as pd

test_dataset = pd.DataFrame(ds)
test_dataset.head()

Unnamed: 0,sentence1,sentence2,label,idx
0,A girl is styling her hair.,A girl is brushing her hair.,-1.0,0
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,-1.0,1
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,-1.0,2
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,-1.0,3
4,A man is playing a harp.,A man is playing a keyboard.,-1.0,4


In [6]:
test_dataset = test_dataset.drop(columns=["label", "idx"])
test_dataset = test_dataset.rename(columns={"sentence1" : "input",
                                            "sentence2" : "reference_sentence"})

test_dataset.head()

Unnamed: 0,input,reference_sentence
0,A girl is styling her hair.,A girl is brushing her hair.
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
3,A man is cutting up a cucumber.,A man is slicing a cucumber.
4,A man is playing a harp.,A man is playing a keyboard.


In [7]:
test_dataset.to_csv("../resource/result-base.csv", index=False)

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(len(tokenizer))

50257


In [4]:
tokenizer.add_special_tokens({"sep_token": "[SEP]"})
print(len(tokenizer))

50258


In [None]:
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer

import torch
import pytorch_lightning as pl


class STSBDataset(Dataset):
    def __init__(self, item, tokenizer, max_length):
        self.item = item
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.item)

    def __getitem__(self, idx):
        self.tokenizer.pad_token = "[PAD]"
        self.tokenizer.bos_token = "|beginoftext|"
        self.tokenizer.sep_token = "[SEP]"

        sep_token_id = self.tokenizer.convert_tokens_to_ids("[SEP]")
        pad_token_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
        bos_token_id = self.tokenizer.convert_tokens_to_ids("|beginoftext|")

        sentence1 = self.item[idx]["sentence1"]
        sentence2 = self.item[idx]["sentence2"]

        tokens = self.tokenizer(
            sentence1 + "[SEP]" + sentence2,
            max_length=self.max_length - 1,
            padding="max_length",
            truncation=True,
        )["input_ids"]

        input_ids = torch.tensor([bos_token_id] + tokens, dtype=torch.long)
        label_ids = torch.tensor(
            tokens + [self.tokenizer.eos_token_id], dtype=torch.long
        )

        return {"input_ids": input_ids, "label_ids": label_ids}


class STSBDataloader(pl.LightningDataModule):
    def __init__(self, model_name, batch_size, max_length):
        super().__init__()
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens(
            {"sep_token": "[SEP]", "pad_token": "[PAD]", "bos_token": "|beginoftext|"}
        )

    def setup(self, stage="fit"):

        train_dataset = load_dataset("nyu-mll/glue", "stsb", split="train")
        valid_dataset = load_dataset("nyu-mll/glue", "stsb", split="validation")
        test_dataset = load_dataset("nyu-mll/glue", "stsb", split="test")

        self.train_dataset = STSBDataset(train_dataset, self.tokenizer, self.max_length)
        self.valid_dataset = STSBDataset(valid_dataset, self.tokenizer, self.max_length)
        self.test_dataset = STSBDataset(test_dataset, self.tokenizer, self.max_length)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False
        )

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pytorch_lightning as pl
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch
from torch import nn
import evaluate
import bert_score
import pandas as pd


class HGModel(pl.LightningModule):
    def __init__(self, model_name, lr: float = 1e-5):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens(
            {"sep_token": "[SEP]", "pad_token": "[PAD]", "bos_token": "|beginoftext|"}
        )
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.resize_token_embeddings(len(self.tokenizer))

        self.bleu_scorer = evaluate.load("bleu")
        self.rougeL_scorer = evaluate.load("rouge")
        self.bert_scorer = bert_score.BERTScorer(lang="en", rescale_with_baseline=False)

        self.valid_preds, self.valid_targets = [], []
        self.test_preds, self.test_targets = [], []

    def forward(self, input_ids, labels=None):
        outputs = self.model(input_ids=input_ids, labels=labels)
        return {"logits": outputs.logits, "loss": outputs.loss}

    def training_step(self, batch, batch_idx):
        inputs, labels = batch["input_ids"], batch["label_ids"]
        outputs = self(inputs, labels=labels)
        loss = outputs["loss"]
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch["input_ids"], batch["label_ids"]
        outputs = self(inputs, labels=labels)
        loss = outputs["loss"]
        predicted_tokens = outputs["logits"].argmax(dim=-1)

        pred = self.tokenizer.batch_decode(predicted_tokens, skip_special_tokens=True)
        target = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        self.valid_preds.extend(pred)
        self.valid_targets.extend(target)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        self.valid_preds.clear()
        self.valid_targets.clear()

    def test_step(self, batch, batch_idx):
        inputs, labels = batch["input_ids"], batch["label_ids"]
        outputs = self(inputs, labels=labels)
        loss = outputs["loss"]
        predicted_tokens = outputs["logits"].argmax(dim=-1)

        generated_texts = self.model.generate(
            inputs, do_sample=True, max_length=512, top_k=10, top_p=0.9, num_return_sequences=1
        )
        pred_texts = self.tokenizer.batch_decode(generated_texts, skip_special_tokens=True)
        target_texts = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        self.test_preds.extend(pred_texts)
        self.test_targets.extend(target_texts)
        return loss

    def on_test_epoch_end(self):
        logging_dict = self.compute_metrics(self.test_preds, self.test_targets, mode="test")
        self.log_dict(logging_dict, on_epoch=True, prog_bar=True)
        self.result_output(self.test_preds)

    def result_output(self, test_preds, output_path="./resource/result.csv"):
        result_df = pd.DataFrame({"predicted_label": test_preds})
        result_df.to_csv(output_path, index=False)

    def configure_optimizers(self):
        return AdamW(self.model.parameters(), lr=self.hparams.lr)

    def compute_metrics(self, preds, targets, mode):
        references_bleu = [[t] for t in targets]
        bleu_scores = {
            f"{mode}_bleu{n}_score": self.bleu_scorer.compute(
                predictions=preds, references=references_bleu, max_order=n
            )["bleu"]
            for n in range(1, 5)
        }
        rouge_score = self.rougeL_scorer.compute(predictions=preds, references=targets)["rougeL"]
        _, _, bert_score_result = self.bert_scorer.score(preds, targets)

        return {
            **bleu_scores,
            f"{mode}_rouge_score": rouge_score,
            f"{mode}_bert_score": bert_score_result.mean().item(),
        }

In [5]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

from tutorial.gpt2.dataset import STSBDataloader
from tutorial.gpt2.model import Model

max_length = 512
learning_rate = 5e-5
max_epoch = 30
batch_size = 16
model_name = "gpt2"
seed = 42
gpu_id = "1,"

pl.seed_everything(seed)

dataloader = STSBDataloader(model_name, batch_size, max_length)
dataloader.setup()

earlystopping_callback = EarlyStopping(monitor="val_loss", patience=3)

model = HGModel(model_name, lr=learning_rate)

trainer = pl.Trainer(
    accelerator="gpu",
    devices=gpu_id,
    max_epochs=max_epoch,
    num_sanity_val_steps=0,
    callbacks=[earlystopping_callback],
)

trainer.fit(model=model, datamodule=dataloader)

model.eval()
trainer.test(model=model, datamodule=dataloader)


Seed set to 42


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type            | Params | Mode
-------------------------------------------------
0 | model | GPT2LMHeadModel | 124 M  | eval
-------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.768   Total estimated model params size (MB)
0         Modules in train mode
164       Modules in eval mode


Epoch 7: 100%|██████████| 360/360 [04:35<00:00,  1.31it/s, v_num=3, train_loss_step=0.118, val_loss=0.311, train_loss_epoch=0.163]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing DataLoader 0:   0%|          | 0/1379 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ValueError: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.