In [8]:
# 뉴스 요약 데이터셑 불러오기

import numpy as np
from datasets import load_dataset

news = load_dataset("argilla/news-summary", split="test")
df = news.to_pandas().sample(5000, random_state=42)[["text", "prediction"]]
df["text"] = "summarize: " + df["text"]
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])
train, valid, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

print(f"Source News: {train.text.values[0]}")
print(f"Target Summary: {train.prediction.iloc[0]}")

print(f"Training Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")

Source News: summarize: DANANG, Vietnam (Reuters) - Russian President Vladimir Putin said on Saturday he had a normal dialogue with U.S. leader Donald Trump at a summit in Vietnam, and described Trump as civil, well-educated, and comfortable to deal with. Putin said that a mooted bilateral sit-down meeting with Trump did not happen at the Asia-Pacific Economic Cooperation summit, citing scheduling issues on both sides and unspecified protocol issues. Putin, at a briefing for reporters at the end of the summit, said there was still a need for further U.S.-Russia contacts, both at the level of heads of state and their officials, to discuss issues including security and economic development.   
Target Summary: Putin says had useful interaction with Trump at Vietnam summit
Training Data Size : 3000
Validation Data Size : 1000
Testing Data Size : 1000


  return bound(*args, **kwds)


In [9]:
# 뉴스 요약 데이터세트 전처리

import torch
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

def make_dataset(data, tokenizer, device):
    source = tokenizer(
        text=data.text.tolist(),
        padding="max_length",
        max_length=128,
        pad_to_max_length=True,
        truncation=True,
        return_tensors="pt"
    )

    target = tokenizer(
        text=data.prediction.tolist(),
        padding="max_length",
        max_length=128,
        pad_to_max_length=True,
        truncation=True,
        return_tensors="pt"
    )

    source_ids = source["input_ids"].squeeze().to(device)
    source_mask = source["attention_mask"].squeeze().to(device)
    target_ids = target["input_ids"].squeeze().to(device)
    target_mask = target["attention_mask"].squeeze().to(device)
    return TensorDataset(source_ids, source_mask, target_ids, target_mask)

def get_datalodader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader

epochs = 5
batch_size = 8
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path="t5-base"
)


train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

print(next(iter(train_dataloader)))
print(tokenizer.convert_ids_to_tokens(21603))
print(tokenizer.convert_ids_to_tokens(10))

[tensor([[21603,    10,    41,  ...,     9,  2493,     1],
        [21603,    10,   549,  ...,     6,   974,     1],
        [21603,    10,   549,  ...,  1041,   224,     1],
        ...,
        [21603,    10, 24586,  ...,     0,     0,     0],
        [21603,    10,     3,  ..., 27409,     6,     1],
        [21603,    10,  7109,  ...,     0,     0,     0]], device='cuda:0'), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), tensor([[   86,  6019,    12,  ...,     0,     0,     0],
        [20792, 21077,   283,  ...,     0,     0,     0],
        [24463,    10,  4534,  ...,     0,     0,     0],
        ...,
        [ 9995, 24326,     7,  ...,     0,     0,     0],
        [21409,  8263,  2493,  ...,     0,     0,     0],
        [ 4263, 19791,   342,  ...,     0,     0,     0]], device='cuda:0'), ten

In [10]:
# T5 모델 선언
from torch import optim
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="t5-base"
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [11]:
#  모델 구조 출력
for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └", sssub_name)

shared
encoder
└ embed_tokens
└ block
│  └ 0
│  │  └ layer
│  └ 1
│  │  └ layer
│  └ 2
│  │  └ layer
│  └ 3
│  │  └ layer
│  └ 4
│  │  └ layer
│  └ 5
│  │  └ layer
│  └ 6
│  │  └ layer
│  └ 7
│  │  └ layer
│  └ 8
│  │  └ layer
│  └ 9
│  │  └ layer
│  └ 10
│  │  └ layer
│  └ 11
│  │  └ layer
└ final_layer_norm
└ dropout
decoder
└ embed_tokens
└ block
│  └ 0
│  │  └ layer
│  └ 1
│  │  └ layer
│  └ 2
│  │  └ layer
│  └ 3
│  │  └ layer
│  └ 4
│  │  └ layer
│  └ 5
│  │  └ layer
│  └ 6
│  │  └ layer
│  └ 7
│  │  └ layer
│  └ 8
│  │  └ layer
│  └ 9
│  │  └ layer
│  └ 10
│  │  └ layer
│  └ 11
│  │  └ layer
└ final_layer_norm
└ dropout
lm_head


In [12]:
# 모델 학습 및 평가

import numpy as np
from torch import nn


def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for source_ids, source_mask, target_ids, target_mask in dataloader:
        decoder_input_ids = target_ids[:, :-1].contiguous()
        labels = target_ids[:, 1:].clone().detach()
        labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )

        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss


def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        val_loss = 0.0

        for source_ids, source_mask, target_ids, target_mask in dataloader:
            decoder_input_ids = target_ids[:, :-1].contiguous()
            labels = target_ids[:, 1:].clone().detach()
            labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

            outputs = model(
                input_ids=source_ids,
                attention_mask=source_mask,
                decoder_input_ids=decoder_input_ids,
                labels=labels,
            )

            loss = outputs.loss
            val_loss += loss.item()

    val_loss = val_loss / len(dataloader)
    return val_loss


best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "./models/T5ForConditionalGeneration.pt")
        print("Saved the model weights")

Epoch 1: Train Loss: 4.1245 Val Loss: 2.7325
Saved the model weights
Epoch 2: Train Loss: 2.8438 Val Loss: 2.4025
Saved the model weights
Epoch 3: Train Loss: 2.5413 Val Loss: 2.2528
Saved the model weights
Epoch 4: Train Loss: 2.3658 Val Loss: 2.1683
Saved the model weights
Epoch 5: Train Loss: 2.2361 Val Loss: 2.1131
Saved the model weights


In [13]:
# 생성모델 테스트

model.eval()
with torch.no_grad():
    for source_ids, source_mask, target_ids, target_mask in test_dataloader:
        generated_ids = model.generate(
            input_ids=source_ids,
            attention_mask=source_mask,
            max_length=128,
            num_beams=3,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
        )

        for generated, target in zip(generated_ids, target_ids):
            pred = tokenizer.decode(
                generated, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            actual = tokenizer.decode(
                target, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            print("Generated Headline Text:", pred)
            print("Actual Headline Text   :", actual)
        break

Generated Headline Text: Clinton leads Trump by 4 percentage points in a four-war race for Nov. 8 election
Actual Headline Text   : Clinton leads Trump by 4 points in Washington Post: ABC News poll
Generated Headline Text: senators question Gorsuch's independence in light of Trump travel ban
Actual Headline Text   : Democrats question independence of Trump Supreme Court nominee
Generated Headline Text: u.S. warns Saudi Arabia over Yemen humanitarian situation could constrain U.S. aid
Actual Headline Text   : In push for Yemen aid, U.S. warned Saudis of threats in Congress
Generated Headline Text: Romanian anti-corruption prosecutors open investigation into Social Democrat party leader Liviu Dragnea
Actual Headline Text   : Romanian ruling party leader investigated over 'criminal group'
Generated Headline Text: environmental activist endorses Hillary Clinton for U.S. president a day after she secured Democratic nomination
Actual Headline Text   : Billionaire environmental activist Tom S