In [1]:

import argparse
import torch

import transformers

import datetime
import easydict
import itertools
import json
import matplotlib
import pathlib
import pprint
import re

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from operator import itemgetter
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing import List, Dict
import os

In [13]:
config = easydict.EasyDict({

    'gpu_id': 3,
    'train_data_path': '../datasets/integrated_pre_datasets/train_data.tsv',
    'valid_data_path': '../datasets/integrated_pre_datasets/valid_data.tsv',
    'test_data_path': '../datasets/integrated_pre_datasets/test_data.tsv',
    'pretrained_model_name': 'gogamza/kobart-base-v1',
    'ckpt': "ckpt",
    'logs': "logs",
    'batch_size_per_device': 8, # 8이 최선
    'gradient_accumulation_steps': 16,
    'lr': 5e-5,
    'weight_decay': 1e-2,
    'warmup_ratio': .2,
    'n_epochs': 10,
    'inp_max_len': 1024,
    'tar_max_len': 256,
    'model_fpath': "model_records/kobart-model.pth",
    'beam_size': 5,
    'var_len': False,
    "length_penalty": 0.8,
    "no_repeat_ngram_size": 3,
})

In [3]:
import torch

saved_data = torch.load(
    "/workspace/home/uglee/Projects/title_extraction/src/model_records/kobart-model.pth",
    map_location="cpu" if config.gpu_id < 0 else "cuda:%d" % config.gpu_id
)

In [4]:
bart_best = saved_data["model"]
train_config = saved_data["config"]
tokenizer = transformers.PreTrainedTokenizerFast.from_pretrained(train_config.pretrained_model_name)

## Load weights.
model = transformers.BartForConditionalGeneration.from_pretrained(train_config.pretrained_model_name)
model.load_state_dict(bart_best)

<All keys matched successfully>

In [9]:
from dataloaders.bart_dataloader import get_datasets
from dataloaders.bart_dataloader import TextAbstractSummarizationCollator

ts_ds = get_datasets(tokenizer, fpath=Path(config.test_data_path), mode="test")

ts_loader = torch.utils.data.DataLoader(
    ts_ds,
    batch_size=config.batch_size_per_device,
    shuffle=False,
    num_workers=0,
    collate_fn=TextAbstractSummarizationCollator(
        tokenizer=tokenizer,
        config=config,
        mode="test",
    ),
)

Tokenizing input texts: 100%|██████████| 3052/3052 [00:01<00:00, 1938.42it/s]


In [14]:
with torch.no_grad():
    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
    device = next(model.parameters()).device

    ## Don't forget turn-on evaluation mode.
    model.eval()

    outputs = []
    for mini_batch in tqdm(ts_loader, total=len(ts_loader)):
        id = mini_batch["id"]
        input_ids = mini_batch["input_ids"]
        attention_mask = mini_batch["attention_mask"]

        if config.var_len:
            ## Variable min, max length of target summaries.
            ## We know that summaries ~= text * 0.1.
            avg_len = int(input_ids.ne(tokenizer.pad_token_id).view(-1).sum() / input_ids.size(0))
            min_length = max(64,  int(avg_len * 0.05))
            max_length = min(256, int(avg_len * 0.15))
            ## And we don't need to set length penalty anymore.
            config.length_penalty = 1.0
        else:
            min_length = config.tar_max_len // 4 ## maybe, less then 64 (e.g. 48 or 32) can be more score..
            max_length = config.tar_max_len

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        ## Generate ids of summaries.
        output = model.generate(
            input_ids, 
            attention_mask=attention_mask,
            max_length=max_length,                  ## maximum summarization size
            min_length=min_length,                  ## minimum summarization size
            early_stopping=True,                    ## stop the beam search when at least 'num_beams' sentences are finished per batch
            num_beams=config.beam_size,             ## beam search size
            bos_token_id=tokenizer.bos_token_id,    ## <s> = 0
            eos_token_id=tokenizer.eos_token_id,    ## <\s> = 1
            pad_token_id=tokenizer.pad_token_id,    ## 3
            length_penalty=config.length_penalty,   ## value > 1.0 in order to encourage the model to produce longer sequences
            no_repeat_ngram_size=config.no_repeat_ngram_size,   ## same as 'trigram blocking'
        )
        ## If you want to decode by each sentence, you may 
        ## call 'decode' fn, not 'batch_decode'.
        output = tokenizer.batch_decode(
            output.tolist(), 
            skip_special_tokens=True,
        )

        ## Get all.
        outputs.extend([{"id": id_, "output": output_} for id_, output_ in zip(id, output)])

## Sort and extract.
outputs = sorted(
    outputs,
    key=itemgetter("id"),
    reverse=False,
)
outputs = [i["output"] for i in outputs]

100%|██████████| 382/382 [08:04<00:00,  1.27s/it]


In [15]:
print(output[0])

미래의 나의 모습  미래의 모습  현재의 모습  미래 도시에 대한 본인의 생각  미래의 나의 생각  현재의 도시에 대해 어떤 생각을 가지고 있는지  미래의 도시에 대해서 어떠한 생각을 가지고 있는지에 대한 나의 생각과 생각    나의 생각을 정리하여 미래에 대한 당신의 생각을 정리할 수 있는 기회가 될 수 있을 것이다  
