# Training

In [None]:
!pip install -q transformers==4.28.0 datasets rouge_score

In [None]:
from datasets import load_dataset

In [None]:
# Paths of the cleaned BillSum JSON Lines files under the Kaggle input directory.
# The US test file is the one registered as the "validation" split.
json_data_files = dict(
    train="/kaggle/input/billsum-clean/us_train_clean.jsonl",
    test="/kaggle/input/billsum-clean/ca_test_clean.jsonl",
    validation="/kaggle/input/billsum-clean/us_test_clean.jsonl",
)

# Load all three files with the generic "json" builder; splits are looked up by name.
billsum = load_dataset("json", data_files=json_data_files)

In [None]:
# Pick out the two splits that are used for fine-tuning.
train_dataset, valid_dataset = billsum['train'], billsum['validation']

In [None]:
# Load the pretrained checkpoint together with its tokenizer.
# `tokenizer` is used by the preprocessing, metric, trainer and evaluation
# cells of this notebook but was never created, so it is loaded here from the
# same checkpoint as `model`.
# NOTE(review): confirm this tokenizer's vocabulary matches the seq2seq model
# that is fine-tuned further down.
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("allenai/longformer-base-4096")
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

In [None]:
!pip install -q sumy

# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import nltk
nltk.download('punkt')

In [None]:
# Language name handed to sumy's Tokenizer, Stemmer and stop-word lookup.
LANGUAGE = "english"
# Number of sentences requested from the summarizer on each call.
SENTENCES_COUNT = 5

def extractive(text):
    """Return the sentences TextRank selects from *text*, joined by single spaces.

    The text is parsed as plain text with the ``LANGUAGE`` tokenizer and ranked
    by a stemmer-backed summarizer configured with the ``LANGUAGE`` stop words;
    ``SENTENCES_COUNT`` sentences are requested. An empty string is returned
    when the summarizer yields nothing.
    """
    document = PlaintextParser.from_string(text, Tokenizer(LANGUAGE)).document

    ranker = Summarizer(Stemmer(LANGUAGE))
    ranker.stop_words = get_stop_words(LANGUAGE)

    picked = [str(sentence) for sentence in ranker(document, SENTENCES_COUNT)]
    return " ".join(picked)

import re

def extractive_clean_text(text):
    """Summarize *text* one section at a time and return the combined result.

    A section starts at each ``<SECTION-HEADER>`` marker and runs up to the
    next marker (or to the end of the text). Every section is passed through
    ``extractive`` and the non-empty results are joined with a single space,
    so the last word of one section is never fused with the first word of
    the next.

    Text that precedes the first marker is not summarized. When *text*
    contains no marker at all, the whole text is summarized as a single
    section instead of producing an empty model input; empty or
    whitespace-only text yields ``""``.
    """
    marker = '<SECTION-HEADER>'
    starts = [m.start() for m in re.finditer(re.escape(marker), text)]

    if not starts:
        # No section structure to split on: fall back to the full text.
        return extractive(text) if text.strip() else ""

    # Pair each section start with the start of the following section; the
    # final section is closed by the end of the text.
    bounds = starts + [len(text)]
    summaries = (extractive(text[begin:end]) for begin, end in zip(bounds, bounds[1:]))
    return " ".join(summary for summary in summaries if summary)

In [None]:
# length_token = []
# for i in range(18949):
#     output = extractive_clean_text(train_dataset[i]['clean_text'])
#     inputs = tokenizer(output)
#     length_token.append(len(inputs.input_ids))

# import matplotlib.pyplot as plt

# data = sorted(length_token)
# plt.plot(data)
# plt.xlabel('Index')
# plt.ylabel('Value')
# plt.title('Biểu đồ đường')
# plt.show()

In [None]:
!pip install -q evaluate

import evaluate

# ROUGE metric object; its `compute` method scores both the validation
# predictions (in compute_metrics) and the final test-set summaries.
rouge = evaluate.load("rouge")

import numpy as np

def compute_metrics(eval_pred):
    """Compute rounded ROUGE scores and mean generated length for one evaluation.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        A dict of the values returned by ``rouge.compute`` plus ``"gen_len"``
        (mean count of non-padding tokens per prediction), each rounded to
        four decimal places.
    """
    predictions, labels = eval_pred
    # NOTE(review): some models hand back a tuple whose first element holds
    # the generated ids; keep only that element when it happens.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an "ignore" marker rather than a vocabulary id, so it cannot be
    # decoded. Map it to the pad token in the predictions as well as in the
    # labels before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction measured in non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Length (in tokens) that tokenized model inputs are padded/truncated to.
max_input_length = 2048
# Length (in tokens) that tokenized target summaries are padded/truncated to.
max_output_length = 512
# Used as the `map` batch size and as the per-device train/eval batch size.
batch_size = 4

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize one batch of examples into model inputs and labels.

    Reads ``batch["clean_text"]`` and ``batch["clean_summary"]`` and adds
    ``input_ids``, ``attention_mask``, ``global_attention_mask`` and
    ``labels`` to the batch, which is then returned.

    The inputs are the section-wise extractive summaries of the clean text,
    padded/truncated to ``max_input_length``; the labels are the clean
    summaries, padded/truncated to ``max_output_length``, with every pad
    token id replaced by -100.
    """
    # Tokenize the extractive summaries (inputs) and the reference summaries (labels).
    inputs = tokenizer(
        [extractive_clean_text(doc) for doc in batch["clean_text"]],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["clean_summary"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample and nowhere else.
    # Each row is built as its own list, so the result does not depend on
    # several rows aliasing one shared list, and an empty batch simply
    # produces an empty mask.
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in inputs.input_ids
    ]

    # Replace pad token ids in the labels with -100.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in label_ids]
        for label_ids in outputs.input_ids
    ]

    return batch

In [None]:
# Raw columns dropped from both splits once a batch has been tokenized.
_raw_columns = (
    "bill_id", "text", "summary", "title", "text_len", "sum_len",
    "clean_text", "clean_summary", "clean_title",
)

# Tokenize the training split, then the validation split, with the same options.
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=list(_raw_columns),
)
valid_dataset = valid_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=list(_raw_columns),
)

In [None]:
# Return the four model-facing columns as torch tensors on both splits.
_model_columns = ("input_ids", "attention_mask", "global_attention_mask", "labels")
for _split in (train_dataset, valid_dataset):
    _split.set_format(type="torch", columns=list(_model_columns))

In [None]:
# Write the tokenized splits to JSON Lines files in the working directory.
for _split, _target in (
    (train_dataset, "token_train_dataset.jsonl"),
    (valid_dataset, "token_valid_dataset.jsonl"),
):
    _split.to_json(_target)

In [None]:
# Replace the in-memory splits with the ones stored in a Kaggle dataset.
# NOTE(review): presumably these are tokenized splits saved by an earlier run
# of the cells above - confirm against the dataset contents.
json_data_files = dict(
    train="/kaggle/input/tokenlongformertextrank8/token_train_dataset_8.jsonl",
    validation="/kaggle/input/tokenlongformertextrank8/token_valid_dataset_8.jsonl",
)

billsum = load_dataset("json", data_files=json_data_files)
train_dataset, valid_dataset = billsum['train'], billsum['validation']

In [None]:
# train_dataset = train_dataset.select(range(10))
# valid_dataset = valid_dataset.select(range(4))

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the encoder-decoder model that is fine-tuned for summarization.
# "allenai/longformer-base-4096" is an encoder-only Longformer checkpoint,
# which AutoModelForSeq2SeqLM cannot instantiate; the Longformer
# Encoder-Decoder (LED) checkpoint is the seq2seq counterpart and accepts the
# `global_attention_mask` produced by the preprocessing in this notebook.
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384")

In [None]:
# Generation hyperparameters, stored on the model config.
for _option, _value in (
    ("num_beams", 4),
    ("max_length", 512),
    ("min_length", 100),
    ("length_penalty", 2.0),
    ("early_stopping", True),
    ("no_repeat_ngram_size", 3),
):
    setattr(led.config, _option, _value)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Training configuration: one epoch with fp16 enabled, evaluating and
# checkpointing at the end of every epoch.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints go, and how many are kept.
    output_dir="/kaggle/working/",
    save_strategy="epoch",
    save_total_limit=1,

    # Schedule and evaluation; predictions are produced with generate().
    num_train_epochs=1,
    evaluation_strategy="epoch",
    predict_with_generate=True,

    # Batch sizes; gradients are accumulated over two steps per update.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,

    # Precision / memory options (alternatives left disabled).
    fp16=True,
    # gradient_checkpointing=True,
    # optim="adafactor",

    # Do not report to any logging integration.
    report_to="none",
)

In [None]:
# Wire the model, tokenizer, data splits and ROUGE callback into the trainer.
trainer = Seq2SeqTrainer(
    model=led,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured in `training_args`.
trainer.train()

In [None]:
# trainer.save_model("/kaggle/working/")

# Evaluation

In [None]:
import torch

# Load only the test file (the CA split of the cleaned BillSum data) for final scoring.
json_data_files = dict(test="/kaggle/input/billsum-clean/ca_test_clean.jsonl")
billsum = load_dataset("json", data_files=json_data_files)
test_dataset = billsum['test']
# test_dataset = test_dataset.select(range(4))

def generate_answer(batch):
    """Generate a summary for every document in *batch*.

    Each ``batch["clean_text"]`` entry is reduced with
    ``extractive_clean_text``, tokenized to ``max_input_length`` and passed to
    ``led.generate`` with global attention on the first token. The decoded
    summaries are stored under ``batch["predicted_summary"]`` and the batch is
    returned.
    """
    # Follow whatever device the model lives on instead of assuming CUDA.
    device = led.device

    inputs_dict = tokenizer(
        [extractive_clean_text(doc) for doc in batch["clean_text"]],
        padding="max_length",
        max_length=max_input_length,
        return_tensors="pt",
        truncation=True,
    )
    input_ids = inputs_dict.input_ids.to(device)
    attention_mask = inputs_dict.attention_mask.to(device)

    # Put global attention on the first token of every sequence.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = led.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
    )
    batch["predicted_summary"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    return batch

import pprint

# Generate a summary for every test document, eight documents per batch.
test_dataset = test_dataset.map(generate_answer, batched=True, batch_size=8)

# Score the generated summaries against the reference summaries with ROUGE.
result = rouge.compute(
    predictions=test_dataset['predicted_summary'],
    references=test_dataset['clean_summary'],
    use_stemmer=True,
)
pprint.pprint({metric: round(score, 4) for metric, score in result.items()})