In [1]:
# !pip install transformers datasets rouge_score
import logging
import os
import re
import sys
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, load_metric
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBartTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import is_main_process

DATA_PATH = "data/"

In [2]:
from datasets import load_dataset

cnn_train = load_dataset("cnn_dailymail", "3.0.0", split="train")
cnn_validation = load_dataset("cnn_dailymail", "3.0.0", split="validation")
# cnn_validation = pd.DataFrame(cnn_validation)
# cnn_validation.to_csv('cnn_validation.csv')

Reusing dataset cnn_dailymail (C:\Users\Geoff\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (C:\Users\Geoff\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


In [3]:
def evaluate_summary(reference, summary):
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
    scores = scorer.score(reference, summary)
    return scores["rouge1"]


summarization_name_mapping = {
    "amazon_reviews_multi": ("review_body", "review_title"),
    "big_patent": ("description", "abstract"),
    "cnn_dailymail": ("article", "highlights"),
    "orange_sum": ("text", "summary"),
    "pn_summary": ("article", "summary"),
    "psc": ("extract_text", "summary_text"),
    "samsum": ("dialogue", "summary"),
    "thaisum": ("body", "summary"),
    "xglue": ("news_body", "news_title"),
    "xsum": ("document", "summary"),
    "wiki_summary": ("article", "highlights"),
}

In [4]:
MODEL_NAME = "facebook/bart-base"
config = AutoConfig.from_pretrained(
    MODEL_NAME, cache_dir=None, revision="main", use_auth_token=False,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, cache_dir=None, use_fast=True, revision="main", use_auth_token=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    from_tf=bool(".ckpt" in MODEL_NAME),
    config=None,
    cache_dir=None,
    revision="main",
    use_auth_token=False,
)

In [5]:
# Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(
    tokenizer, MBartTokenizer
):
    model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
        data_args.target_lang
    ]
if model.config.decoder_start_token_id is None:
    raise ValueError(
        "Make sure that `config.decoder_start_token_id` is correctly defined"
    )

# Get the default prefix if None is passed.
task_specific_params = model.config.task_specific_params
prefix = task_specific_params.get("prefix", "")
# Preprocessing the datasets.
# We need to tokenize inputs and targets.
column_names = cnn_train.column_names

text_column, summary_column = "article", "highlights"

# Temporarily set max_target_length for training.
max_target_length = 128
max_source_length = 1024
padding = False

In [6]:
def preprocess_function(examples):

    inputs = examples[text_column]
    targets = examples[summary_column]
    inputs = [prefix + inp for inp in inputs]

    # Tokenize Input
    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
train_dataset = cnn_train.select(range(10000))

In [8]:
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=None,
    remove_columns=column_names,
    load_from_cache_file=False,
)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [9]:
# max_target_length = data_args.val_max_target_length
# eval_dataset = datasets["validation"]
# if data_args.max_val_samples is not None:
#     eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
# eval_dataset = eval_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=data_args.preprocessing_num_workers,
#     remove_columns=column_names,
#     load_from_cache_file=not data_args.overwrite_cache,
# )

# Data collator
label_pad_token_id = -100

# Metric
metric_name = "rouge"
metric = load_metric(metric_name)


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    if metric_name == "sacrebleu":
        decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Extract a few results from ROUGE
    if metric_name == "rouge":
        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    else:
        result = {"bleu": result["score"]}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result


data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id)

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="exp/bart/results",
    do_train=True,
    do_eval=False,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_dir="exp/bart/logs",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
)


# Initialize our Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=None,  # Add Eval DataSet
    tokenizer=tokenizer,
    data_collator=data_collator,
        compute_metrics=compute_metrics
)

In [10]:
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

***** Running training *****
  Num examples = 10000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 1250


Step,Training Loss,Validation Loss


Saving model checkpoint to exp/bart/results\checkpoint-500
Configuration saved in exp/bart/results\checkpoint-500\config.json
Model weights saved in exp/bart/results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in exp/bart/results\checkpoint-500\tokenizer_config.json
Special tokens file saved in exp/bart/results\checkpoint-500\special_tokens_map.json


AttributeError: 'TrainingArguments' object has no attribute 'generation_max_length'

In [12]:
model.from_pretrained("exp/bart/results/")

loading configuration file exp/bart/results/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LAB

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [13]:
logger = logging.getLogger(__name__)

In [14]:
output_train_file = os.path.join(args.output_dir, "train_results.txt")
if trainer.is_world_process_zero():
    with open(output_train_file, "w") as writer:
        logger.info("***** Train results *****")
        for key, value in sorted(train_result.metrics.items()):
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")

    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
    trainer.state.save_to_json(os.path.join(args.output_dir, "trainer_state.json"))

NameError: name 'train_result' is not defined

In [15]:
test_args = {
    "num_beams": 3,
    "no_repeat_ngram_size": 2,
    "min_summ_length": 20,
    "max_summ_length": 100,
    "length_penalty": 1.0,
}

In [16]:
results = {}

model = model
tokenizer = tokenizer
print("\n")
print("Running Evaluation Script")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

output = []
inco = 0

min_length = test_args["min_summ_length"]
max_length = test_args["max_summ_length"]

df_test = pd.read_csv("cnn_validation.csv")

for index, row in tqdm(df_test[:100].iterrows()):
    text = row["article"]
    ref = row["highlights"]
    
    input_tokenized = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_source_length,
        truncation=True,
    ).to(device)
    
    dim = list(input_tokenized.size())
    summary_ids = model.generate(
        input_tokenized,
        num_beams=test_args["num_beams"],
        no_repeat_ngram_size=test_args["no_repeat_ngram_size"],
        length_penalty=test_args["length_penalty"],
        min_length=min_length,
        max_length=max_length,
        early_stopping=True,
    )

    summ = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]
    if summ.find("nnnn")!=-1:
        summ = re.sub(r'nnn*nn', '', summ)
        inco = inco + 1
    score = evaluate_summary(ref,summ)
    output.append((score.precision,ref,summ))



Running Evaluation Script


100it [00:54,  1.82it/s]


In [17]:
# Evaluation

precision = [round(x[0],4) for x in output]
fmeasure = [round(evaluate_summary(x[1],x[2]).fmeasure,4) for x in output]
actual = [x[1] for x in output]
generated = [re.sub(r'nnn*n', '',x[2]) for x in output]

df = pd.DataFrame({'Generated Summary':generated,'Actual Summary':actual, 'Precision': precision, 'F Score': fmeasure})
df['article'] = df_test['article'][:100]

csv_output = os.path.join("test_results_v3.csv")
df.to_csv(csv_output)

print("Evaluation results saved in {}".format(csv_output))

Evaluation results saved in test_results_v3.csv


In [None]:


output_df = pd.read_csv(csv_output)
length_df = len(output_df)
top = 10
if length_df < 20:
top = int(length_df/2) - 1
if top <=0 :
    top = 1

if length_df!=0:  
final_output = [(x['Precision'],x['Actual Summary'],x['Generated Summary'],x['F Score']) for ind,x in output_df.iterrows()]

output_desc = sorted(final_output, key = lambda x: -x[0])

fsc = np.mean([t[3] for t in output_desc])
pre = np.mean([t[0] for t in output_desc])

output_eval_file = os.path.join(training_args.output_dir, "evaluation_scores.txt")
if trainer.is_world_process_zero():
with open(output_eval_file, "w") as writer:
    writer.write("-----------------------------------------------------------------------------------------------------------------------------------------------------------------")
    writer.write("\n")
    writer.write("Mean F Measure: {:.4f}".format(fsc))
    writer.write("\n")
    writer.write("Mean Precision (Rouge1): {:.4f}".format(pre))

    writer.write("\n")
    writer.write("-----------------------------------------------------------------------------------------------------------------------------------------------------------------")
    writer.write("\n")
    writer.write("Best {}: ".format(top))
    writer.write("\n")

    for tup in output_desc[0:top]:
        writer.write("F Measure: {}".format(tup[3]))
        writer.write("\n")
        writer.write("Precision: {}".format(tup[0]))
        writer.write("\n")
        writer.write("Actual Summary:")
        writer.write("\n")
        writer.write(tup[1])
        writer.write("\n")
        writer.write("Generated Summary:")
        writer.write("\n")
        writer.write(tup[2])
        writer.write("\n")
        writer.write("-----------------------------------------------------------------------------------------------------------------------------------------------------------------")
        writer.write("\n")
    writer.write("\n\n")

    writer.write("Worst {}: ".format(top))
    writer.write("\n")

    n = len(final_output)
    output_asc = sorted(final_output, key = lambda x: x[0])
    for tup in output_asc[:top]:
        writer.write("F Measure: {}".format(tup[3]))
        writer.write("\n")
        writer.write("Precision: {}".format(tup[0]))
        writer.write("\n")
        writer.write("Actual Summary: ")
        writer.write("\n")
        writer.write(tup[1])
        writer.write("\n")
        writer.write("Generated Summary:")
        writer.write("\n")
        writer.write(tup[2])
        writer.write("\n")
        writer.write("-----------------------------------------------------------------------------------------------------------------------------------------------------------------")
        writer.write("\n")

print("Evaluation scores saved in {}".format(output_eval_file))

In [None]:
summ