In [1]:
!pip install datasets rouge
!pip install transformers[torch]
# !pip install sentencepiece # required for pegasus model

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rouge, pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6 rouge-1.0

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import string
from rouge import Rouge
rouge = Rouge()
from torch.utils.data import Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import torch
from collections import defaultdict

## Here choose model, tokenizer (defaults to model's tokenizer), whether to use data preprocessing or not

In [4]:
DATA_PATH = '../tweet_sum_data_files/dialogs_data_with_summaries.xlsx'
PREPROCESS_DIALOGS = True
WRITE_TO_FILE = True # Write model outputs for test split to file

# model_name = "t5-small"
# model_name = "facebook/bart-large"
model_name = "facebook/bart-base"
# model_name = "google/pegasus-cnn_dailymail"
# model_name = "facebook/bart-large-cnn"
# model_name = "google/pegasus-large"
# model_name = "google/pegasus-xsum"
# model_name = "SoooSlooow/TweetBART2"

tokenizer_name = model_name # Default
# tokenizer_name = "facebook/bart-base"

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name.replace('/', '-')}_dir2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False,  # Use on platforms with acceleration
    push_to_hub=False,
    load_best_model_at_end=True
)

## preprocess_dialog() may be applied to whole train + eval dataset before training

## preprocess_summary() is applied to model's output and target reference,  only before evaluating perfomance

In [6]:
def preprocess_summary(summary):    # Comment code lines to use different preprocess strategies
    # summary = " ".join(summary.split()) # Replace all whitespaces (i.e. '\t', '\n') with ' '
    # summary = summary.replace('Customer:', '').replace('Agent:', '')
    summary = re.sub(r"@(\w){1,15}", '', summary) # Remove twitter nicknames
    summary = summary.lower()
    summary = summary.translate(str.maketrans('', '', string.punctuation)) # Remove ALL punctuation i.e. '?,.!'
    return summary

def preprocess_dialog(dialog):
  url_pattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
  url_pattern_nohttp = r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
  twtr_nicks = r"@(\w){1,15}"
  if re.search(url_pattern, dialog):
    # print('1')
    dialog = re.sub(url_pattern, '', dialog)
  if re.search(url_pattern_nohttp, dialog):
    # print('3')
    dialog = re.sub(url_pattern_nohttp, '', dialog)
  # if re.search(twtr_nicks, dialog):     # Comment/uncomment to use/remove twitter nicknames
    # print('2')
    # dialog = re.sub(twtr_nicks, '', dialog)
  return dialog


In [7]:
# prefix = "summarize: " # For t5 model
prefix = ""

def preprocess_function(examples):
    #inputs = [prefix + doc for doc in examples["text"]]
    inputs = prefix + examples["text"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

class SummarizationTorchDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        return preprocess_function(self.df.loc[idx, :])

### Load dataset and transform it to required format

In [8]:
df = pd.read_excel(DATA_PATH)
df['abstractive_summaries'] = df['abstractive_summaries'].map(literal_eval)
train_df = df[df['data_split'] == 'train']
eval_df = df[df['data_split'] == 'valid']
test_df = df[df['data_split'] == 'test']
test_dialog_texts = test_df['text'].values.tolist()
test_abstractive_summaries = test_df['abstractive_summaries'].values.tolist()

# train_df = train_df[0:10]
# eval_df = eval_df[0:10]

if PREPROCESS_DIALOGS:
    eval_df['text'] = eval_df['text'].map(lambda txt: preprocess_dialog(txt))
    train_df['text'] = train_df['text'].map(lambda txt: preprocess_dialog(txt))
    test_df['text'] = test_df['text'].map(lambda txt: preprocess_dialog(txt))

train_df_exploded = train_df[['text', 'abstractive_summaries']].explode('abstractive_summaries').\
    sample(frac=1, random_state=42).reset_index(drop=True).\
    rename({'abstractive_summaries': 'summary'}, axis=1)
eval_df_exploded = eval_df[['text', 'abstractive_summaries']].explode('abstractive_summaries').\
    sample(frac=1, random_state=42).reset_index(drop=True).\
    rename({'abstractive_summaries': 'summary'}, axis=1)

train_dataset = SummarizationTorchDataset(train_df_exploded)
eval_dataset = SummarizationTorchDataset(eval_df_exploded)

## Load pretrained models

In [9]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# import torch
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device);

# from transformers import pipeline

# summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)
# t5_pretrained_summaries = summarizer(test_dialog_texts)
# t5_pretrained_summaries = [summ['summary_text'] for summ in t5_pretrained_summaries ]
# get_perfomance(t5_pretrained_summaries, test_abstractive_summaries)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# train from checkpoint:
# checkpoint = "./pegasus-large_dir2/checkpoint-7887"
# trainer.train(checkpoint)

trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.811878
2,2.065900,1.765912
3,2.065900,1.715673


In [22]:
def get_perfomance(cand_summaries, ref_summaries):
    res = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
    for cand_summ, ref_summs_set in zip(cand_summaries, ref_summaries):
        cand_summ = preprocess_summary(cand_summ)
        perfomance = defaultdict(list)
        for ref_summ in ref_summs_set:
            ref_summ = preprocess_summary(ref_summ)
            scores = rouge.get_scores(cand_summ, ref_summ)[0]
            for rouge_type, values in scores.items():
                perfomance[rouge_type].append(values['f'])

        for rouge_type in res.keys():
            res[rouge_type].append(
                max(perfomance[rouge_type])
            )
    return {k: np.mean(res[k]) for k in res}

## Evaluate perfomance with ROUGE metric

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device);
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)
t5_finetuned_summaries = summarizer(test_dialog_texts)
t5_finetuned_summaries = [summ['summary_text'] for summ in t5_finetuned_summaries]

get_perfomance(t5_finetuned_summaries, test_abstractive_summaries)

### Save generated summaries to file

In [25]:
if WRITE_TO_FILE:
    with open(f"{model_name.replace('/','-')}_dir/test-split_output.txt", 'w') as f:
      for dialog, summ in zip(test_dialog_texts, t5_finetuned_summaries):
        print(dialog, file=f)
        print(summ, file=f)

FileNotFoundError: ignored