In [1]:
! pip install transformers
! pip install datasets
!pip install accelerate -U
! pip install sentencepiece
! pip install rouge_score
! pip install transformers[torch]
! pip install tabulate

Collecting accelerate
  Using cached accelerate-0.30.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-0.30.0-py3-none-any.whl (302 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.30.0
Collecting rouge_score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [2]:
import torch
import numpy as np
import datasets

from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    logging
)

import nltk
from tabulate import tabulate

2024-05-06 04:51:13.641524: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-06 04:51:13.672733: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 04:51:13.672766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 04:51:13.673690: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-06 04:51:13.679652: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# Set seed for reproducibility
torch.manual_seed(1234)

<torch._C.Generator at 0x7fed50392b70>

In [4]:
model_name = "sshleifer/distilbart-xsum-12-3"

model = BartForConditionalGeneration.from_pretrained(model_name, device_map="auto")
tokenizer = BartTokenizer.from_pretrained(model_name, device_map="auto")

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

df = pd.read_csv("data.csv", dtype=pd.StringDtype(), usecols=range(3))
df = df.drop(columns=['category'])

In [6]:
def clean_data(df):
    # Remove rows with missing headlines/body
    df = df.dropna(axis=0).copy()

    # Convert to lowercase
    df['body'] = df['body'].apply(lambda row: row.lower())
    df['title'] = df['title'].apply(lambda row: row.lower())
    
    # Remove headline from body
    df['body'] = df.apply(
        lambda row: row['body'].replace(row['title'], ''), axis=1)

    return df

df = clean_data(df)
dataset = Dataset.from_pandas(df)

In [7]:
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.1).values()

In [8]:
ENCODER_MAX_LENGTH = 1024
DECODER_MAX_LENGTH = 64

def batch_tokenize_preprocess(batch, tokenizer):
    source = [str(item) for item in batch["body"]]
    target = [str(item) for item in batch["title"]]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=ENCODER_MAX_LENGTH)
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=DECODER_MAX_LENGTH)

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch


train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

Map:   0%|          | 0/3332 [00:00<?, ? examples/s]

Map:   0%|          | 0/371 [00:00<?, ? examples/s]

In [9]:
nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge", trust_remote_code=True)


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

  metric = datasets.load_metric("rouge", trust_remote_code=True)


In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=10,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    learning_rate=5e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    save_total_limit=3,
    logging_steps=1000,
    eval_steps=3000,
    save_steps=3000,
    evaluation_strategy='steps',
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True
)

early_stop = EarlyStoppingCallback(3, 0)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [16]:
trainer.evaluate()

{'eval_loss': 5.929446220397949, 'eval_rouge1': 18.3452, 'eval_rouge2': 4.8448, 'eval_rougeL': 15.0449, 'eval_rougeLsum': 15.0576, 'eval_gen_len': 27.4798, 'eval_runtime': 111.6116, 'eval_samples_per_second': 3.324, 'eval_steps_per_second': 0.833}


{'eval_loss': 5.929446220397949,
 'eval_rouge1': 18.3452,
 'eval_rouge2': 4.8448,
 'eval_rougeL': 15.0449,
 'eval_rougeLsum': 15.0576,
 'eval_gen_len': 27.4798,
 'eval_runtime': 111.6116,
 'eval_samples_per_second': 3.324,
 'eval_steps_per_second': 0.833}

In [17]:
logging.set_verbosity_error()
trainer.train()

{'loss': 4.6765, 'learning_rate': 4.923826934795857e-05, 'epoch': 0.3}
{'eval_loss': 4.26727294921875, 'eval_rouge1': 30.9178, 'eval_rouge2': 13.052, 'eval_rougeL': 27.6609, 'eval_rougeLsum': 27.713, 'eval_gen_len': 17.2803, 'eval_runtime': 96.6069, 'eval_samples_per_second': 3.84, 'eval_steps_per_second': 0.963, 'epoch': 0.3}
{'loss': 4.3762, 'learning_rate': 4.7714808043875684e-05, 'epoch': 0.6}
{'eval_loss': 4.189115524291992, 'eval_rouge1': 30.6536, 'eval_rouge2': 13.4038, 'eval_rougeL': 27.0988, 'eval_rougeLsum': 27.043, 'eval_gen_len': 15.8113, 'eval_runtime': 94.2499, 'eval_samples_per_second': 3.936, 'eval_steps_per_second': 0.987, 'epoch': 0.6}
{'loss': 4.3479, 'learning_rate': 4.6191346739792815e-05, 'epoch': 0.9}
{'eval_loss': 4.1114373207092285, 'eval_rouge1': 30.0623, 'eval_rouge2': 13.4908, 'eval_rougeL': 27.2623, 'eval_rougeLsum': 27.2322, 'eval_gen_len': 16.1509, 'eval_runtime': 94.3697, 'eval_samples_per_second': 3.931, 'eval_steps_per_second': 0.985, 'epoch': 0.9}
{'l

TrainOutput(global_step=9000, training_loss=3.6133626302083335, metrics={'train_runtime': 3928.0269, 'train_samples_per_second': 8.483, 'train_steps_per_second': 8.483, 'train_loss': 3.6133626302083335, 'epoch': 2.7})

In [18]:
trainer.evaluate()

{'eval_loss': 4.084568023681641, 'eval_rouge1': 30.9248, 'eval_rouge2': 13.5565, 'eval_rougeL': 27.8201, 'eval_rougeLsum': 27.9006, 'eval_gen_len': 15.9326, 'eval_runtime': 93.6303, 'eval_samples_per_second': 3.962, 'eval_steps_per_second': 0.993, 'epoch': 2.7}


{'eval_loss': 4.084568023681641,
 'eval_rouge1': 30.9248,
 'eval_rouge2': 13.5565,
 'eval_rougeL': 27.8201,
 'eval_rougeLsum': 27.9006,
 'eval_gen_len': 15.9326,
 'eval_runtime': 93.6303,
 'eval_samples_per_second': 3.962,
 'eval_steps_per_second': 0.993,
 'epoch': 2.7}

In [19]:
def generate_headline(test_samples, model):
    inputs = tokenizer(
        test_samples["body"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [20]:
test_samples = validation_data_txt.select(range(16))
generated_headlines = generate_headline(test_samples, model)[1]
print(
    tabulate(
        zip(
            range(len(generated_headlines)),
            generated_headlines,
            list(test_samples["title"]),
        ),
        headers=["Id", "Generated headlines", "Ground Truth"],
    )
)

  Id  Generated headlines                                                     Ground Truth
----  ----------------------------------------------------------------------  -----------------------------------------------------------------------------------------------
   0  9 new netflix movies that are coming to netflix                         here’s what is coming to netflix in may 2018
   1  mgm denies police timeline of las vegas shooting                        mgm 'confident' that police are wrong about las vegas shooting timeline
   2  a letter to obama calling for more clean energy                         a compromise we can't afford
   3  dan rather is writing a new essay about patriotism                      dan rather to write a book on 'what unites us'
   4  why elizabeth murray was diagnosed with stage 4 cancer                  everybody knows . . . elizabeth murray premieres at tribeca: a talk with director kristi zea
   5  why we can't do so much for it                       

In [21]:
trainer.save_model("./results/best_bart")