# Training an Article Title Generation Model with T5

In [1]:
# setup env
# !pip install datasets transformers rouge_score accelerate evaluate -q

## Download Dataset from kaggle

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"fahmiazizfadhil","key":"38ae1365c0bbf8b1c5deef5864c0487d"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d fabiochiusano/medium-articles
!unzip -q medium-articles.zip -d dataset

Downloading medium-articles.zip to /content
100% 368M/369M [00:18<00:00, 22.5MB/s]
100% 369M/369M [00:18<00:00, 20.5MB/s]


## Import Library

In [5]:
import evaluate
import nltk
import numpy as np
import string
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Set variable & parameter
PATH = "/content/dataset/medium_articles.csv"
MODEL_CHECKPOINT = "t5-base"
MODEL_REPO = "t5-base-title-generator"
MODEL_DIR = "logs-t5"
PREFIX = "summarize: "
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 64
BATCH_SIZE = 8

## Load Dataset

In [7]:
datasets = load_dataset("csv", data_files=PATH)
datasets

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 192368
    })
})

In [8]:
# train/test split
datasets_train_test = datasets["train"].shuffle(42).train_test_split(test_size=5000)
datasets_train_validation = datasets_train_test["train"].shuffle(42).train_test_split(test_size=3000)

datasets["train"] = datasets_train_validation["train"]
datasets["validation"] = datasets_train_validation["test"]
datasets["test"] = datasets_train_test["test"]

datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 184368
    })
    validation: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 5000
    })
})

In [9]:
n_samples_train = len(datasets["train"])
n_samples_validation = len(datasets["validation"])
n_samples_test = len(datasets["test"])
n_samples_total = n_samples_train + n_samples_validation + n_samples_test

print(f"- Training set: {n_samples_train*100/n_samples_total:.2f}%")
print(f"- Validation set: {n_samples_validation*100/n_samples_total:.2f}%")
print(f"- Test set: {n_samples_test*100/n_samples_total:.2f}%")

- Training set: 95.84%
- Validation set: 1.56%
- Test set: 2.60%


In [10]:
# keep only a subsample of the datasets
datasets["train"] = datasets["train"].select(range(8000))
datasets["validation"] = datasets["validation"].select(range(3000))
datasets["test"] = datasets["test"].select(range(3000))

datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 3000
    })
})

## Data Processing

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


filter out some bad samples (i.e. articles whose title is long less than 20 characters and whose text content is long less than 500 characters).



In [12]:
medium_datasets_cleaned = datasets.filter(
    lambda example: (len(example['text']) >= 500) and
    (len(example['title']) >= 20)
)

Filter:   0%|          | 0/8000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [13]:
def clean_text(text):
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned
                                     if len(sent) > 0 and
                                     sent[-1] in string.punctuation]
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned

def preprocess_data(examples):
    texts_cleaned = [clean_text(text) for text in examples["text"]]
    inputs = [PREFIX + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

      # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["title"], max_length=MAX_TARGET_LENGTH,
                           truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Note that we are truncating the inputs at 512 tokens. While T5 can manage longer inputs, the memory requirements grow quadratically with the size of the inputs, and that was the maximum size that I could use in my Colab session. 512 tokens correspond to about 682 English words, which is more or less what an average person reads in two minutes. The majority of Medium articles have between four and seven minutes of reading time, therefore we are currently throwing away useful information for our task. Despite this, many articles state what they are about in their first paragraphs, therefore good titles can be generated on most occasions.

In [14]:
tokenized_datasets = medium_datasets_cleaned.map(preprocess_data, batched=True)
tokenized_datasets

Map:   0%|          | 0/6843 [00:00<?, ? examples/s]



Map:   0%|          | 0/2578 [00:00<?, ? examples/s]

Map:   0%|          | 0/2541 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6843
    })
    validation: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2578
    })
    test: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2541
    })
})

## Hyperparameter

In [15]:
# Model
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [29]:
# args
args = Seq2SeqTrainingArguments(
    MODEL_REPO,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
    push_to_hub=True
)

In [17]:
# data collator
data_collator = DataCollatorForSeq2Seq(tokenizer)

## Compute Metrics

In [18]:
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [22]:
# Login Hugging Face
from huggingface_hub import notebook_login
notebook_login()
# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training

In [30]:
trainer = Seq2SeqTrainer(
    model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Start TensorBoard before training to monitor it in progress
%load_ext tensorboard
%tensorboard --logdir '{MODEL_DIR}'/runs

In [31]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,2.0744,2.594937,0.3025,0.15,0.2779,0.2786,12.6474
200,2.1587,2.587766,0.3063,0.1511,0.2795,0.2804,12.3068
300,2.1679,2.584278,0.3093,0.1543,0.284,0.285,12.3002
400,2.5576,2.518639,0.3032,0.1508,0.2794,0.2805,12.3258
500,2.6485,2.498071,0.3109,0.1548,0.2863,0.2875,12.1055
600,2.8094,2.489887,0.3091,0.1543,0.2846,0.2854,11.9767
700,2.571,2.488784,0.308,0.1536,0.2836,0.2845,12.0628
800,2.6264,2.486771,0.3081,0.1537,0.2841,0.2852,11.9484




TrainOutput(global_step=856, training_loss=2.4602853115473953, metrics={'train_runtime': 3213.9584, 'train_samples_per_second': 2.129, 'train_steps_per_second': 0.266, 'total_flos': 4167098845102080.0, 'train_loss': 2.4602853115473953, 'epoch': 1.0})

In [32]:
trainer.push_to_hub()

'https://huggingface.co/fahmiaziz/t5-base-title-generator/tree/main/'

## Evaluate model on Test set

In [33]:
import torch

In [35]:
# get test split
test_tokenized_dataset = tokenized_datasets["test"]

# pad texts to the same length
def preprocess_test(examples):
    inputs = [PREFIX + text for text in examples["text"]]
    model_inputs = tokenizer(
        inputs, max_length=MAX_INPUT_LENGTH, truncation=True,
        padding="max_length"
    )
    return model_inputs

test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

Map:   0%|          | 0/2541 [00:00<?, ? examples/s]

In [36]:
# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

In [40]:
# generate text for each batch
all_predictions = []
for i, batch in enumerate(dataloader):
    # Memindahkan data ke GPU
    batch = {key: value.to('cuda:0') for key, value in batch.items()}
    predictions = model.generate(**batch)
    all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(
    test_tokenized_dataset["title"], max_length=MAX_TARGET_LENGTH,
    truncation=True, padding="max_length"
)["input_ids"]

# Mengkopi tensor dari GPU ke CPU
all_predictions_flattened = [pred.to('cpu') for pred in all_predictions_flattened]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)


{'rouge1': 0.3933,
 'rouge2': 0.2521,
 'rougeL': 0.3715,
 'rougeLsum': 0.3723,
 'gen_len': 11.9449}

## Try model

In [41]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_REPO)

In [44]:
text = """
Many financial institutions started building conversational AI, prior to the Covid19
pandemic, as part of a digital transformation initiative. These initial solutions
were high profile, highly personalized virtual assistants — like the Erica chatbot
from Bank of America. As the pandemic hit, the need changed as contact centers were
under increased pressures. As Cathal McGloin of ServisBOT explains in “how it started,
and how it is going,” financial institutions were looking for ways to automate
solutions to help get back to “normal” levels of customer service. This resulted
in a change from the “future of conversational AI” to a real tactical assistant
that can help in customer service. Haritha Dev of Wells Fargo, saw a similar trend.
Banks were originally looking to conversational AI as part of digital transformation
to keep up with the times. However, with the pandemic, it has been more about
customer retention and customer satisfaction. In addition, new use cases came about
as a result of Covid-19 that accelerated adoption of conversational AI. As Vinita
Kumar of Deloitte points out, banks were dealing with an influx of calls about new
concerns, like questions around the Paycheck Protection Program (PPP) loans. This
resulted in an increase in volume, without enough agents to assist customers, and
tipped the scale to incorporate conversational AI. When choosing initial use cases
to support, financial institutions often start with high volume, low complexity
tasks. For example, password resets, checking account balances, or checking the
status of a transaction, as Vinita points out. From there, the use cases can evolve
as the banks get more mature in developing conversational AI, and as the customers
become more engaged with the solutions. Cathal indicates another good way for banks
to start is looking at use cases that are a pain point, and also do not require a
lot of IT support. Some financial institutions may have a multi-year technology
roadmap, which can make it harder to get a new service started. A simple chatbot
for document collection in an onboarding process can result in high engagement,
and a high return on investment. For example, Cathal has a banking customer that
implemented a chatbot to capture a driver’s license to be used in the verification
process of adding an additional user to an account — it has over 85% engagement
with high satisfaction. An interesting use case Haritha discovered involved
educating customers on financial matters. People feel more comfortable asking a
chatbot what might be considered a “dumb” question, as the chatbot is less judgmental.
Users can be more ambiguous with their questions as well, not knowing the right
words to use, as chatbot can help narrow things down.
"""

inputs = [PREFIX + text]

inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

In [48]:
predicted_title

'Conversational AI — How it’s Going — Cathal McGloin of ServisBOT'

# Build APP

In [1]:
!pip install -q streamlit transformers
!npm install localtunnel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[K[?25h[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mn

In [3]:
import urllib
print("Password/Enpoint IP for localtunnel is:",urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip("\n"))

Password/Enpoint IP for localtunnel is: 35.187.244.176


In [4]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import math
import torch

MODEL_REPO = "t5-base-title-generator"
max_input_length = 512

st.header("Generate candidate titles for articles")

st_model_load = st.text('Loading title generator model...')

@st.cache(allow_output_mutation=True)
def load_model():
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_REPO)
    nltk.download('punkt')
    print("Model loaded!")
    return tokenizer, model

tokenizer, model = load_model()
st.success('Model loaded!')
st_model_load.text("")

with st.sidebar:
    st.header("Model parameters")
    if 'num_titles' not in st.session_state:
        st.session_state.num_titles = 5
    def on_change_num_titles():
        st.session_state.num_titles = num_titles
    num_titles = st.slider("Number of titles to generate", min_value=1, max_value=10, value=1, step=1, on_change=on_change_num_titles)
    if 'temperature' not in st.session_state:
        st.session_state.temperature = 0.7
    def on_change_temperatures():
        st.session_state.temperature = temperature
    temperature = st.slider("Temperature", min_value=0.1, max_value=1.5, value=0.6, step=0.05, on_change=on_change_temperatures)
    st.markdown("_High temperature means that results are more random_")

if 'text' not in st.session_state:
    st.session_state.text = ""
st_text_area = st.text_area('Text to generate the title for', value=st.session_state.text, height=500)

def generate_title():
    st.session_state.text = st_text_area

    # tokenize text
    inputs = ["summarize: " + st_text_area]
    inputs = tokenizer(inputs, return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    print(f"Input has {num_tokens} tokens")
    max_input_length = 500
    num_spans = math.ceil(num_tokens / max_input_length)
    print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * max_input_length - num_tokens) / max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + max_input_length * i, start + max_input_length * (i + 1)])
        start -= overlap
    print(f"Span boundaries are {spans_boundaries}")
    spans_boundaries_selected = []
    j = 0
    for _ in range(num_titles):
        spans_boundaries_selected.append(spans_boundaries[j])
        j += 1
        if j == len(spans_boundaries):
            j = 0
    print(f"Selected span boundaries are {spans_boundaries_selected}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries_selected]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries_selected]

    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # compute predictions
    outputs = model.generate(**inputs, do_sample=True, temperature=temperature)
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_titles = [nltk.sent_tokenize(decoded_output.strip())[0] for decoded_output in decoded_outputs]

    st.session_state.titles = predicted_titles

# generate title button
st_generate_button = st.button('Generate title', on_click=generate_title)

# title generation labels
if 'titles' not in st.session_state:
    st.session_state.titles = []

if len(st.session_state.titles) > 0:
    with st.container():
        st.subheader("Generated titles")
        for title in st.session_state.titles:
            st.markdown("__" + title + "__")

Writing app.py


In [6]:
!streamlit run /content/app.py &>/content/logs.txt &

In [7]:
!npx localtunnel --port 8501 & curl https://ipv4.icanhazip.com

35.187.244.176
[K[?25hnpx: installed 22 in 4.034s
your url is: https://kind-hotels-jam.loca.lt
