In [2]:
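# dependencies (uncomment and run once in a fresh environment)
# %pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# %pip install transformers datasets evaluate rouge-score nltk py7zr
# import nltk; nltk.download("punkt")  # sentence-tokenizer data used by nltk.sent_tokenize in compute_metrics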

In [1]:
from transformers import T5Tokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


import pandas as pd

In [2]:
import torch
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [3]:
# Report whether PyTorch can see a CUDA device (the cell output shows the answer).
torch.cuda.is_available()

True

## Notebook param

In [4]:
# Directory passed as `cache_dir` when loading the tokenizer and the model.
CACHE_DIR = './cache_mod'
# Seed used when shuffling the small train/test subsets before training.
# Note: the train/val/test split uses its own hard-coded random_state.
SEED = 0
# NOTE(review): not referenced by any visible cell -- confirm whether it is still needed.
N_SAMPLES = 1000
# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model.
model_name = "google/flan-t5-base"
# dataset_name = "samsum"

## Load data

In [5]:
# Read the raw simplification corpus from disk.
df = pd.read_csv("./Text_Simplification/raw_data.csv")

# Keep only the rows whose `data_source` is one of the two news sites below.
sources = ["BreakingNewsEnglish", "NewsInLevels"]
from_selected_source = df["data_source"].isin(sources)
df1 = df.loc[from_selected_source]

# Number of rows that survived the filter.
len(df1)

12910

In [6]:
# Peek at the first filtered row to see which columns are available.
df1.head(1)

Unnamed: 0,source,target,source_level_og,target_level_og,data_source,data_type,source_level_cefr,target_level_cefr,id
0,British people are big tea drinkers. It is a t...,British people love tea. They drink it for dif...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000001


In [7]:
# Use every filtered row; `df2` is the frame the preprocessing cell reads from.
df2 = df1
# Alternative for quick experiments: train on a random 1000-row subsample instead.
# df2 = df1.sample(1000)

# Row count of the frame that will be split into train/validation/test.
df2.shape[0]

12910

## Tokenizer, Model and Evaluation metric

In [8]:
# Initialize the tokenizer that belongs to the `model_name` checkpoint,
# using CACHE_DIR as the `cache_dir` for `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [9]:
# We will be fine-tuning the checkpoint named in `model_name` (`google/flan-t5-base`),
# loaded as a sequence-to-sequence language model with CACHE_DIR as its `cache_dir`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [10]:
# ROUGE, loaded through the `evaluate` library, is the score computed in `compute_metrics`
# to compare generated simplifications against the reference targets.
metric = evaluate.load("rouge")

## TODO@Prayut: Use Flesch-Kincaid or SMOG Index

## Preprocess data

In [11]:
# Keep only the text pair and the two `*_level_og` columns that feed the prompt.
training_columns = ['source', 'source_level_og', 'target', 'target_level_og']
df3 = df2[training_columns]

# Two-stage split: hold out 10% of all rows for test, then 10% of the remaining
# 90% (about 9% of all rows) for validation; the rest is the training set.
df_train, df_test = train_test_split(df3, test_size=0.1, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42)

# Wrap each pandas frame as a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

# Bundle the three splits under their conventional names.
splits = {
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
}
dataset = DatasetDict(splits)
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'source_level_og', 'target', 'target_level_og', '__index_level_0__'],
        num_rows: 10457
    })
    validation: Dataset({
        features: ['source', 'source_level_og', 'target', 'target_level_og', '__index_level_0__'],
        num_rows: 1162
    })
    test: Dataset({
        features: ['source', 'source_level_og', 'target', 'target_level_og', '__index_level_0__'],
        num_rows: 1291
    })
})

In [12]:
def tokenize_fn(examples):
    """Turn a batch of raw rows into tokenized model inputs with labels.

    Args:
        examples: batch mapping column name -> list of values. Reads the
            "source", "source_level_og", "target_level_og" and "target" columns.

    Returns:
        The tokenizer output for the prompted sources, with an extra
        "labels" entry holding the token ids of the targets. Nothing is padded
        here; padding is left to the data collator.
    """
    # T5-style models expect a task prefix; the prompt also carries the source
    # and target difficulty levels so the model knows how far to simplify.
    prefixed_input = [
        "Simplify from " + str(source_level) + " to " + str(target_level) + " : " + source
        for source, source_level, target_level in zip(
            examples["source"], examples["source_level_og"], examples["target_level_og"]
        )
    ]

    # Tokenize the prompts; truncate over-long texts but do not pad.
    model_inputs = tokenizer(prefixed_input, truncation=True, padding=False)

    # Tokenize the targets through `text_target` so the tokenizer runs in
    # target mode, which is the supported way to produce label ids.
    labels = tokenizer(text_target=examples["target"], truncation=True, padding=False)

    # The trainer looks for the target ids under the name `labels`.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs
    
# Run the tokenization over every split, one batch of rows at a time.
# note 1: with padding=True in the tokenizer, every row would have been padded/truncated
#         to one common length regardless of how rows are later batched
# note 2: `map` adds the new tokenized columns; the raw columns listed below are no longer
#         needed afterwards and are dropped via `remove_columns`
raw_columns = ['source', 'source_level_og', 'target', 'target_level_og', '__index_level_0__']
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_dataset

Map:   0%|          | 0/10457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1162 [00:00<?, ? examples/s]

Map:   0%|          | 0/1291 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10457
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1162
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1291
    })
})

In [13]:
# Walk one training example through each stage: raw text, token ids, and the ids
# decoded back to text. No padding has been applied, so the two sampled rows
# may have different lengths.
sample = tokenized_dataset["train"][25:27]
first_input_ids = sample["input_ids"][0]
first_label_ids = sample["labels"][0]

sections = [
    ("~~~~original inputs~~~~~", dataset["train"]["source"][25]),
    ("~~~~encoded inputs~~~~~", first_input_ids),
    ("~~~~decoded inputs~~~~~", tokenizer.decode(first_input_ids)),
    ("~~~~encoded targets~~~~~", first_label_ids),
    ("~~~~decoded target~~~~~", tokenizer.decode(first_label_ids)),
    ("~~~~sample length in batch~~~~~", [len(ids) for ids in sample["input_ids"]]),
]

# Print each heading followed by its content, in the order listed above.
for heading, content in sections:
    print(heading)
    print(content)

~~~~original inputs~~~~~
Italian authorities released footage of daring night time raids to dismantle the Sicilian Mafia.
The police used cover surveillance and wiretaps to monitor multiple suspects who they suspect of Mafia association, extortion and weapons violations. In total, they arrested 46 people, including a presumed new regional boss.

~~~~encoded inputs~~~~~
[180, 10296, 4921, 45, 1877, 632, 12, 3, 12734, 3, 10, 4338, 5779, 1883, 13420, 13, 649, 53, 706, 97, 15941, 7, 12, 1028, 348, 17, 109, 8, 29250, 23, 152, 1534, 89, 23, 9, 5, 37, 2095, 261, 1189, 12305, 11, 4107, 8873, 7, 12, 3393, 1317, 6220, 7, 113, 79, 6220, 13, 1534, 89, 23, 9, 6028, 6, 3, 10398, 127, 1575, 11, 7749, 17880, 5, 86, 792, 6, 79, 10195, 9668, 151, 6, 379, 3, 9, 26451, 26, 126, 3518, 7930, 5, 3, 1]
~~~~decoded inputs~~~~~
Simplify from 3.0 to 1.0 : Italian authorities released footage of daring night time raids to dismantle the Sicilian Mafia. The police used cover surveillance and wiretaps to monitor mul

## Evaluation metric

In [16]:
## TODO@Prayut: Modify this later to use appropriate metric for simplification tasks

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        dict mapping each ROUGE key, plus ``"gen_len"``, to a value rounded to
        four decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and cannot be decoded. It is always present in
    # the labels and can also appear in predictions when batches of different
    # lengths are concatenated, so replace it with the pad token in both.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Token ids -> text, dropping pad/eos and other special tokens.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # returns a dictionary of metric: score pairs
    result = dict(
        metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    )

    # Add mean generated length (non-pad tokens), shown in the training loop output
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Prep for training

In [17]:
# Pad inputs and labels dynamically per batch, as opposed to padding everything
# to the longest example in the whole dataset.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    # Pass the model object, not its name string: the collator uses the model's
    # `prepare_decoder_input_ids_from_labels` method, when available, to build
    # `decoder_input_ids` from the labels.
    model=model,
    padding=True,
    label_pad_token_id=-100 # pytorch ignores during loss when label ids are -100
)

In [18]:
# Training configuration for the fine-tuning run, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- outputs: where checkpoints go and how many are kept ---
    output_dir="flan-t5-base-tune/",
    save_total_limit=2,
    push_to_hub=False,
    # --- optimisation ---
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    fp16=False, # setting to true here produces NaNs in evaluation for some reason
    # --- batch sizes (kept small; important for avoiding OOM) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluate, checkpoint and log once per epoch, then reload the best checkpoint ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    # --- evaluation uses generated sequences, which compute_metrics decodes ---
    predict_with_generate=True,
)

In [19]:
# Draw small shuffled subsets of the train and test splits to speed up training.
# Optional, but recommended as a smoke test before scaling up to the full dataset.
SUBSET_SIZE = 500
subset_rows = range(SUBSET_SIZE)
small_train = tokenized_dataset["train"].shuffle(seed=SEED).select(subset_rows)
small_test = tokenized_dataset["test"].shuffle(seed=SEED).select(subset_rows)

In [20]:
# Assemble the trainer from the model, its training configuration, the data and the metric.
# NOTE(review): the evaluation set is drawn from the *test* split while training_args sets
# load_best_model_at_end=True, so checkpoint selection looks at test data -- consider using
# the "validation" split for eval_dataset and keeping the test split for the final score.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train, # replace with tokenized_dataset["train"] if want to use full dataset
    eval_dataset=small_test, # replace with tokenized_dataset["test"] if want to use full dataset
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Fine-tuning

In [None]:
# Run fine-tuning and keep the returned training output for reporting.
result = trainer.train()

# Report wall-clock runtime and throughput from the training metrics.
train_metrics = result.metrics
runtime = train_metrics['train_runtime']
throughput = train_metrics['train_samples_per_second']
print(f"Time: {runtime:.2f}")
print(f"Samples/second: {throughput:.2f}")

***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 315
  Number of trainable parameters = 247577856
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Evaluate on the trainer's eval_dataset, i.e. `small_test` (500 shuffled examples
# from the test split). training_args sets load_best_model_at_end=True, so the model
# scored here is presumably the best checkpoint from training -- verify in the logs.
trainer.evaluate()

In [None]:
# Save the trainer's model to a local directory (path is relative to the working directory).
trainer.save_model("flan-t5-based-tuned-to-max")