In [12]:
from datasets import load_dataset
from langdetect import detect
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')
from nltk.corpus import cmudict
# Download the CMU Pronouncing Dictionary for syllable counting
nltk.download('cmudict')
d = cmudict.dict()

# Load a dataset for text simplification
dataset = load_dataset("bogdancazan/wikilarge-text-simplification") 

[nltk_data] Downloading package punkt to /Users/oliviagao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/oliviagao/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [13]:
def syllable_count(word):
    """Return the syllable count for a word."""
    word = word.lower()
    if word in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word]])  # Get the max syllables
    else:
        return None

In [14]:
def flesch_kincaid(text):
    """
    Calculate Flesch Reading Ease and Flesch-Kincaid Grade Level for a given text.
    """
    if text.strip() == '' or detect(text) != 'en':
        return -1
    
    # Split text into sentences
    sentences = re.split(r'[.!?]', text)
    sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty strings
    num_sentences = len(sentences)

    # Split text into words
    words = re.findall(r'\w+', text)
    num_words = len(words)

    # Count syllables in words
    num_syllables = sum(syllable_count(word) for word in words)

    # Calculate ASL and ASW
    asl = num_words / num_sentences if num_sentences > 0 else 0
    asw = num_syllables / num_words if num_words > 0 else 0

    # Calculate Flesch Reading Ease
    reading_ease = 206.835 - (1.015 * asl) - (84.6 * asw)

    return reading_ease

In [15]:
# Print out some information about the dataset
print(dataset.shape)
print(dataset["train"][0]['Normal'])
print(dataset["validation"][0])
print(dataset["test"][0])

{'train': (148843, 2), 'validation': (494, 2), 'test': (191, 2)}
there is manuscript evidence that austen continued to work on these pieces as late as the period and that her niece and nephew anna and james edward austen made further additions as late as.
{'Normal': 'upper sorbian is a minority language spoken by sorbs in germany in the historical province of upper lusatia lrb hornja u ica in sorbian rrb which is today part of saxony.', 'Simple': 'there are around speakers of upper sorbian living in saxony. upper sorbian is a minority language in germany according to the european charter for regional or minority languages.'}
{'Normal': 'his next work saturday follows an especially eventful day in the life of a successful neurosurgeon.', 'Simple': 'his next work at saturday will be a successful neurosurgeon.'}


In [16]:
from transformers import T5Tokenizer

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

MAX_LENGTH = 128

def tokenize_function(examples):
    # Add the task prefix for T5 input format 
    inputs = ["simplify: " + text for text in examples['Normal']]
    targets = examples['Simple']
    
    # Tokenize the inputs and outputs (text pairs)
    model_inputs = tokenizer(inputs, padding="max_length", max_length=MAX_LENGTH, truncation=True)
    labels = tokenizer(targets, padding="max_length", max_length=MAX_LENGTH, truncation=True)

    # Add labels as tokenized targets (this will be used for decoder during training)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply tokenization
tokenized_train = filtered_train.map(tokenize_function, batched=True)
tokenized_validation = filtered_validation.map(tokenize_function, batched=True)
tokenized_test = filtered_test.map(tokenize_function, batched=True)

print(tokenized_train[0])
print(tokenized_validation[0])
print(tokenized_test[0])

train_subset = tokenized_train.shuffle(seed=42).select(range(2000))
validation_subset = tokenized_validation.shuffle(seed=42).select(range(400))
testing_subset = tokenized_test.shuffle(seed=42).select(range(191))

{'Normal': 'there is manuscript evidence that austen continued to work on these pieces as late as the period and that her niece and nephew anna and james edward austen made further additions as late as.', 'Simple': 'there is some proof that austen continued to work on these pieces later in life. her nephew and niece james edward and anna austen may have made further additions to her work in around.', 'input_ids': [18356, 10, 132, 19, 14496, 2084, 24, 403, 324, 2925, 12, 161, 30, 175, 2161, 38, 1480, 38, 8, 1059, 11, 24, 160, 23642, 11, 23213, 3, 10878, 11, 7620, 15, 7, 3, 15, 26, 2239, 403, 324, 263, 856, 811, 7, 38, 1480, 38, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
model = T5ForConditionalGeneration.from_pretrained(model_name) 

learning_rates = [1e-5, 3e-5]
batch_sizes = [8, 16]
num_epochs = [2, 3]

logs = []

# Loop through all hyperparameter combinations
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in num_epochs:
            print(f"Training with lr={lr}, batch_size={batch_size}, epochs={epochs}")
            
            # Set up training arguments
            training_args = TrainingArguments(
                output_dir="./results",
                eval_strategy="epoch",
                learning_rate=lr,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=8,
                num_train_epochs=epochs,
                weight_decay=0.01,
                push_to_hub=False,
                logging_dir='./logs',
                logging_steps=10,
                save_steps=10,
                save_total_limit=2,
                fp16=True,
            )

            # Set up Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_subset,
                eval_dataset=validation_subset,
            )

            # Start training
            trainer.train()
            
            logs.append(trainer.state.log_history)

Training with lr=1e-05, batch_size=8, epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,1.7454,0.7255
2,1.0969,0.638859


Training with lr=1e-05, batch_size=8, epochs=3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.654,0.596828
2,0.6047,0.562396
3,0.5392,0.549608


Training with lr=1e-05, batch_size=16, epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.5676,0.526494
2,0.5791,0.521082


Training with lr=1e-05, batch_size=16, epochs=3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.5322,0.499682
2,0.5398,0.487785
3,0.5396,0.48492


Training with lr=3e-05, batch_size=8, epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.4287,0.419346
2,0.4183,0.409564


Training with lr=3e-05, batch_size=8, epochs=3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.3859,0.398137
2,0.3925,0.392723
3,0.3406,0.391469


Training with lr=3e-05, batch_size=16, epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.3878,0.390135
2,0.4087,0.39003


Training with lr=3e-05, batch_size=16, epochs=3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.3789,0.389451
2,0.4055,0.389032
3,0.4024,0.388885


In [17]:
# Picked best hyperparameters: lr=3e-05, batch_size=8, epochs=3
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
    fp16=True,
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=validation_subset,
)

# Start training
trainer.train()

model.save_pretrained('../t5_model')
tokenizer.save_pretrained('../t5_tokenizer')

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.4006,0.405604
2,0.3998,0.397824
3,0.3502,0.396206


('../t5_tokenizer/tokenizer_config.json',
 '../t5_tokenizer/special_tokens_map.json',
 '../t5_tokenizer/spiece.model',
 '../t5_tokenizer/added_tokens.json')

In [18]:
# Initialize rouge scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Set device to MPS (if available)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

def simplify_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentences = [
    "The experiment yielded inconclusive results due to extraneous variables.",
    "The ecosystem's biodiversity is crucial for maintaining ecological balance.",
    "Although she was considered smart, she failed all her exams.",
]

# Test sentences
for sentence in test_sentences:
    simplified = simplify_text(sentence)
    print(f"Original: {sentence}")
    print(f"Simplified: {simplified}")
    print("-" * 50)


Original: The experiment yielded inconclusive results due to extraneous variables.
Simplified: The experiment yielded inconclusive results due to extraneous variables.
--------------------------------------------------
Original: The ecosystem's biodiversity is crucial for maintaining ecological balance.
Simplified: La biodiversité de l'écosystème est cruciale pour l'équilibre écologique.
--------------------------------------------------
Original: Although she was considered smart, she failed all her exams.
Simplified: She was considered smart but she failed all her exams.
--------------------------------------------------


In [19]:
# Print out evaluation metrics separately to not crowd the sentences
sentences = ["The experiment yielded inconclusive results due to extraneous variables.", "The ecosystem's biodiversity is crucial for maintaining ecological balance.", "Although she was considered smart, she failed all her exams."]
simplified_sentences = ["The experiment yielded inconclusive results due to extraneous variables.", "La biodiversité de l'écosystème est cruciale pour l'équilibre écologique.", "She was considered smart but she failed all her exams."]
for sentence, simplified in zip(sentences, simplified_sentences):
    scores = scorer.score(sentence, simplified)
    for key in scores:
        print(f'{key}: {scores[key]}') 
    reading_ease = flesch_kincaid(sentence)
    simplified_reading_ease = flesch_kincaid(simplified)
    print(f"Flesch Reading Ease: {reading_ease:.2f} -> {simplified_reading_ease: .2f}")

rouge1: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rouge2: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rougeL: Score(precision=1.0, recall=1.0, fmeasure=1.0)
Flesch Reading Ease: -18.50 -> -18.50
rouge1: Score(precision=0.08333333333333333, recall=0.1, fmeasure=0.0909090909090909)
rouge2: Score(precision=0.0, recall=0.0, fmeasure=0.0)
rougeL: Score(precision=0.08333333333333333, recall=0.1, fmeasure=0.0909090909090909)
Flesch Reading Ease: -23.27 -> -1.00
rouge1: Score(precision=0.9, recall=0.9, fmeasure=0.9)
rouge2: Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778)
rougeL: Score(precision=0.9, recall=0.9, fmeasure=0.9)
Flesch Reading Ease: 78.25 ->  86.71
