In [1]:
# Import Libraries
import pandas as pd # Handles tabular data (DataFrames) and CSV reading
import numpy as np # Provides numerical and array operations
import matplotlib.pyplot as plt # Creates visualizations
import seaborn as sns # Enhances visualizations with attractive statistical plots
import nltk # Provides NLP tools like tokenization, stopwords, and text processing
from nltk.corpus import stopwords # Contains common stopwords (e.g., "the", "and", "is") to remove from text
from nltk.tokenize import word_tokenize # Splits text into individual words (tokens)
from wordcloud import WordCloud  # Creates word clouds from text data


In [2]:
import pandas as pd

# Load Hamlet dataset
hamlet_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-09-17/hamlet.csv'
df_hamlet = pd.read_csv(hamlet_url)


In [3]:
import os # Provides a way to interact with the operating system, used here to set environment variables
from transformers import pipeline # Loads pre-trained NLP models for summarization and paraphrasing 

# Fix Hugging Face symlink warning for Windows
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load a summarization model explicitly
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Get first dialogue text
dialogue_text = df_hamlet['dialogue'].iloc[0]

# Dynamically adjust max_length to avoid warnings
max_length = min(len(dialogue_text.split()) + 10, 50)

# Generate summary
summary = summarizer(dialogue_text, max_length=max_length, min_length=10, do_sample=False)

# Output results
print("Original Text:\n", dialogue_text)
print("\nSummarized Text:\n", summary[0]['summary_text'])



Device set to use cpu
Your max_length is set to 18, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


Original Text:
 FRANCISCO at his post. Enter to him BERNARDO

Summarized Text:
 The Daily Discussion is open to everyone. Please share your thoughts on this week


In [4]:
#Evaluate if the T5 model could successfully reword structured inputs with using prompts 
from transformers import pipeline # Loads pre-trained NLP models for summarization and paraphrasing 

paraphraser = pipeline("text2text-generation", model="t5-base") #Load the t5-base model as a text-to-text generation pipeline.

# Reformat input to sound more like a sentence
dialogue_text = "The scene opens with FRANCISCO at his post. Then BERNARDO enters."

prompt = f"Paraphrase this: {dialogue_text}" # Create a prompt 
response = paraphraser(prompt, max_length=50, min_length=10, do_sample=False) #Send the prompt to the model for paraphrasing.

print("\nParaphrased Text:\n", response[0]['generated_text']) #Print the paraphrased text from the response.


Device set to use cpu



Paraphrased Text:
 : The scene opens with FRANCISCO at his post. Then BERNARDO enters.


In [38]:
#Prepare Data: Labeled Examples
from datasets import Dataset #Loads the Dataset class from Hugging Face’s datasets library.
#Dataset class is used to create and manage datasets for training language models.

# Create labeled examples
# Simulates a mini supervised dataset: each example teaches the model how to rewrite a sentence.
train_data = [
    {"input": "paraphrase: FRANCISCO at his post. Enter to him BERNARDO", 
     "output": "The scene opens with FRANCISCO at his post. Then BERNARDO enters."},
    {"input": "paraphrase: Nay, answer me: stand, and unfold yourself.", 
     "output": "Please answer and identify yourself."},
    {"input": "paraphrase: Long live the king!", 
     "output": "May the king live long!"},
    {"input": "paraphrase: Bernardo?", 
     "output": "Is that you, Bernardo?"},
    {"input": "paraphrase: You come most carefully upon your hour.", 
     "output": "You arrived exactly on time."}
]

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(train_data) #Converts the list into a Hugging Face Dataset object so it can be tokenized, batched, and used for training.
dataset = dataset.train_test_split(test_size=0.2) #Split the dataset


In [52]:
#Load Model & Tokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-base"  
#Breaks down text into tokens that the model can understand and turns predictions back into text.
#SentencePiece tokenizer, which splits text into subwords. It treats input as a raw stream of characters, making it flexible for many languages. 
tokenizer = T5Tokenizer.from_pretrained(model_name) 
#T5 model
model = T5ForConditionalGeneration.from_pretrained(model_name) 



In [53]:
#Preprocess the Dataset
#Set the maximum number of tokens allowed in the input and output sentences.
#Ensures all sequences have the same length, which is important for model training.
max_input_length = 64
max_target_length = 64

def preprocess(example):
    inputs = tokenizer(example["input"], max_length=max_input_length, padding="max_length", truncation=True)
    targets = tokenizer(example["output"], max_length=max_target_length, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True) #Applies the preprocess function to every item in the dataset. 
#batched=True lets it process multiple examples at once (faster).


In [54]:
#Fine-Tune the Model
from transformers import TrainingArguments, Trainer

# Define training arguments to define how the model should be trained
training_args = TrainingArguments(
    output_dir="./t5-paraphrase",
    evaluation_strategy="epoch", 
    learning_rate=5e-5, # How fast model updates weights
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,
    num_train_epochs=10, 
    weight_decay=0.01, #regularization technique used during training to prevent overfitting
    logging_dir='./logs',
    logging_steps=10
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# Train the model. Now the T5 model is being fine-tuned on paraphrased examples. The model is learning how to reword.
trainer.train()

# Paraphrasing function (after training is done)
#This function takes new input, uses fine-tuned model to generate a paraphrase, and returns it.
def paraphrase_text(text):
    input_text = f"paraphrase: {text}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True) 

# Test the paraphrasing after training
print(paraphrase_text("FRANCISCO at his post. Enter to him BERNARDO"))



  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,16.360489
2,No log,13.677347
3,No log,12.208059
4,No log,11.193885
5,12.081200,10.306516
6,12.081200,9.548788
7,12.081200,8.96421
8,12.081200,8.55994
9,12.081200,8.309845
10,7.876300,8.202006


Paraphrase: FRANCISCO at his post. Enter to him BERNARDO at his post.
