In [None]:
# Install required NLP and evaluation libraries and import core dependencies
!pip install transformers sentencepiece rouge-score

from transformers import pipeline
import requests
import textwrap
from rouge_score import rouge_scorer
import pandas as pd


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d945462b173a7b3a16691da8559a27ffdd75d4baa6018ee0945b836f96284026
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# Download raw book text from Project Gutenberg using its URL
url = "https://www.gutenberg.org/cache/epub/67979/pg67979.txt"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# treating an error page as the book text.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_text = response.text

print("Total characters:", len(raw_text))

Total characters: 416444


In [None]:
# Remove the Project Gutenberg header and footer.
# A fixed "skip the first 50 lines" cut lands in the middle of the table of
# contents and leaves the trailing license text in place, so instead cut at the
# "*** START OF ..." / "*** END OF ..." marker lines when they are present.
lines = raw_text.split("\n")

start_idx = next(
    (n for n, line in enumerate(lines) if line.startswith("*** START OF")),
    None,
)
end_idx = next(
    (n for n, line in enumerate(lines) if line.startswith("*** END OF")),
    None,
)

# Fall back to the previous 50-line heuristic if no start marker is found.
first_line = start_idx + 1 if start_idx is not None else 50
# Only honour the end marker when it comes after the chosen start.
if end_idx is not None and end_idx > first_line:
    last_line = end_idx
else:
    last_line = len(lines)

cleaned = "\n".join(lines[first_line:last_line])

# Drop carriage returns left over from CRLF line endings, then trim the edges.
cleaned = cleaned.replace("\r", "").strip()

print(cleaned[:800])


CHAPTER VIII
 CHAPTER IX
 CHAPTER X
 CHAPTER XI
 CHAPTER XII
 CHAPTER XIII
 CHAPTER XIV
 CHAPTER XV
 CHAPTER XVI
 CHAPTER XVII
 CHAPTER XVIII
 CHAPTER XIX
 CHAPTER XX
 CHAPTER XXI
 CHAPTER XXII
 CHAPTER XXIII
 CHAPTER XXIV
 CHAPTER XXV
 CHAPTER XXVI
 CHAPTER XXVII
 CHAPTER XXVIII
 CHAPTER XXIX
 CHAPTER XXX
 CHAPTER XXXI
 CHAPTER XXXII
 CHAPTER XXXIII
 CHAPTER XXXIV
 CHAPTER XXXV
 CHAPTER XXXVI
 CHAPTER XXXVII
 CHAPTER XXXVIII
 CHAPTER XXXIX
 CHAPTER XL
 CHAPTER XLI
 CHAPTER XLII
 CHAPTER XLIII
 CHAPTER XLIV
 CHAPTER XLV




THE BLUE CASTLE




CHAPTER I


If it had not rained on a certain May morning Valancy Stirling’s whole
life would have been entirely different. She would have gone, with the
rest of her clan, to Aunt Wellington’s engagement picnic and Dr. Trent
would have gone to Montre


In [None]:
# Split the cleaned text into pieces of at most 1200 characters each,
# breaking on whitespace via the standard-library text wrapper.
chunk_wrapper = textwrap.TextWrapper(width=1200)
chunks = chunk_wrapper.wrap(cleaned)
len(chunks)


340

In [None]:
# Build the Hugging Face summarization pipeline; the same checkpoint id is
# used for both the model weights and the tokenizer.
bart_checkpoint = "facebook/bart-large-cnn"
summarizer = pipeline(
    task="summarization",
    model=bart_checkpoint,
    tokenizer=bart_checkpoint,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# Generate summaries for a subset of text chunks using the summarization pipeline.
# Only the first 30 chunks are processed to keep the run time manageable.
summaries = []

for chunk in chunks[:30]:
    result = summarizer(
        chunk,
        max_length=150,
        min_length=60,
        do_sample=False,   # deterministic decoding
        truncation=True,   # guard: clip any chunk that exceeds the model's input limit
    )
    # The pipeline returns a list with one dict per input text.
    summaries.append(result[0]["summary_text"])

summaries[:3]


["Valancy Stirling's whole life would have been entirely different if it had not rained on a certain May morning. She would have gone, with the rest of her clan, to Aunt Wellington’s engagement picnic. But it did rain and you shall hear what happened to her because of it.",
 'Valancy had never quite relinquished a certain pitiful, shamed, little hope that Romance would come her way yet. The tears came into her eyes as she lay there alone in the faintly greying darkness. She dared not let herself cry as hard as she wanted to. She was afraid that crying might bring on another attack of that pain around the heart.',
 '“I answered with the plain truth, ‘I am crying because I cannot get married’ How horrified Mother would be,” Valancy thought. “It is not,’ Valancy could hear her mother’s prim, dictatorial voice asserting, “it is not _maidenly_ to think about _men_.”']

In [None]:
# Score each generated summary against the chunk it was produced from.
# The chunk is passed as the first argument and the summary as the second.
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

rouge_results = [
    scorer.score(chunks[idx], summary)
    for idx, summary in enumerate(summaries)
]

rouge_results[0]


{'rouge1': Score(precision=1.0, recall=0.2602040816326531, fmeasure=0.4129554655870446),
 'rouge2': Score(precision=0.94, recall=0.24102564102564103, fmeasure=0.3836734693877551),
 'rougeL': Score(precision=0.803921568627451, recall=0.20918367346938777, fmeasure=0.3319838056680163)}

In [None]:
# Collect the F-measure of every ROUGE variant into one table, one row per chunk.
rouge_columns = ["Chunk", "ROUGE-1", "ROUGE-2", "ROUGE-L"]

rows = [
    [
        idx,
        result['rouge1'].fmeasure,
        result['rouge2'].fmeasure,
        result['rougeL'].fmeasure,
    ]
    for idx, result in enumerate(rouge_results)
]

df_rouge = pd.DataFrame(rows, columns=rouge_columns)

df_rouge.head()


Unnamed: 0,Chunk,ROUGE-1,ROUGE-2,ROUGE-L
0,0,0.412955,0.383673,0.331984
1,1,0.439716,0.414286,0.439716
2,2,0.336,0.306452,0.32
3,3,0.377953,0.31746,0.259843
4,4,0.437037,0.425373,0.377778


In [None]:
# Save all generated summaries to a text file for later review.
# The summaries contain non-ASCII characters (curly quotes), so the encoding is
# fixed to UTF-8 instead of relying on the platform's default locale encoding.
with open("summaries.txt", "w", encoding="utf-8") as f:
    for number, summary in enumerate(summaries, start=1):
        f.write(f"=== Summary {number} ===\n")
        f.write(summary + "\n\n")
