In [None]:
# Ensure required resources are downloaded.
# nltk is imported here because this cell precedes the main import cell;
# without it a top-to-bottom run on a fresh kernel fails with NameError.
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import nltk
import heapq
import re
from transformers import pipeline
from bs4 import BeautifulSoup
import requests



# Sample text for summarization: a short passage about NLP that is passed to
# both the extractive and the abstractive summarizer in the demo calls.
text = """
Natural Language Processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret, and produce human language.
Applications include machine translation, sentiment analysis, and chatbots. NLP uses machine learning models trained on vast datasets.
"""

# Extractive Summarization using NLTK
def extractive_summarization(text, num_sentences=2):
    """Return the highest-scoring sentences of *text* as an extractive summary.

    Every alphanumeric, non-stopword token is counted (case-insensitively) and
    its count is normalized by the count of the most frequent token. A
    sentence's score is the sum of the normalized weights of the tokens it
    contains; the ``num_sentences`` best-scoring sentences are returned.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to include.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not in document order). An empty string is returned
        when there is nothing to score: empty or whitespace-only input, a
        non-positive ``num_sentences``, or text made up solely of stopwords
        and punctuation.
    """
    # Nothing to summarize; bail out before doing any tokenization work.
    if not text or not text.strip() or num_sentences <= 0:
        return ""

    sentences = nltk.sent_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words("english"))

    word_frequencies = {}
    for token in nltk.word_tokenize(text):
        token = token.lower()
        if token.isalnum() and token not in stop_words:
            word_frequencies[token] = word_frequencies.get(token, 0) + 1

    # Without this guard max() below raises ValueError on an empty table
    # (e.g. the text contains only stopwords or punctuation).
    if not word_frequencies:
        return ""

    max_freq = max(word_frequencies.values())
    weights = {word: count / max_freq for word, count in word_frequencies.items()}

    sentence_scores = {}
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence.lower()):
            if token in weights:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weights[token]

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

# Abstractive Summarization using Hugging Face Transformer

# Loaded summarization pipelines keyed by model name, so the model weights are
# loaded once per session instead of on every call.
_SUMMARIZER_CACHE = {}


def abstractive_summarization(text, max_length=50, min_length=10,
                              model="sshleifer/distilbart-cnn-12-6"):
    """Generate an abstractive summary of *text* with a Hugging Face pipeline.

    Args:
        text: Document to summarize.
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline call.
        min_length: Lower bound on the generated summary length, forwarded to
            the pipeline call.
        model: Name of the summarization model to load. The default is the
            model the pipeline previously picked implicitly; naming it
            explicitly keeps results reproducible if the library default
            changes.

    Returns:
        The generated summary text, or an empty string for empty or
        whitespace-only input.
    """
    # Nothing to summarize; avoid loading a large model for no input.
    if not text or not text.strip():
        return ""

    summarizer = _SUMMARIZER_CACHE.get(model)
    if summarizer is None:
        summarizer = pipeline("summarization", model=model)
        _SUMMARIZER_CACHE[model] = summarizer

    # do_sample=False keeps decoding deterministic.
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Example for Multi-Document Summarization (fetches and summarizes one Wikipedia article)
def get_wikipedia_summary(topic, timeout=10):
    """Fetch an English Wikipedia article and return an extractive summary.

    Args:
        topic: Article title as it appears in the URL path, e.g.
            ``"Natural_language_processing"``. Spaces are converted to
            underscores and unsafe characters are percent-encoded.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extractive summary of the first five ``<p>`` elements of the page.

    Raises:
        requests.HTTPError: If Wikipedia responds with an error status
            (for example 404 for an unknown title).
        requests.RequestException: On connection failures or timeouts.
    """
    from urllib.parse import quote  # local import: only this function needs it

    # '/' and ':' are legitimate in titles; '%' is kept so titles that the
    # caller already percent-encoded are not encoded twice.
    title = quote(topic.strip().replace(" ", "_"), safe="/:%")
    url = f"https://en.wikipedia.org/wiki/{title}"

    # Without a timeout requests can block indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of summarizing the text of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text = ' '.join(para.text for para in paragraphs[:5])  # First few paragraphs
    return extractive_summarization(text)

# Run each summarizer demo in turn, printing a heading followed by its result.
# The calls are wrapped in lambdas so each summary is computed only after its
# heading has been printed.
demos = (
    ("Extractive Summary:", lambda: extractive_summarization(text)),
    ("\nAbstractive Summary:", lambda: abstractive_summarization(text)),
    (
        "\nMulti-Document Summary Example (Wikipedia - NLP):",
        lambda: get_wikipedia_summary("Natural_language_processing"),
    ),
)

for heading, run_demo in demos:
    print(heading)
    print(run_demo())


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Extractive Summary:

Natural Language Processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret, and produce human language. NLP uses machine learning models trained on vast datasets.

Abstractive Summary:


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu


 Natural Language Processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret, and produce human language . NLP uses machine learning models trained on vast datasets .

Multi-Document Summary Example (Wikipedia - NLP):
The premise of symbolic NLP is well-summarized by John Searle's Chinese room experiment: Given a collection of rules (e.g., a Chinese phrasebook, with questions and matching answers), the computer emulates natural language understanding (or other NLP tasks) by applying those rules to the data it confronts. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing.


Summarized Text:
 PageRank is used in NLP for ranking sentences. PageRank is also useful in keyword extraction.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pip install numpy networkx nltk scikit-learn




In [None]:
import numpy as np
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data. Recent NLTK releases load the sentence
# tokenizer models from 'punkt_tab', so fetch it alongside the legacy 'punkt'
# package to keep sent_tokenize working on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

def text_rank_summarization(text, num_sentences=2):
    """
    Summarizes the input text using TextRank (PageRank-based ranking).

    Sentences are vectorized with TF-IDF, connected in a graph whose edge
    weights are pairwise cosine similarities, and ranked with PageRank.

    Parameters:
        text (str): The input document.
        num_sentences (int): Number of sentences to include in summary.

    Returns:
        str: Extracted summary, with sentences ordered from highest to lowest
        PageRank score. An empty string is returned for empty or
        whitespace-only text and for a non-positive ``num_sentences``. If no
        sentence contains a token the vectorizer can index, the leading
        sentences are returned unranked.
    """
    # Nothing to summarize; a negative count would otherwise slice off the
    # tail of the ranking instead of returning nothing.
    if not text or not text.strip() or num_sentences <= 0:
        return ""

    # 1. Sentence tokenization
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return ""

    # 2. Compute TF-IDF vectors
    vectorizer = TfidfVectorizer()
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only punctuation or
        # stop words); there is nothing to rank on, so keep document order.
        return " ".join(sentences[:num_sentences])

    # 3. Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(X)

    # 4. Create a graph and apply PageRank
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # 5. Rank sentences by PageRank score (ties fall back to sentence text)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # 6. Generate summary from the top sentences
    return " ".join(s for _, s in ranked_sentences[:num_sentences])

# Example usage: summarize a short passage about PageRank down to its two
# top-ranked sentences. Note that this rebinds the module-level name `text`.
text = """PageRank is a graph-based ranking algorithm originally developed for ranking web pages.
          It has been adapted to NLP tasks such as extractive text summarization.
          The algorithm constructs a similarity graph where nodes are text units and edges represent similarity.
          It applies a random walk model to rank the most important sentences.
          This helps in selecting the most representative information in a document."""

summary = text_rank_summarization(text, num_sentences=2)

# print() separates its two arguments with a space, so the summary line is
# emitted with one leading space.
print(" Extracted Summary:\n", summary)


 Extracted Summary:
 It applies a random walk model to rank the most important sentences. The algorithm constructs a similarity graph where nodes are text units and edges represent similarity.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
