<a href="https://colab.research.google.com/github/faezehhyd/NLP_Assignment_2/blob/main/Assignment_2_Case_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker

Collecting faker
  Downloading Faker-22.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-22.2.0


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from faker import Faker

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
"""Generate a word frequency table from the given text.
This table contains the frequency of each word in the text, excluding stop words.
"""
def generate_word_frequency_table(text):
    """Build a frequency table of the meaningful words in ``text``.

    The text is lower-cased and split with ``word_tokenize``. A token is
    counted only if it is purely alphanumeric and is not in NLTK's English
    stop-word list.

    Args:
        text: The raw document text.

    Returns:
        A ``FreqDist`` mapping each kept token to its number of occurrences.
    """
    english_stop_words = set(stopwords.words('english'))

    def is_countable(token):
        # Drop punctuation / mixed tokens first, then common stop words.
        return token.isalnum() and token not in english_stop_words

    lowered_tokens = word_tokenize(text.lower())
    return FreqDist(token for token in lowered_tokens if is_countable(token))


def tokenize_sentences(text):
    """Split ``text`` into sentences.

    Args:
        text: The raw document text.

    Returns:
        Whatever ``sent_tokenize`` produces for ``text`` (its sentences, in
        document order).
    """
    sentence_list = sent_tokenize(text)
    return sentence_list

def score_sentences(sentences, word_frequency_table):
    """Score each sentence by the frequencies of the table words it contains.

    For every sentence, the frequencies of all distinct table words that
    occur in it as whole words are summed, and the sum is divided by the
    number of whitespace-separated words in the *full* sentence.

    Scores are keyed by the first 10 characters of the sentence
    (``sentence[:10]``), which is the lookup key ``generate_summary`` uses.
    Sentences that share the same 10-character prefix are pooled: their raw
    scores and their word counts are both summed before dividing.

    Args:
        sentences: Iterable of sentence strings.
        word_frequency_table: Mapping of lower-case word -> frequency
            (e.g. the ``FreqDist`` from ``generate_word_frequency_table``).

    Returns:
        dict mapping ``sentence[:10]`` -> normalized score. Sentences that
        contain no table word get no entry.
    """
    import re

    raw_scores = {}
    word_counts = {}

    for sentence in sentences:
        # Compare whole alphanumeric tokens rather than substrings, so that
        # "cat" no longer matches inside "category". Using a set keeps the
        # original "each distinct word counts once per sentence" semantics.
        tokens = set(re.findall(r"[^\W_]+", sentence.lower()))
        matched = [
            word_frequency_table[token]
            for token in tokens
            if token in word_frequency_table
        ]
        if not matched:
            continue

        key = sentence[:10]
        raw_scores[key] = raw_scores.get(key, 0) + sum(matched)
        # Normalize by the length of the whole sentence, not of the
        # 10-character key used for lookup.
        word_counts[key] = word_counts.get(key, 0) + len(sentence.split())

    return {key: raw_scores[key] / word_counts[key] for key in raw_scores}


def find_threshold(sentence_scores):
    """Return the average sentence score, used as the selection threshold.

    Args:
        sentence_scores: Mapping of sentence key -> numeric score.

    Returns:
        The arithmetic mean of the scores, or ``0.0`` when there are no
        scores at all (instead of raising ``ZeroDivisionError``).
    """
    if not sentence_scores:
        # No scored sentences: nothing to average.
        return 0.0
    return sum(sentence_scores.values()) / len(sentence_scores)


def generate_summary(sentences, sentence_scores, threshold, max_tokens):
    """Build a summary from the sentences whose score reaches ``threshold``.

    Sentences are visited in their original order. A sentence is added when
    its score (looked up by ``sentence[:10]``) is at least ``threshold`` and
    adding it keeps the running whitespace-token count within
    ``max_tokens``. A sentence that does not fit is skipped, and later,
    shorter sentences may still be added.

    Args:
        sentences: Sentence strings in document order.
        sentence_scores: Mapping of ``sentence[:10]`` -> score.
        threshold: Minimum score a sentence needs to be selected.
        max_tokens: Upper bound on the summary's whitespace-token count.

    Returns:
        The selected sentences joined by single spaces, with no leading
        space; an empty string when nothing is selected.
    """
    selected = []
    total_tokens = 0

    for sentence in sentences:
        score = sentence_scores.get(sentence[:10])
        if score is None or score < threshold:
            continue

        sentence_tokens = len(sentence.split())
        if total_tokens + sentence_tokens > max_tokens:
            continue

        selected.append(sentence)
        total_tokens += sentence_tokens

    # Joining once avoids repeated string concatenation and the stray
    # leading space the incremental `" " + sentence` approach produced.
    return " ".join(selected)


def generate_fake_document(paragraphs):
    """Create a random document made of ``paragraphs`` Faker paragraphs.

    Each paragraph is followed by a newline. Tokens are counted per
    paragraph with ``word_tokenize`` and summed.

    Args:
        paragraphs: Number of paragraphs to generate.

    Returns:
        A ``(document_text, total_tokens)`` tuple.
    """
    generator = Faker()
    chunks = []
    token_count = 0

    for _ in range(paragraphs):
        chunk = generator.paragraph() + "\n"
        chunks.append(chunk)
        token_count += len(word_tokenize(chunk))

    return "".join(chunks), token_count

def collate_summaries(document_summaries, steps):
    """Group summaries into consecutive chunks of ``steps`` and join each chunk.

    Args:
        document_summaries: List of summary strings.
        steps: Chunk size; each group of up to ``steps`` consecutive
            summaries is joined with a single space.

    Returns:
        List with one joined string per chunk, in order.
    """
    chunk_starts = range(0, len(document_summaries), steps)
    return [
        " ".join(document_summaries[start:start + steps])
        for start in chunk_starts
    ]


def _summarize_document(fake_document, steps, max_tokens):
    """Run the multi-step extractive summarization for a single document.

    Returns the last collated summary, or an empty string when ``steps``
    is 0 and therefore no summary is produced.
    """
    print("\tgenerating word frequency table")
    word_frequency_table = generate_word_frequency_table(fake_document)
    print("\ttokenizing sentences")
    sentences = tokenize_sentences(fake_document)
    print("\tscoring sentences")
    sentence_scores = score_sentences(sentences, word_frequency_table)
    print("\tfinding threshold")
    # NOTE(review): the threshold is computed once from the original document
    # and reused for every later step, exactly as before — confirm whether it
    # should be recomputed after each re-scoring.
    threshold = find_threshold(sentence_scores)
    print("\tgenerate summary")
    print("\n")

    document_summaries = []
    collated_summaries = []
    for _ in range(steps):
        summary = generate_summary(sentences, sentence_scores, threshold, max_tokens)
        document_summaries.append(summary)

        # Collate summaries
        collated_summaries = collate_summaries(document_summaries, steps)
        collated_text = " ".join(collated_summaries)

        # Re-tokenize and re-score the collated text for the next step
        sentences = tokenize_sentences(collated_text)
        word_frequency_table = generate_word_frequency_table(collated_text)
        sentence_scores = score_sentences(sentences, word_frequency_table)

    return collated_summaries[-1] if collated_summaries else ""


def hierarchical_summarization(num_fake_documents, paragraphs_per_document, steps, max_tokens):
    """Generate fake documents and, if they exceed the context window, summarize them.

    The function generates ``num_fake_documents`` documents and reports their
    token counts. If the combined count is within ``max_tokens`` it returns
    without summarizing. Otherwise each generated document is summarized in
    ``steps`` rounds and three kinds of files are written to the working
    directory: ``fake_document_<i>.txt``, ``summary_<i>.txt`` and
    ``summary_of_summaries.txt`` (one final summary per line).

    ``summary_of_summaries.txt`` is always created/truncated, even when no
    summarization is needed.

    Args:
        num_fake_documents: Number of documents to generate.
        paragraphs_per_document: Paragraphs per generated document.
        steps: Number of summarization rounds per document.
        max_tokens: Context-window size; also the per-summary token budget.

    Returns:
        None.
    """
    total_tokens_all_documents = 0
    # Keep the generated documents so the ones that were token-counted are
    # the same ones that get summarized and saved.
    fake_documents = []

    # The context manager guarantees the file is closed on every path,
    # including the early return below and any exception.
    with open("summary_of_summaries.txt", "w", encoding="utf-8") as summary_of_summaries_file:
        for i in range(num_fake_documents):
            fake_document, total_tokens = generate_fake_document(paragraphs_per_document)
            fake_documents.append(fake_document)
            total_tokens_all_documents += total_tokens
            print("generating fake documents number", i + 1)
            print("total tokens in document:",total_tokens)
            print("\n")

        print("\nTotal tokens for all documents:", total_tokens_all_documents)

        # Check if the total tokens for all documents exceed the max_tokens (context window size)
        if total_tokens_all_documents > max_tokens:
            print("Exceeds the context window size. Summarizing all documents.")
            print("\n")
        else:
            print("Within the context window size. No summarization needed.")
            return

        # Summarize all documents
        for i, fake_document in enumerate(fake_documents):
            print("Summarizing Document", i+1)
            final_summary = _summarize_document(fake_document, steps, max_tokens)

            # Save documents and summaries to files
            with open(f"fake_document_{i + 1}.txt", "w", encoding="utf-8") as doc_file:
                doc_file.write(fake_document)

            # Save the last collated summary to the summary file
            with open(f"summary_{i + 1}.txt", "w", encoding="utf-8") as summary_file:
                summary_file.write(final_summary)

            # Append the last collated summary to the summary of summaries file
            summary_of_summaries_file.write(final_summary + "\n")

    print("Creating summary of summaries")

if __name__ == "__main__":
    # Run configuration: how many documents to generate, how long each one
    # is (in paragraphs), how many summarization steps to run, and the size
    # of the context window in tokens.
    num_fake_documents = 5
    paragraphs_per_document = 1000
    steps = 3
    max_tokens = 4096  # maximum allowed tokens in the context window

    # Echo the configuration, one "label value" pair per line.
    print(
        "\n".join(
            f"{label} {value}"
            for label, value in (
                ("number of documents:", num_fake_documents),
                ("paragraphs per document:", paragraphs_per_document),
                ("number of steps:", steps),
                ("max tokens(maximum allowed tokens in the context window):", max_tokens),
            )
        )
    )
    print("\n")

    hierarchical_summarization(
        num_fake_documents=num_fake_documents,
        paragraphs_per_document=paragraphs_per_document,
        steps=steps,
        max_tokens=max_tokens,
    )


number of documents: 5
paragraphs per document: 1000
number of steps: 3
max tokens(maximum allowed tokens in the context window): 4096


generating fake documents number 1
total tokens in document: 16417


generating fake documents number 2
total tokens in document: 16250


generating fake documents number 3
total tokens in document: 16153


generating fake documents number 4
total tokens in document: 15986


generating fake documents number 5
total tokens in document: 16229



Total tokens for all documents: 81035
Exceeds the context window size. Summarizing all documents.


Summarizing Document 1
	generating word frequency table
	tokenizing sentences
	scoring sentences
	finding threshold
	generate summary


Summarizing Document 2
	generating word frequency table
	tokenizing sentences
	scoring sentences
	finding threshold
	generate summary


Summarizing Document 3
	generating word frequency table
	tokenizing sentences
	scoring sentences
	finding threshold
	generate summary


Summarizi