In [None]:
pip install transformers



In [None]:
pip install nltk



In [None]:
pip install spacy



In [None]:
pip install python-docx

In [None]:
#Importing required libraries.
import nltk
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from docx import Document
from transformers import BartTokenizer, BartForConditionalGeneration


# Fetch the NLTK data this notebook relies on. Newer NLTK releases resolve
# sent_tokenize/word_tokenize through the 'punkt_tab' resource and raise a
# LookupError if only the legacy 'punkt' pickle is present, so both are
# requested; nltk.download() just reports (does not raise) when a resource
# is unknown to an older index.
for resource in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Small English spaCy pipeline.
spacy.cli.download("en_core_web_sm")

from nltk.tokenize import sent_tokenize, word_tokenize

# Input documents, in processing order: chapters 1-30, then the foreword and
# the closing "Onward" essay, then the five section introductions.
_chapterPaths = [f'/content/P2025Chapter{number}.docx' for number in range(1, 31)]
_frontBackPaths = ['/content/P2025Foreword.docx', '/content/P2025Onward.docx']
_sectionPaths = [f'/content/P2025Section{number}.docx' for number in range(1, 6)]
filePaths = _chapterPaths + _frontBackPaths + _sectionPaths

#Defining a function to extract the text from the docx file.
def docxTextExtraction(docx_file):
    """Return the text of a .docx file, one body paragraph per line.

    Only the paragraphs exposed by ``Document(...).paragraphs`` are read;
    they are joined with a single newline.
    """
    paragraphTexts = []
    for paragraph in Document(docx_file).paragraphs:
        paragraphTexts.append(paragraph.text)
    return "\n".join(paragraphTexts)

documents = []

# Load every input file. A file that cannot be read is reported and skipped
# so that one bad path does not abort the whole run.
for path in filePaths:
    try:
        text = docxTextExtraction(path)
    except Exception as e:
        print(f"Failed to load {path}: {str(e)}")
    else:
        documents.append(text)
        print(f"Loaded {path} successfully")


# NOTE: everything below used to be indented inside the loading loop above,
# which redefined the functions and re-ran the full summarization over all
# documents loaded so far on every iteration. It now runs exactly once,
# after loading has finished.

def preprocessCorp(text):
    """Normalize whitespace and strip the recurring book title from *text*.

    Every run of whitespace (including newlines) is collapsed to one space,
    then the long and short forms of the title are removed. The long form is
    removed first so the short form does not leave its subtitle behind.
    """
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("Mandate for Leadership: The Conservative Promise", "")
    text = text.replace("Mandate for Leadership", "")
    return text


def textChunks(text, max_tokens=200):
    """Split *text* into chunks of whole sentences of at most *max_tokens* words.

    Sentences are never split: a single sentence longer than *max_tokens*
    becomes a chunk of its own. Lengths are measured in NLTK word tokens,
    not BART sub-word tokens. Returns a list of strings; empty input gives
    an empty list.
    """
    chunks = []
    currentChunk = []
    currentLength = 0

    for sentence in sent_tokenize(text):
        sentenceLength = len(nltk.word_tokenize(sentence))
        # Close the running chunk only if it already holds something;
        # otherwise an over-long first sentence would emit an empty chunk.
        if currentChunk and currentLength + sentenceLength > max_tokens:
            chunks.append(" ".join(currentChunk))
            currentChunk = []
            currentLength = 0
        currentChunk.append(sentence)
        currentLength += sentenceLength

    # Flush the trailing chunk once, after the loop. Doing this inside the
    # loop appended the growing chunk after every sentence.
    if currentChunk:
        chunks.append(" ".join(currentChunk))
    return chunks


_BART_MODEL_NAME = "facebook/bart-large-cnn"
_bartCache = {}


def _loadBart():
    """Return the (tokenizer, model) pair, loading it on first use only."""
    if not _bartCache:
        _bartCache["tokenizer"] = BartTokenizer.from_pretrained(_BART_MODEL_NAME)
        _bartCache["model"] = BartForConditionalGeneration.from_pretrained(_BART_MODEL_NAME)
    return _bartCache["tokenizer"], _bartCache["model"]


def summarizeWithBart(text, max_length=100, min_length=50):
    """Summarize one piece of text with facebook/bart-large-cnn.

    Args:
        text: Text to summarize; input beyond 1024 model tokens is truncated.
        max_length: Upper bound on the generated summary, in model tokens.
        min_length: Lower bound on the generated summary, in model tokens.

    Returns:
        The decoded summary string, or "" for blank input.
    """
    if not text.strip():
        return ""
    # The model is cached: reloading the weights for every chunk dominated
    # the run time.
    tokenizer, model = _loadBart()
    # No task prefix: "summarize: " is a T5 convention; for this BART
    # checkpoint it would simply become part of the text being summarized.
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    summaryIDs = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summaryIDs[0], skip_special_tokens=True)


def summarizeWithBartLarge(text, max_tokens=200, max_length=100, min_length=50, max_summary_length_ratio=0.2):
    """Summarize a long document chunk by chunk, within a length budget.

    The text is split with ``textChunks`` and chunks are summarized in
    document order until adding the next summary would push the total past
    ``max_summary_length_ratio`` times the document length (both measured in
    NLTK word tokens). Chunks after that point are not summarized.

    Args:
        text: Full document text.
        max_tokens: Maximum words per chunk passed to ``textChunks``.
        max_length: Per-chunk ``max_length`` for ``summarizeWithBart``.
        min_length: Per-chunk ``min_length`` for ``summarizeWithBart``.
        max_summary_length_ratio: Summary budget as a fraction of the
            document length.

    Returns:
        The chunk summaries joined with spaces; "" if the text has no
        sentences.
    """
    chunks = textChunks(text, max_tokens=max_tokens)
    totalLength = sum(len(nltk.word_tokenize(chunk)) for chunk in chunks)
    targetLength = int(totalLength * max_summary_length_ratio)

    summaries = []
    currentSummaryLength = 0
    for chunk in chunks:
        summary = summarizeWithBart(chunk, max_length=max_length, min_length=min_length)
        summaryLength = len(nltk.word_tokenize(summary))
        # The first summary is always kept: for a short document even one
        # summary can exceed the budget, and returning nothing would leave
        # the caller with an empty string to cluster.
        if summaries and currentSummaryLength + summaryLength > targetLength:
            break
        summaries.append(summary)
        currentSummaryLength += summaryLength

    return " ".join(summaries)


def assembleParagraphs(summary, numTopics='auto'):
    """Regroup the sentences of *summary* into paragraphs by LDA topic.

    Each sentence is TF-IDF vectorized and assigned to its highest-weight
    LDA topic; the sentences of one topic form one paragraph, and paragraphs
    are separated by a blank line. Topics that attract no sentence are
    omitted.

    Args:
        summary: Text to restructure.
        numTopics: Number of LDA topics, or 'auto' for one topic per five
            sentences, at least 1 and at most 10.

    Returns:
        The restructured text. Input with fewer than two sentences, or with
        no usable vocabulary, is returned as its sentences joined by spaces.
    """
    sentences = sent_tokenize(summary)
    if len(sentences) < 2:
        # Nothing to cluster.
        return " ".join(sentences)

    if numTopics == 'auto':
        # Floor of 1: fewer than five sentences used to yield 0 topics,
        # which LatentDirichletAllocation rejects.
        numTopics = max(1, min(10, len(sentences) // 5))

    try:
        X = TfidfVectorizer(stop_words='english').fit_transform(sentences)
    except ValueError:
        # Raised when no vocabulary remains (e.g. only stop words).
        return " ".join(sentences)

    lda = LatentDirichletAllocation(n_components=numTopics, random_state=0)
    topics = lda.fit_transform(X)
    clusters = np.argmax(topics, axis=1)

    clusteredSentences = [[] for _ in range(numTopics)]
    for sentence, cluster in zip(sentences, clusters):
        clusteredSentences[cluster].append(sentence)

    # Skip empty topics so they do not show up as blank paragraphs.
    return "\n\n".join(" ".join(cluster) for cluster in clusteredSentences if cluster)


# Summarize each successfully loaded document. The printed number is the
# position among loaded documents, which differs from the position in
# filePaths if any file failed to load.
for i, doc in enumerate(documents):
    doc = preprocessCorp(doc)

    summary = summarizeWithBartLarge(doc, max_tokens=200, max_length=100, min_length=50)
    structuredSummary = assembleParagraphs(summary, numTopics='auto')
    print(f"Document {i+1} - Summary: {structuredSummary}")