In [None]:
pip install nltk

In [None]:
pip install spacy

In [None]:
pip install textstat

In [None]:
pip install python-docx

In [None]:
#Importing required libraries
import nltk
import spacy
from nltk.corpus import stopwords, brown
from docx import Document
from wordcloud import WordCloud
import textstat
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.tokenize import sent_tokenize
import seaborn as sns
from math import pi

# Download required NLTK data, one package at a time.
for resourceName in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'brown'):
    nltk.download(resourceName)

# Download spaCy model
spacy.cli.download("en_core_web_sm")


# Define file paths, in processing order: Chapter1..Chapter30, then the
# Foreword and Onward files, then Section1..Section5.
filePaths = (
    [f'/content/P2025Chapter{number}.docx' for number in range(1, 31)]
    + ['/content/P2025Foreword.docx', '/content/P2025Onward.docx']
    + [f'/content/P2025Section{number}.docx' for number in range(1, 6)]
)

#Defining a function to extract text from a .docx file.
def extractDocXText(docx_file):
    """Return the text of every paragraph in *docx_file*, one paragraph per line.

    docx_file is handed straight to docx.Document. Only the entries of
    ``doc.paragraphs`` are read, and their texts are joined with newlines.
    """
    paragraphTexts = (paragraph.text for paragraph in Document(docx_file).paragraphs)
    return "\n".join(paragraphTexts)

documents = []
names = []

#Loading every file. A file that cannot be read is reported and skipped so the
#remaining files still load; documents and names stay aligned index-for-index.
for path in filePaths:
    try:
        text = extractDocXText(path)
    except Exception as e:
        print(f"Failed to load {path}: {str(e)}")
    else:
        documents.append(text)
        names.append(path.split('/')[-1].replace('.docx', ''))
        print(f"Loaded {path} successfully")

#Combining all parts to use the whole document for analysis.
#This (and everything below) runs once, after loading has finished, rather than
#being repeated inside the loading loop for every file.
wholeDocument = " ".join(documents)


# Defining a function to preprocess the text by removing whitespace and words that often don't add context.
def preprocessCorp(text):
    """Return *text* with whitespace runs collapsed and the book title removed.

    Every run of whitespace becomes a single space first, so a title that was
    split across line breaks still matches the replacements that follow. The
    longer title is removed before its shorter prefix.
    """
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("Mandate for Leadership: The Conservative Promise", "")
    text = text.replace("Mandate for Leadership", "")
    return text


#Defining a function to calculate word and sentence statistics.
def calcStats(text):
    """Return (numWords, numSentences, avgSentenceLength) for *text*.

    numWords is the number of tokens from nltk.word_tokenize and numSentences
    the number of sentences from sent_tokenize. avgSentenceLength is their
    ratio, or 0 when no sentence was found.
    """
    words = nltk.word_tokenize(text)
    sentences = sent_tokenize(text)
    numWords = len(words)
    numSentences = len(sentences)
    avgSentenceLength = numWords / numSentences if numSentences != 0 else 0
    return numWords, numSentences, avgSentenceLength


#Defining a function to calculate readability metrics.
def calcReadability(text):
    """Return a dict mapping each readability metric label to its textstat value for *text*."""
    return {
        "Syllable Count": textstat.syllable_count(text),
        "Flesch Reading Ease": textstat.flesch_reading_ease(text),
        "Flesch-Kincaid Grade": textstat.flesch_kincaid_grade(text),
        "Gunning Fog": textstat.gunning_fog(text),
        "SMOG Index": textstat.smog_index(text),
        "Coleman-Liau Index": textstat.coleman_liau_index(text),
        "Automated Readability Index": textstat.automated_readability_index(text),
        "Dale-Chall Readability Score": textstat.dale_chall_readability_score(text),
        "Difficult Words": textstat.difficult_words(text)
    }


#Looping through each document to calculate statistics and readability metrics.
#Rows are keyed by column label, so they do not depend on the order in which
#calcReadability happens to list its metrics.
metricRows = []
for name, doc in zip(names, documents):
    doc = preprocessCorp(doc)

    numWords, numSentences, avgSentenceLength = calcStats(doc)
    readabilityMetrics = calcReadability(doc)

    metricRows.append({
        'Name': name,
        'Words': numWords,
        'Sentences': numSentences,
        'Avg Sentence Length': avgSentenceLength,
        **readabilityMetrics,
    })

#Creating a dataframe for calculation storage, one row per loaded document.
#It is built in a single call; with no loaded documents it is simply empty but
#still has every column, so later cells can reference calcsDf safely.
calcsDf = pd.DataFrame(metricRows, columns=['Name', 'Words', 'Sentences', 'Avg Sentence Length',
                                            'Syllable Count', 'Flesch Reading Ease', 'Flesch-Kincaid Grade',
                                            'Gunning Fog', 'SMOG Index', 'Coleman-Liau Index',
                                            'Automated Readability Index', 'Dale-Chall Readability Score',
                                            'Difficult Words'])

#Categorizing documents by Chapter, Section, or Other.
#The labels are tried in order, so 'Chapter' wins if a name contains both; a name
#containing neither label falls back to 'Other'.
calcsDf['Category'] = calcsDf['Name'].map(
    lambda docName: next((label for label in ('Chapter', 'Section') if label in docName), 'Other')
)

#Creating a radar chart to visualize readability metrics.
#Defining a function to create a grayscale radar chart.
def radarChart(df_row, title):
    """Draw and show a grayscale radar (polar) chart with one spoke per entry of *df_row*.

    df_row: labelled values exposing ``.index`` (spoke labels) and ``.values``
        (spoke lengths); the call site in this file passes the Series returned
        by ``DataFrame.mean()``.
    title: text placed above the chart.

    Values are plotted as given; nothing is rescaled or normalised here.
    """
    labels = list(df_row.index)
    spokeCount = len(labels)
    #One evenly spaced angle per label around the full circle.
    spokeAngles = [idx / float(spokeCount) * 2 * pi for idx in range(spokeCount)]
    spokeValues = df_row.values.flatten().tolist()

    #Repeating the first point at the end closes the outline.
    closedAngles = spokeAngles + spokeAngles[:1]
    closedValues = spokeValues + spokeValues[:1]

    plt.figure(figsize=(6, 6))
    ax = plt.subplot(111, polar=True)
    ax.plot(closedAngles, closedValues, linewidth=2, linestyle='solid', color='black')
    ax.fill(closedAngles, closedValues, 'grey', alpha=0.3)
    ax.set_xticks(spokeAngles)
    ax.set_xticklabels(labels, color='black', size=8)
    ax.set_title(title, color='black')
    plt.show()

#Aggregating the readability metrics for the entire book and creating a radar chart.
#Each metric is the mean of the per-document scores stored in calcsDf.
bookAgg = calcsDf.loc[:, [
    'Flesch Reading Ease',
    'Flesch-Kincaid Grade',
    'Gunning Fog',
    'SMOG Index',
    'Coleman-Liau Index',
    'Automated Readability Index',
    'Dale-Chall Readability Score',
]].mean()
radarChart(df_row=bookAgg, title='Readability Metrics - Whole Book')

#Creating a line chart for readability
#One grayscale line per Category, plotting each document's Flesch-Kincaid Grade.
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=calcsDf,
    x='Name',
    y='Flesch-Kincaid Grade',
    hue='Category',
    marker='o',
    palette='Greys',
)
#Turning the x tick labels (document names) vertical.
plt.xticks(rotation=90)
plt.gca().set(
    title='Flesch-Kincaid Grade Across Chapters and Sections',
    xlabel='Chapter/Section',
    ylabel='Flesch-Kincaid Grade',
)
plt.show()

#Creating a word cloud of the most used difficult words in the entire book.
#Building a word-frequency table from the Brown corpus, used to judge how rare a word is.
brownUCWords = brown.words()
#Counting case-insensitively: findDifficultWords looks words up with word.lower(),
#so the keys must be lowercased too. Otherwise capitalised occurrences are stored
#under a different key and the word looks rarer than it is.
freqDistBrown = nltk.FreqDist(word.lower() for word in brownUCWords)
#Defining a function to handle wordcloud specific preprocessing steps.
def preprocessForWordCloud(text):
    """Return the distinct content words of *text* as one space-separated string.

    Tokens are kept only if they are purely alphabetic and not English
    stopwords; they are lowercased and each word appears once, in order of
    first occurrence.

    NOTE(review): because duplicates are dropped, every word reaches the word
    cloud with the same count, so word size cannot reflect how often a word is
    used in the book. Confirm this is intended.
    """
    stopWords = set(stopwords.words('english'))
    words = [token.lower() for token in nltk.word_tokenize(text)
             if token.isalpha() and token.lower() not in stopWords]
    #dict.fromkeys de-duplicates while keeping first-seen order. A plain set would
    #give an order that changes between interpreter runs, making the output
    #non-reproducible.
    return " ".join(dict.fromkeys(words))

#Defining a function to find difficult words based on syllable count and rarity (using Brown University corpus).
def findDifficultWords(text, min_syllables=4, min_frequency=10):
    """Return the tokens of *text* that are both long and rare, in original order.

    A token is kept when textstat counts at least *min_syllables* syllables in
    it AND its lowercased form occurs at most *min_frequency* times in
    freqDistBrown. Despite its name, *min_frequency* is therefore the highest
    corpus count a word may have and still be treated as rare.
    """
    selected = []
    for token in nltk.word_tokenize(text):
        #Too few syllables: not difficult.
        if textstat.syllable_count(token) < min_syllables:
            continue
        #Too common in the reference corpus: not difficult.
        if freqDistBrown[token.lower()] > min_frequency:
            continue
        selected.append(token)
    return selected

#Applying previously created functions (preprocessForWordcloud and findDifficultWords)
#to the combined text of every loaded document.
preprocessedText = preprocessForWordCloud(wholeDocument)
difficultWords = findDifficultWords(preprocessedText)

#Generating the wordcloud.
#The cloud is configured first, then filled from the space-joined difficult words.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='Greys',
)
wordcloud.generate(" ".join(difficultWords))

#Showing the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.gca().set_axis_off()
plt.gca().set_title('Word Cloud of Difficult Words')
plt.show()