In [None]:
import os
import nltk
import matplotlib.pyplot as plt
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict

# Fetch the NLTK resources requested by name below; this runs every time the
# module/cell is executed.
# 'punkt' is presumably the model data behind sent_tokenize/word_tokenize and
# 'cmudict' the pronouncing dictionary read by syllable_count - confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('cmudict')

# Lazily populated cache of the CMU Pronouncing Dictionary. Building it is
# expensive, so it must happen once per process rather than once per word.
_CMU_PRONUNCIATIONS = None


def _cmu_pronunciations():
    """Return the CMU pronouncing dictionary, loading it on first use only."""
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        _CMU_PRONUNCIATIONS = cmudict.dict()
    return _CMU_PRONUNCIATIONS


def syllable_count(word):
    """Return the number of syllables in ``word``.

    The word is looked up case-insensitively in the CMU Pronouncing
    Dictionary. Each pronunciation is a list of phonemes; the phonemes whose
    last character is a digit are counted, and the largest count across all
    listed pronunciations is returned.

    Args:
        word: The token to measure.

    Returns:
        An ``int`` when the word is in the dictionary. Otherwise a rough
        estimate of one syllable per three characters, never below 1; this
        estimate may be a fractional ``float``.
    """
    pronunciations = _cmu_pronunciations().get(word.lower())
    if pronunciations:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in pronunciations
        )
    # Word is not in the dictionary: estimate syllables from its length.
    return max(1, len(word) / 3)

def calculate_flesch_kincaid_grade_level(text):
    """Compute the Flesch-Kincaid Grade Level of ``text``.

    The score is
    ``0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59``.

    Args:
        text: The text to score.

    Returns:
        The grade level as a ``float``. ``0.0`` is returned when the text
        contains no words or no sentences (for example an empty string),
        instead of failing with ``ZeroDivisionError``.
    """
    # Keep only tokens that contain at least one letter or digit, so that
    # stand-alone punctuation tokens are not counted as words (and are not
    # each given an estimated syllable).
    words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    total_words = len(words)
    total_sentences = len(sent_tokenize(text))

    # Both counts are used as divisors below.
    if total_words == 0 or total_sentences == 0:
        return 0.0

    total_syllables = sum(syllable_count(word) for word in words)

    # Flesch-Kincaid Grade Level formula
    grade_level = (
        0.39 * (total_words / total_sentences)
        + 11.8 * (total_syllables / total_words)
        - 15.59
    )

    return grade_level

def process_pdf_file(file_path):
    """Extract the text of a PDF and return its Flesch-Kincaid Grade Level.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The value returned by ``calculate_flesch_kincaid_grade_level`` for
        the text of all pages.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Extract inside the ``with`` block: the reader pulls page content
        # from the open file handle.
        # ``or ''`` guards against a page for which no text comes back
        # (NOTE(review): confirm whether the installed PyPDF2 can return
        # None here).
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next, which would distort the word and
    # sentence counts.
    return calculate_flesch_kincaid_grade_level('\n'.join(page_texts))

def plot_results(file_names, complexities):
    """Show a bar chart with one green bar per PDF file.

    Args:
        file_names: Labels placed along the x-axis, one per bar.
        complexities: Bar heights, in the same order as ``file_names``.
    """
    axes = plt.gca()
    axes.bar(file_names, complexities, color='green')
    axes.set_title('Sentence Complexity Comparison (Flesch-Kincaid)')
    axes.set_xlabel('PDF Files')
    axes.set_ylabel('Flesch-Kincaid Grade Level')

    # Slant the tick labels and anchor them on the right.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def main(folder_path):
    """Score every PDF directly inside ``folder_path`` and plot the results.

    Args:
        folder_path: Directory whose entries are scanned for PDF files
            (sub-directories are not searched).
    """
    file_names = []
    complexities = []

    # os.listdir returns entries in arbitrary order; sort them so the bars
    # appear in the same order on every run.
    for filename in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so 'REPORT.PDF' is included.
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(folder_path, filename)
        file_names.append(filename)
        complexities.append(process_pdf_file(file_path))

    plot_results(file_names, complexities)

# Script entry point: score the PDFs in the "./PDF" folder, a path relative
# to the current working directory.
if __name__ == "__main__":
    folder_path = "./PDF"
    main(folder_path)


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to C:\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
