In [15]:
!pip install -q gradio==5.29.0 pdfplumber==0.11.6 nltk==3.9.1 matplotlib==3.10.0 wordcloud==1.9.4 textstat==0.7.2 spacy==3.8.5 weasyprint==65.1 markdown2==2.5.3 python-docx==1.1.2

In [28]:
# Download required NLTK data and load the spaCy English pipeline.
# `nltk` and `spacy` are imported here because this cell comes before the main import
# cell in document order; without these imports a fresh "Restart & Run All" fails with
# NameError. Re-importing them later is harmless.
import nltk
import spacy

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords', quiet=False)
nltk.download('averaged_perceptron_tagger_eng')
# Module-level pipeline instance.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [17]:
# File Handling / Formats
import pdfplumber
from docx import Document
from markdown2 import markdown
from weasyprint import HTML
import zipfile

# Natural Language Processing
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from textstat import flesch_reading_ease, flesch_kincaid_grade
import spacy

# Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from textblob import TextBlob

# Text Feature Extraction / Analysis
from sklearn.feature_extraction.text import TfidfVectorizer

# Utilities
import re
import numpy as np
import os
import base64
from io import BytesIO
import gradio as gr
import tempfile
import logging

In [18]:
class DocReader:
    """
    A utility class to extract text content from supported document types.

    Supported formats (matched case-insensitively on the file extension):
    - PDF (.pdf)
    - Word Document (.docx)
    - Plain Text (.txt)
    """

    def __init__(self, file_path):
        """
        Initialize the DocReader with the path to a document file.

        Parameters:
            file_path (str): The full path to the input document file.
                             Supported formats: .pdf, .docx, or .txt
        """
        self.file_path = file_path

    def extract_text(self):
        """
        Extract the textual content from the provided document.

        PDF pages and DOCX paragraphs are joined with newlines so that words at a
        page boundary are not fused together and paragraph boundaries survive for
        newline-based paragraph analysis.

        Returns:
            str: The full extracted text from the document.

        Raises:
            ValueError: If the file extension is not one of the supported types.
        """
        # Compare on a lower-cased copy so that "REPORT.PDF" is accepted as well.
        lowered_path = str(self.file_path).lower()

        if lowered_path.endswith('.pdf'):
            with pdfplumber.open(self.file_path) as pdf:
                # extract_text() may return None for a page; treat that as empty text.
                return "\n".join(page.extract_text() or "" for page in pdf.pages)

        if lowered_path.endswith('.docx'):
            doc = Document(self.file_path)
            return "\n".join(para.text for para in doc.paragraphs)

        if lowered_path.endswith('.txt'):
            with open(self.file_path, 'r', encoding='utf-8') as f:
                return f.read()

        raise ValueError("Unsupported file format. Use .pdf, .docx, or .txt.")

In [19]:
class WordLevelAnalysis:
    """
    A class to perform detailed word-level analysis on a given text.

    This includes:
    - Tokenization and POS tagging
    - TF-IDF keyword extraction
    - N-gram frequency analysis
    - Co-occurrence of keywords
    - Stop word ratio
    - Lexical diversity
    - Average word length

    Stop-word checks are case-insensitive: the stop-word set is lower-case, so
    tokens are lower-cased before the membership test ("The" counts as a stop word).
    """

    def __init__(self, text):
        """
        Initialize the WordLevelAnalysis with input text.

        Args:
            text (str): The raw input text to be analyzed.
        """
        self.text = text
        self.words = word_tokenize(self.text)
        # Not referenced by any method of this class; kept so the attribute stays available.
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(stopwords.words('english'))

    def _is_stop_word(self, word):
        """Return True if `word` is a stop word, ignoring letter case."""
        return word.lower() in self.stop_words

    def _content_words(self):
        """Return alphanumeric tokens that are not stop words, in original order and case."""
        return [word for word in self.words if word.isalnum() and not self._is_stop_word(word)]

    def word_statstics(self):
        """
        Perform a full suite of word-level analyses on the text.

        Returns:
            dict: A dictionary containing results from multiple analyses, including:
                - filtered_words (list): Cleaned tokens without stopwords or punctuation.
                - top_keywords (list): Top TF-IDF keywords.
                - pos_tags (list): Top 5 most common part-of-speech tags.
                - avg_word_length (float): Average word length in characters.
                - stop_word_ratio (float): Ratio of stopwords to total tokens.
                - bigrams (list): Top bigram frequencies.
                - co_occurrences (list): Most common co-occurring keyword pairs.
                - lexical_diversity (float): Type-Token Ratio of the text.
        """
        # Tag the full token stream (context matters to the tagger), then keep
        # only the tags that belong to alphanumeric tokens.
        tagged_words = nltk.pos_tag(self.words)
        pos_tags = [tag for word, tag in tagged_words if word.isalnum()]
        top_pos = FreqDist(pos_tags).most_common(5)

        return {
            "filtered_words": self._content_words(),
            "top_keywords": self.tfidf_keywords(),
            "pos_tags": top_pos,
            "avg_word_length": self.avg_word_length(),
            "stop_word_ratio": self.stop_word_ratio(),
            "bigrams": self.ngram_analysis(n=2),
            "co_occurrences": self.keyword_co_occurrence(),
            "lexical_diversity": self.lexical_diversity()
        }

    def tfidf_keywords(self, top_n=10):
        """
        Extract the top keywords from the text using TF-IDF scoring.

        The vectorizer is fitted on this single text only.

        Args:
            top_n (int, optional): Number of top keywords to return. Default is 10.

        Returns:
            list of tuples: Top (keyword, score) pairs sorted by score. Empty when
            the text yields no vocabulary (e.g. empty or stop-word-only text).
        """
        vectorizer = TfidfVectorizer(stop_words='english')
        try:
            tfidf_matrix = vectorizer.fit_transform([self.text])
        except ValueError:
            # Raised by the vectorizer when no terms remain after filtering.
            return []
        feature_names = vectorizer.get_feature_names_out()
        tfidf_scores = tfidf_matrix.toarray()[0]
        keyword_scores = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)
        return keyword_scores[:top_n]

    def avg_word_length(self):
        """
        Calculate the average length of alphanumeric words in the text.

        Returns:
            float: Average word length in characters. Returns 0 if no valid words exist.
        """
        word_lengths = [len(word) for word in self.words if word.isalnum()]
        return sum(word_lengths) / len(word_lengths) if word_lengths else 0

    def stop_word_ratio(self):
        """
        Compute the proportion of stop words among all tokens (case-insensitive).

        Returns:
            float: Ratio of stop words to total tokens. Returns 0 if no tokens exist.
        """
        total_words = len(self.words)
        if total_words == 0:
            return 0
        stop_word_count = sum(1 for word in self.words if self._is_stop_word(word))
        return stop_word_count / total_words

    def lexical_diversity(self):
        """
        Calculate lexical diversity using the Type-Token Ratio (TTR).

        The ratio is computed over all tokens in `self.words` exactly as tokenized
        (case-sensitive, punctuation tokens included).

        Returns:
            float: Unique tokens / total tokens. Returns 0 if there are no tokens.
        """
        total_words = len(self.words)
        return len(set(self.words)) / total_words if total_words > 0 else 0

    def keyword_co_occurrence(self, top_n=5):
        """
        Identify frequently co-occurring top keywords within the same sentence.

        Keywords are lower-cased before counting so they can match the
        lower-cased sentence tokens they are compared against.

        Args:
            top_n (int, optional): Number of top keywords to consider. Default is 5.

        Returns:
            list of tuples: Up to 5 ((keyword, keyword), count) pairs, most frequent first.
        """
        keyword_freq = FreqDist(word.lower() for word in self._content_words())
        top_keywords = [word for word, _ in keyword_freq.most_common(top_n)]
        co_occurrences = {}

        for sent in sent_tokenize(self.text):
            sent_words = set(word_tokenize(sent.lower()))
            present = [kw for kw in top_keywords if kw in sent_words]
            for i, kw1 in enumerate(present):
                for kw2 in present[i + 1:]:
                    # Sort so (a, b) and (b, a) are counted under one key.
                    pair = tuple(sorted([kw1, kw2]))
                    co_occurrences[pair] = co_occurrences.get(pair, 0) + 1

        return sorted(co_occurrences.items(), key=lambda x: x[1], reverse=True)[:5]

    def ngram_analysis(self, n=2, top_n=5):
        """
        Find the most frequent n-grams in the text (excluding stopwords and punctuation).

        Args:
            n (int, optional): Size of the n-gram (e.g., 2 = bigram). Default is 2.
            top_n (int, optional): Number of top n-grams to return. Default is 5.

        Returns:
            list of tuples: Top (ngram, frequency) pairs sorted by frequency.
        """
        ngrams = list(nltk.ngrams(self._content_words(), n))
        return FreqDist(ngrams).most_common(top_n)


In [20]:
class SentenceLevelAnalysis:
    """
    A class to perform sentence-level analysis on textual data.

    It provides:
    - Sentence count and length statistics
    - Average sentence length
    - Sentence-level sentiment analysis using TextBlob
    """

    def __init__(self, text):
        """
        Initialize the SentenceLevelAnalysis with input text.

        Args:
            text (str): The raw input text to analyze at the sentence level.
        """
        self.text = text
        self.nlp = spacy.load("en_core_web_sm")
        # Filled on first use; None means the text has not been segmented yet.
        self.sentences = None

    def _segment_sentences(self):
        """Split `self.text` into sentence strings using the spaCy pipeline."""
        doc = self.nlp(self.text)
        return [sent.text for sent in doc.sents]

    def sentence_statstics(self):
        """
        Perform core sentence-level statistics on the text.

        This includes:
        - Token-based sentence length distribution
        - Total sentence count
        - Average sentence length
        - Per-sentence sentiment polarity

        Returns:
            dict: A dictionary with the following keys:
                - sentence_lengths (list of int): Number of tokens per sentence.
                - sentence_count (int): Total number of sentences.
                - avg_sentence_length (float): Tokens in the whole text divided by
                  the sentence count (0 when there are no sentences).
                - sentence_sentiments (list of float): Sentiment polarity scores per sentence.
        """
        # Always re-segment so the statistics reflect the current value of self.text.
        self.sentences = self._segment_sentences()

        sentence_lengths = [len(word_tokenize(sent)) for sent in self.sentences]
        sentence_count = len(self.sentences)
        word_count = len(word_tokenize(self.text))
        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
        sentence_sentiments = self.sentence_sentiment_analysis()

        return {
            "sentence_lengths": sentence_lengths,
            "sentence_count": sentence_count,
            "avg_sentence_length": avg_sentence_length,
            "sentence_sentiments": sentence_sentiments
        }

    def sentence_sentiment_analysis(self, max_sentences=50):
        """
        Analyze sentiment polarity for each sentence using TextBlob.

        Only the first `max_sentences` sentences are scored. The method can be
        called on its own: if the text has not been segmented yet, it is
        segmented here first.

        Args:
            max_sentences (int, optional): Maximum number of leading sentences to
                score. Default is 50.

        Returns:
            list of float: Sentiment polarity scores for up to `max_sentences` sentences.
        """
        if self.sentences is None:
            self.sentences = self._segment_sentences()

        return [TextBlob(sent).sentiment.polarity for sent in self.sentences[:max_sentences]]


In [21]:
class ParagraphLevelAnalysis:
    """
    Paragraph-level statistics for a block of text.

    A paragraph is any non-blank line of the text (the text is split on newline
    characters and blank pieces are dropped). For each paragraph the number of
    sentences is counted, which yields:
    - the paragraph count
    - the per-paragraph sentence counts
    - the mean number of sentences per paragraph
    """

    def __init__(self, text):
        """
        Store the text and pre-tokenize it into sentences.

        Args:
            text (str): The input document or block of text to analyze.
        """
        self.text = text
        self.sentences = sent_tokenize(self.text)

    def _sentence_counts(self):
        """Return the number of sentences in each non-blank line of the text."""
        counts = []
        for raw_line in self.text.split('\n'):
            paragraph = raw_line.strip()
            if not paragraph:
                continue
            counts.append(len(sent_tokenize(paragraph)))
        return counts

    def paragraph_statstics(self):
        """
        Collect the paragraph statistics into a single dictionary.

        Returns:
            dict: A dictionary containing:
                - paragraph_lengths (list of int): Number of sentences in each paragraph.
                - paragraph_count (int): Total number of paragraphs.
                - avg_sentences_per_paragraph (float): Average sentences per paragraph.
        """
        lengths = self._sentence_counts()
        total, mean_sentences = self.paragraph_stats()

        summary = {}
        summary["paragraph_lengths"] = lengths
        summary["paragraph_count"] = total
        summary["avg_sentences_per_paragraph"] = mean_sentences
        return summary

    def paragraph_stats(self):
        """
        Count paragraphs and average their sentence counts.

        Returns:
            tuple:
                - paragraph_count (int): Number of paragraphs.
                - avg_sentences (float): Average number of sentences per paragraph,
                  or 0 when there are no paragraphs.
        """
        counts = self._sentence_counts()
        total = len(counts)
        if total > 0:
            mean_sentences = sum(counts) / total
        else:
            mean_sentences = 0
        return total, mean_sentences


In [22]:
class DocLevelAnalysis:
    """
    A class to perform high-level document analysis.

    This includes:
    - Word count
    - Readability scores (Flesch Reading Ease & Flesch-Kincaid Grade)
    - Named entity recognition (NER)
    - Overall document sentiment
    - Punctuation frequency
    """

    def __init__(self, text):
        """
        Initialize the DocLevelAnalysis with the given document text.

        Args:
            text (str): The full document content as a string.
        """
        self.text = text
        self.words = word_tokenize(self.text)
        self.sentences = sent_tokenize(self.text)
        self.nlp = spacy.load("en_core_web_sm")

    def _count_words(self):
        """
        Count tokens that contain at least one letter or digit.

        Pure punctuation tokens such as "," or "..." are not counted as words,
        while hyphenated tokens like "well-known" are. Clitic tokens the
        tokenizer splits off (e.g. "n't") still count as separate tokens.
        """
        return sum(1 for token in self.words if any(ch.isalnum() for ch in token))

    def doc_statstics(self):
        """
        Compute key document-level statistics.

        Returns:
            dict: A dictionary containing:
                - word_count (int): Number of tokens containing a letter or digit.
                - readability_ease (float): Flesch Reading Ease score.
                - readability_grade (float): Flesch-Kincaid Grade Level.
                - named_entities (dict): Named entity type counts.
                - sentiment (dict): Overall sentiment score and label.
                - punctuation_freq (list of tuples): Punctuation and their frequencies.
        """
        return {
            "word_count": self._count_words(),
            "readability_ease": flesch_reading_ease(self.text),
            "readability_grade": flesch_kincaid_grade(self.text),
            "named_entities": self.named_entity_distribution(),
            "sentiment": self.sentiment_analysis(),
            "punctuation_freq": self.punctuation_frequency()
        }

    def sentiment_analysis(self):
        """
        Compute overall sentiment polarity of the entire document using TextBlob.

        Returns:
            dict: A dictionary with:
                - 'score' (float): Sentiment polarity score.
                - 'label' (str): "positive" if score > 0.1, "negative" if
                  score < -0.1, otherwise "neutral".
        """
        sentiment = TextBlob(self.text).sentiment.polarity
        if sentiment > 0.1:
            label = "positive"
        elif sentiment < -0.1:
            label = "negative"
        else:
            label = "neutral"
        return {"score": sentiment, "label": label}

    def named_entity_distribution(self):
        """
        Identify and count named entities using spaCy's NER pipeline.

        Returns:
            dict: Mapping of entity types (e.g., 'PERSON', 'ORG') to the number of
                  distinct entity strings (after stripping whitespace) of that type.
                  Example: {'PERSON': 3, 'ORG': 2, 'DATE': 5}
        """
        doc = self.nlp(self.text)
        entity_info = {}

        for ent in doc.ents:
            # A set per label so repeated mentions of the same entity count once.
            entity_info.setdefault(ent.label_, set()).add(ent.text.strip())

        return {label: len(entities) for label, entities in entity_info.items()}

    def punctuation_frequency(self):
        """
        Count frequency of common punctuation marks in the document.

        Target punctuation marks: . , ! ? ; :

        Returns:
            list of tuples: Punctuation and their frequencies, sorted in descending order.
                            Example: [('.', 14), (',', 10), ('!', 3)]
        """
        punctuation = re.findall(r'[.,!?;:]', self.text)
        return FreqDist(punctuation).most_common()


In [23]:
def generate_key_insights(word_result, sent_result, para_result, doc_result):
    """
    Build the one-sentence executive summary shown at the top of the report.

    The sentence has four comma-separated clauses, in this order:
        1. Sentiment label and score.
        2. Readability band from the Flesch Reading Ease score
           (> 60 easy, > 30 moderate, otherwise difficult).
        3. Named entity diversity from the number of entity types
           (< 5 low, > 10 high, otherwise moderate).
        4. Lexical diversity from the Type-Token Ratio
           (> 0.5 high, < 0.3 low, otherwise moderate).

    Args:
        word_result (dict): Word-level results; reads "lexical_diversity".
        sent_result (dict): Sentence-level results (accepted but not read here).
        para_result (dict): Paragraph-level results (accepted but not read here).
        doc_result (dict): Document-level results; reads "sentiment",
            "readability_ease" and "named_entities".

    Returns:
        str: The summary sentence, terminated by a period.
    """
    tone = doc_result["sentiment"]
    tone_label = tone["label"]
    tone_score = tone["score"]

    ease = doc_result["readability_ease"]
    if ease > 60:
        readability_band = "easy"
    elif ease > 30:
        readability_band = "moderate"
    else:
        readability_band = "difficult"

    entity_type_count = len(doc_result["named_entities"])
    if entity_type_count < 5:
        entity_band = "low"
    elif entity_type_count > 10:
        entity_band = "high"
    else:
        entity_band = "moderate"

    ttr = word_result["lexical_diversity"]
    if ttr > 0.5:
        lexical_band = "high"
    elif ttr < 0.3:
        lexical_band = "low"
    else:
        lexical_band = "moderate"

    clauses = (
        f"This document has a {tone_label} tone (score: {tone_score:.2f})",
        f"readability is {readability_band} (Flesch Reading Ease: {ease:.2f})",
        f"named entity diversity is {entity_band} ({entity_type_count} entity types)",
        f"lexical diversity is {lexical_band} (TTR: {ttr:.2f})",
    )
    return ", ".join(clauses) + "."


In [24]:
class DocVisulaization:
    """
    A utility class for generating visualizations (word cloud, histograms, bar charts, heatmaps)
    from document analysis results and saving them as image files in a specified directory.

    Every generator returns the path of the image it wrote. When there is nothing
    to draw (no words for the cloud, no scores for the heatmap) a placeholder image
    carrying a short message is written instead of raising.
    """

    def __init__(self, report_dir):
        """
        Initialize the visualization class with a target directory for saving plots.

        Args:
            report_dir (str): Path to the directory where visualization images will be saved.
        """
        self.report_dir = report_dir

    def _save_current_figure(self, filename):
        """Save the current pyplot figure into the report directory, close it, return its path."""
        image_path = os.path.join(self.report_dir, filename)
        plt.savefig(image_path, bbox_inches='tight')
        plt.close()
        return image_path

    def _save_placeholder(self, message, filename, figsize):
        """Write an axis-free image that only shows `message`; return its path."""
        plt.figure(figsize=figsize)
        plt.text(0.5, 0.5, message, ha='center', va='center')
        plt.axis('off')
        return self._save_current_figure(filename)

    @staticmethod
    def _as_score_matrix(data):
        """
        Coerce sentiment scores to a 2D float array.

        A flat sequence of n scores becomes a single row of shape (1, n); input
        that is already 2D keeps its shape; an empty sequence becomes shape (1, 0).
        """
        return np.atleast_2d(np.asarray(data, dtype=float))

    def generate_word_cloud(self, words):
        """
        Generate and save a word cloud image from a list of words.

        Args:
            words (list of str): Words to include in the word cloud.

        Returns:
            str: Path to the saved word cloud image file ("wordcloud.png").
        """
        stop_words = set(stopwords.words('english'))
        wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words)
        try:
            wordcloud.generate(' '.join(words))
        except ValueError:
            # Raised when no drawable words remain (empty input or stop words only).
            return self._save_placeholder("No words available for a word cloud", "wordcloud.png", (10, 5))

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        return self._save_current_figure("wordcloud.png")

    def generate_histogram(self, data, title, xlabel, ylabel, filename):
        """
        Generate and save a histogram from numerical data.

        Args:
            data (list or array-like): Numerical values to plot.
            title (str): Plot title.
            xlabel (str): Label for the X-axis.
            ylabel (str): Label for the Y-axis.
            filename (str): Output file name (e.g., 'histogram.png').

        Returns:
            str: Path to the saved histogram image file.
        """
        plt.figure(figsize=(8, 6))
        plt.hist(data, bins=20, color='skyblue', edgecolor='black')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        return self._save_current_figure(filename)

    def generate_bar_chart(self, labels, values, title, xlabel, ylabel, filename):
        """
        Generate and save a bar chart with the provided labels and values.

        Args:
            labels (list of str): X-axis categories.
            values (list of float): Heights of the bars corresponding to each label.
            title (str): Plot title.
            xlabel (str): Label for the X-axis.
            ylabel (str): Label for the Y-axis.
            filename (str): Output file name (e.g., 'barchart.png').

        Returns:
            str: Path to the saved bar chart image file.
        """
        plt.figure(figsize=(8, 6))
        plt.bar(labels, values, color='lightgreen')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        return self._save_current_figure(filename)

    def generate_heatmap(self, data, title, filename):
        """
        Generate and save a heatmap of sentiment scores.

        Args:
            data (list or 2D list/array): Sentiment scores. A flat list is drawn as
                a single row; for 2D input each row is drawn as its own heatmap row.
            title (str): Plot title.
            filename (str): Output file name (e.g., 'heatmap.png').

        Returns:
            str: Path to the saved heatmap image file.
        """
        matrix = self._as_score_matrix(data)
        if matrix.size == 0:
            # Nothing to plot (e.g. a document without sentences).
            return self._save_placeholder("No sentence sentiment data available", filename, (10, 2))

        n_rows, n_cols = matrix.shape
        # Height scales with the row count; a single row keeps the original 10x2 size.
        plt.figure(figsize=(10, 2 * n_rows))
        sns.heatmap(matrix, cmap='RdYlGn', vmin=-1, vmax=1, annot=True, fmt=".2f",
                    cbar=False, annot_kws={"rotation": 90})
        plt.title(title)
        plt.xlabel("Sentence Index")
        plt.ylabel("Sentiment")
        # Heatmap cells are centred at half-integers; label them with the sentence index.
        plt.xticks(np.arange(n_cols) + 0.5, np.arange(n_cols))
        return self._save_current_figure(filename)


In [25]:
class DocWriter:
    """
    A class responsible for generating document analysis reports with optional visualizations
    and packaging the output into a downloadable ZIP archive.

    Features:
    - Generates detailed reports in Markdown, HTML, or PDF format
    - Includes insights from multiple levels of analysis (word, sentence, paragraph, document)
    - Embeds visualizations such as word clouds, histograms, bar charts, and heatmaps
    - Packages report and visualizations into a ZIP archive for download
    """

    # (section key, heading text, anchor slug) in the order the table of contents lists them.
    TOC_ENTRIES = (
        ("basic", "Basic Statistics", "basic-statistics"),
        ("readability", "Readability Scores", "readability-scores"),
        ("linguistic", "Linguistic Analysis", "linguistic-analysis"),
        ("pos", "Part-of-Speech Distribution", "part-of-speech-distribution"),
        ("named_entities", "Named Entity Distribution", "named-entity-distribution"),
        ("keyword_analysis", "Keyword Analysis", "keyword-analysis"),
        ("punctuation", "Punctuation Frequency", "punctuation-frequency"),
        ("visualizations", "Visualizations", "visualizations"),
    )
    ALL_SECTIONS = (
        "basic", "readability", "linguistic", "pos", "named_entities",
        "keyword_analysis", "punctuation", "visualizations",
    )
    ALL_VISUALIZATIONS = (
        "wordcloud", "sentence_histogram", "pos_bar",
        "sentiment_heatmap", "paragraph_histogram",
    )
    SUPPORTED_FORMATS = ("markdown", "html", "pdf")

    def __init__(self, report_dir, word_result, sent_result, para_result, doc_result):
        """
        Initialize the report writer with analysis results and target output directory.

        Args:
            report_dir (str): Directory to save the report and visualizations.
            word_result (dict): Output from WordLevelAnalysis.
            sent_result (dict): Output from SentenceLevelAnalysis.
            para_result (dict): Output from ParagraphLevelAnalysis.
            doc_result (dict): Output from DocLevelAnalysis.
        """
        self.report_dir = report_dir
        self.word_result = word_result
        self.sent_result = sent_result
        self.para_result = para_result
        self.doc_result = doc_result

    def _create_zip(self, report_path, image_paths):
        """
        Create a ZIP archive containing the report and all generated image files.

        The report is stored at the archive root; images go under "images/".

        Args:
            report_path (str): File path to the main report (e.g., .md, .pdf, or .html).
            image_paths (list): List of image file paths to include in the archive.

        Returns:
            str: Path to the generated ZIP archive.
        """
        zip_path = os.path.join(self.report_dir, "docstats_report.zip")
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(report_path, os.path.basename(report_path))
            for image_path in image_paths:
                zipf.write(image_path, os.path.join("images", os.path.basename(image_path)))
        return zip_path

    def _image_to_base64(self, image_path):
        """
        Convert an image to a base64-encoded string for inline embedding in Markdown/HTML.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Base64-encoded data URI string (always labelled image/png).
        """
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode('utf-8')
        return f"data:image/png;base64,{encoded}"

    def _render_visualizations(self, include_visualizations):
        """Render every requested visualization; return {key: image path} in generation order."""
        visualize = DocVisulaization(self.report_dir)
        visualizations = {}

        if "wordcloud" in include_visualizations:
            visualizations["wordcloud"] = visualize.generate_word_cloud(self.word_result["filtered_words"])

        if "sentence_histogram" in include_visualizations:
            visualizations["sentence_histogram"] = visualize.generate_histogram(
                self.sent_result["sentence_lengths"], "Sentence Length Distribution",
                "Sentence Length (words)", "Frequency", "sentence_length_histogram.png"
            )

        if "pos_bar" in include_visualizations:
            visualizations["pos_bar"] = visualize.generate_bar_chart(
                [tag for tag, _ in self.word_result["pos_tags"]],
                [freq for _, freq in self.word_result["pos_tags"]],
                "Top 5 Part-of-Speech Tags", "POS Tag", "Frequency", "pos_distribution.png"
            )

        if "sentiment_heatmap" in include_visualizations:
            visualizations["sentiment_heatmap"] = visualize.generate_heatmap(
                self.sent_result["sentence_sentiments"],
                "Sentence-Level Sentiment Heatmap",
                "sentiment_heatmap.png"
            )

        if "paragraph_histogram" in include_visualizations:
            visualizations["paragraph_histogram"] = visualize.generate_histogram(
                self.para_result["paragraph_lengths"], "Paragraph Length Distribution",
                "Paragraph Length (sentences)", "Frequency", "paragraph_length_histogram.png"
            )

        return visualizations

    def _build_markdown(self, stats_to_include, include_visualizations, images_b64):
        """Assemble the Markdown source of the report from the selected sections."""
        parts = ["# Document Statistics Report\n\n## Table of Contents\n"]
        for key, title, anchor in self.TOC_ENTRIES:
            if key in stats_to_include:
                parts.append(f"- [{title}](#{anchor})\n")

        # Executive Summary
        parts.append("\n## Executive Summary\n")
        parts.append(
            generate_key_insights(self.word_result, self.sent_result, self.para_result, self.doc_result)
            + "\n\n"
        )
        # NOTE(review): this prints the base name of the report directory, not the name of
        # the analysed file (it is empty for "./") - the source file name is not available here.
        parts.append(f"**File:** {os.path.basename(self.report_dir)}\n\n")

        # Basic Stats
        if "basic" in stats_to_include:
            parts.append("## Basic Statistics\n")
            parts.append(f"- Word Count: {self.doc_result['word_count']}\n")
            parts.append(f"- Sentence Count: {self.sent_result['sentence_count']}\n")
            parts.append(f"- Avg Sentence Length: {self.sent_result['avg_sentence_length']:.2f} words\n")
            parts.append(f"- Avg Word Length: {self.word_result['avg_word_length']:.2f} characters\n")
            parts.append(f"- Paragraph Count: {self.para_result['paragraph_count']}\n")
            parts.append(f"- Avg Sentences per Paragraph: {self.para_result['avg_sentences_per_paragraph']:.2f}\n\n")

        # Readability
        if "readability" in stats_to_include:
            parts.append("## Readability Scores\n")
            parts.append(f"- Flesch Reading Ease: {self.doc_result['readability_ease']:.2f}\n")
            parts.append(f"- Flesch-Kincaid Grade Level: {self.doc_result['readability_grade']:.2f}\n\n")

        # Linguistic
        if "linguistic" in stats_to_include:
            parts.append("## Linguistic Analysis\n")
            parts.append(f"- Lexical Diversity (TTR): {self.word_result['lexical_diversity']:.2f}\n")
            parts.append(f"- Stop Word Ratio: {self.word_result['stop_word_ratio']:.2f}\n")
            parts.append(
                f"- Sentiment Score: {self.doc_result['sentiment']['score']:.2f} "
                f"({self.doc_result['sentiment']['label']})\n\n"
            )

        # POS
        if "pos" in stats_to_include:
            parts.append("## Part-of-Speech Distribution\n")
            for tag, freq in self.word_result["pos_tags"]:
                parts.append(f"- {tag}: {freq}\n")
            if "pos_bar" in images_b64:
                parts.append(f"\n![POS Distribution]({images_b64['pos_bar']})\n\n")
            else:
                # Keep a blank line between the list and the next heading.
                parts.append("\n")

        # Named Entities
        if "named_entities" in stats_to_include:
            parts.append("## Named Entity Distribution\n")
            for label, count in self.doc_result["named_entities"].items():
                parts.append(f"- {label}: {count}\n")
            parts.append("\n")

        # Keyword Analysis (blank line before each sub-heading so it never merges with a list)
        if "keyword_analysis" in stats_to_include:
            parts.append("## Keyword Analysis\n")
            parts.append("### Top Keywords (TF-IDF)\n")
            for word, score in self.word_result["top_keywords"]:
                parts.append(f"- {word}: {score:.2f}\n")
            parts.append("\n### Top Bigrams\n")
            for bigram, freq in self.word_result["bigrams"]:
                parts.append(f"- {' '.join(bigram)}: {freq}\n")
            parts.append("\n### Keyword Co-Occurrence\n")
            for (w1, w2), freq in self.word_result["co_occurrences"]:
                parts.append(f"- {w1} & {w2}: {freq} co-occurrences\n")
            parts.append("\n")

        # Punctuation
        if "punctuation" in stats_to_include:
            parts.append("## Punctuation Frequency\n")
            for punc, freq in self.doc_result["punctuation_freq"]:
                parts.append(f"- {punc}: {freq}\n")
            parts.append("\n")

        # Visualizations
        if "visualizations" in stats_to_include:
            parts.append("## Visualizations\n")
            for key in include_visualizations:
                if key in images_b64:
                    parts.append(f"### {key.replace('_', ' ').title()}\n")
                    parts.append(f"![{key}]({images_b64[key]})\n\n")

        return "".join(parts)

    def _save_report(self, report, format):
        """Write the Markdown `report` to disk in the requested format; return the file path."""
        if format == "markdown":
            report_path = os.path.join(self.report_dir, "report.md")
            with open(report_path, "w", encoding="utf-8") as f:
                f.write(report)
        elif format == "html":
            report_path = os.path.join(self.report_dir, "report.html")
            with open(report_path, "w", encoding="utf-8") as f:
                # header-ids gives each heading an id so the table-of-contents links resolve.
                f.write(markdown(report, extras=["header-ids"]))
        elif format == "pdf":
            report_path = os.path.join(self.report_dir, "report.pdf")
            HTML(string=markdown(report, extras=["header-ids"])).write_pdf(report_path)
        else:
            raise ValueError("Unsupported format. Use 'markdown', 'html', or 'pdf'.")
        return report_path

    def generate_report(self, stats_to_include="all", include_visualizations=None, format="markdown"):
        """
        Generate a comprehensive report of document statistics and visualizations.

        The report can be exported in Markdown, HTML, or PDF format and includes:
        - Executive summary of sentiment, readability, and lexical features
        - Analysis sections based on selected stats
        - Embedded images for visualization
        - Packaged ZIP archive for download

        The format is validated before any image is rendered, and the report
        directory is created if it does not exist yet.

        Args:
            stats_to_include (str or list, optional): "all" or a list of sections to include. Options:
                ["basic", "readability", "linguistic", "pos", "named_entities",
                 "keyword_analysis", "punctuation", "visualizations"]
            include_visualizations (list, optional): Visualizations to generate. Options:
                ["wordcloud", "sentence_histogram", "pos_bar", "sentiment_heatmap", "paragraph_histogram"]
                Defaults to all of them.
            format (str, optional): Output format: "markdown", "html", or "pdf". Default is "markdown".

        Returns:
            tuple:
                - report_path (str): Path to the saved report file.
                - zip_path (str): Path to the ZIP archive containing the report and images.

        Raises:
            ValueError: If an unsupported report format is provided.
        """
        # Fail fast: do not render images for a report that cannot be written.
        if format not in self.SUPPORTED_FORMATS:
            raise ValueError("Unsupported format. Use 'markdown', 'html', or 'pdf'.")

        # An empty report_dir means the current directory, which always exists.
        if self.report_dir:
            os.makedirs(self.report_dir, exist_ok=True)

        if stats_to_include == "all":
            stats_to_include = list(self.ALL_SECTIONS)

        if include_visualizations is None:
            include_visualizations = list(self.ALL_VISUALIZATIONS)

        # Generate visualizations
        visualizations = self._render_visualizations(include_visualizations)
        image_paths = list(visualizations.values())

        # Convert images to base64 for embedding
        visualizations_base64 = {
            k: self._image_to_base64(v) for k, v in visualizations.items()
        }

        report = self._build_markdown(stats_to_include, include_visualizations, visualizations_base64)
        report_path = self._save_report(report, format)

        # Bundle into ZIP
        zip_path = self._create_zip(report_path, image_paths)

        return report_path, zip_path


In [30]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def process_document(file, file_type, output_format):
    """
    Run the analysis pipeline on an uploaded document and build a report.

    The upload is copied into a temporary file whose suffix is ``file_type``.
    That copy is read with ``DocReader``, analysed at word, sentence,
    paragraph and document level, and the results are handed to ``DocWriter``,
    which is created with ``"./"`` as its report directory. The temporary copy
    is always removed before returning.

    Args:
        file: The uploaded document. Accepted forms are raw ``bytes`` /
            ``bytearray``, a filesystem path (``str`` or ``os.PathLike``), or
            any object exposing the path of the upload as ``.name``.
        file_type (str): Suffix given to the temporary copy (the UI offers
            ".pdf", ".docx" and ".txt").
        output_format (str): Forwarded to ``DocWriter.generate_report`` as
            ``format`` (the UI offers "markdown", "html" and "pdf").

    Returns:
        tuple: ``(report_content, report_path, zip_path)``.
            * On success ``report_content`` is the text of the generated
              report for "markdown" / "html"; for any other format it is a
              short notice telling the user to download the ZIP.
            * When nothing was uploaded, or when any step raises, the first
              element is a human-readable message and both paths are ``None``.
    """
    logger.info("Starting document processing")
    if not file:
        logger.error("No file uploaded")
        return "Please upload a file.", None, None

    temp_path = None
    try:
        # Handle file input: obtain the raw bytes of the upload.
        if isinstance(file, (bytes, bytearray)):
            content = file
        else:
            # A path is used as-is; anything else is expected to carry the
            # path of the uploaded file in ``.name``.
            if isinstance(file, (str, os.PathLike)):
                source_path = os.fspath(file)
            else:
                source_path = file.name
            with open(source_path, "rb") as f:
                content = f.read()

        # Write through the handle returned by NamedTemporaryFile and close it
        # on leaving the ``with`` block. delete=False keeps the file on disk
        # so the reader can reopen it by name; leaving the handle open would
        # leak a descriptor and, on Windows, block reopening and deletion.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_type, prefix="uploaded_"
        ) as temp_file:
            temp_path = temp_file.name
            temp_file.write(content)
        logger.info("Saved uploaded file to: %s", temp_path)

        # === Modular pipeline ===
        read_obj = DocReader(temp_path)
        text = read_obj.extract_text()

        word_obj = WordLevelAnalysis(text)
        word_result = word_obj.word_statstics()

        sent_obj = SentenceLevelAnalysis(text)
        sent_result = sent_obj.sentence_statstics()

        para_obj = ParagraphLevelAnalysis(text)
        para_result = para_obj.paragraph_statstics()

        doc_obj = DocLevelAnalysis(text)
        doc_result = doc_obj.doc_statstics()

        write_obj = DocWriter("./", word_result, sent_result, para_result, doc_result)
        report_path, zip_path = write_obj.generate_report(format=output_format)

        # Read report content: only the text formats can be previewed inline.
        if output_format in ["markdown", "html"]:
            with open(report_path, "r", encoding="utf-8") as f:
                report_content = f.read()
        else:
            report_content = "PDF generated. Download the ZIP file to view the report."

        return report_content, report_path, zip_path

    except Exception as e:
        # Deliberate catch-all: the UI shows the message instead of a
        # traceback, while the full stack trace goes to the log.
        logger.exception("Error during processing")
        return f"Error: {e}", None, None

    finally:
        # temp_path stays None if the failure happened before the temporary
        # copy was created.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
            logger.info("Deleted temp file: %s", temp_path)

def launch_gradio_interface():
    """
    Assemble the Gradio front end for ``process_document`` and launch it.

    Inputs, in order: the uploaded file, its file-type suffix, and the desired
    report format. Outputs, in order: a Markdown preview, the report file, and
    the ZIP archive.
    """
    # Input widgets (created in the same order they are passed to the handler).
    upload_box = gr.File(label="Upload Document", file_types=[".pdf", ".docx", ".txt"])
    type_picker = gr.Dropdown(choices=[".pdf", ".docx", ".txt"], label="File Type", value=".pdf")
    format_picker = gr.Dropdown(choices=["markdown", "html", "pdf"], label="Output Format", value="markdown")

    # Output widgets, matching the three values returned by process_document.
    preview_pane = gr.Markdown(label="Report Preview")
    report_download = gr.File(label="Download Report")
    archive_download = gr.File(label="Download ZIP Archive")

    app = gr.Interface(
        fn=process_document,
        inputs=[upload_box, type_picker, format_picker],
        outputs=[preview_pane, report_download, archive_download],
        title="Document Statistics Analyzer",
        description="Upload a document and generate word, sentence, paragraph, and document-level insights.",
    )
    app.launch()

# Launch the UI only when this code runs as the main module (e.g. executing
# the notebook cell), not when it is imported from elsewhere.
if __name__ == "__main__":
    launch_gradio_interface()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://064215d29687a99b62.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
