# Document Text Extraction and Analysis

This notebook processes text-based documents to:

1. Extract text content from supported document formats (PDF and TXT; DOCX extraction is currently disabled)

2. Clean and preprocess extracted text

3. Perform comprehensive text analysis and visualization

4. Generate structured output for further processing

### 1. Setup and Configuration

In [None]:
import warnings
import justsdk
import pymupdf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import textwrap

# import docx
import _root as _  # noqa: F401

from pathlib import Path
from typing import Dict, Optional
from collections import Counter

# from wordcloud import WordCloud
from configs._constants import SAMPLE_DATA_DIR

# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Global plot appearance: seaborn-flavoured Matplotlib style sheet plus the
# viridis colour palette for seaborn-driven colour cycles.
plt.style.use("seaborn-v0_8")
sns.set_palette("viridis")

In [None]:
# Notebook-wide settings shared by discovery, extraction, analysis and plotting.
CONFIG = {
    # File name looked up among the discovered documents by the pipeline cell.
    "target_file": "agile-method.pdf",
    # Sub-directory of SAMPLE_DATA_DIR that is scanned for documents.
    "target_directory": "text",
    # Suffixes used as glob patterns during discovery.
    # NOTE(review): ".docx" is discoverable, but the DOCX extraction branch is
    # commented out, so such files are reported as unsupported.
    "supported_formats": [".pdf", ".docx", ".txt"],
    # Options read by DocumentExtractor._process_text and TextAnalyzer._analyze_words.
    "text_processing": {
        # Collapse runs of whitespace into a single space.
        "remove_extra_whitespace": True,
        # Intended to keep paragraph breaks rather than flattening newlines.
        "preserve_paragraphs": True,
        # Words shorter than this are dropped from the filtered word frequencies.
        "min_word_length": 2,
        # NOTE(review): not read by any code visible in this notebook.
        "remove_numbers": False,
    },
    # Options read by TextAnalyzer.
    "analysis": {
        # Number of entries returned in "most_common_words".
        "top_words_count": 20,
        # Only referenced by the commented-out word-cloud code.
        "wordcloud_max_words": 100,
        # Number of leading sentences kept as "sample_sentences".
        "sentence_sample_size": 10,
    },
}

### 2. Document Discovery and Management

In [None]:
def discover_documents() -> Dict[str, Dict]:
    """Scan the configured sample directory for documents with a supported suffix.

    The directory is ``SAMPLE_DATA_DIR / CONFIG["target_directory"]`` and the
    suffixes come from ``CONFIG["supported_formats"]``.

    Returns:
        Mapping of file name to a record with ``path``, ``size_mb`` (rounded
        to three decimals), ``extension`` and ``size_bytes``. An empty dict is
        returned, after reporting an error, when the directory is missing.
    """
    search_root = SAMPLE_DATA_DIR / CONFIG["target_directory"]
    if not search_root.exists():
        justsdk.print_error(f"Target directory not found: {search_root}")
        return {}

    found: Dict[str, Dict] = {}
    for suffix in CONFIG["supported_formats"]:
        for candidate in search_root.glob(f"*{suffix}"):
            # Read the size once and derive both representations from it.
            byte_size = candidate.stat().st_size
            found[candidate.name] = {
                "path": candidate,
                "size_mb": round(byte_size / (1024 * 1024), 3),
                "extension": suffix,
                "size_bytes": byte_size,
            }

    return found


# Index the sample directory once; later cells look documents up by name.
available_docs = discover_documents()
justsdk.print_info("Available documents:")
for doc_name, doc_info in available_docs.items():
    size_label = doc_info["size_mb"]
    ext_label = doc_info["extension"].upper()
    print(f"  {doc_name} ({size_label} MB) - {ext_label}")

### 3. Text Extraction Engine

In [None]:
class DocumentExtractor:
    """Extract raw and cleaned text from PDF and plain-text files.

    Every successful extraction returns a dict with ``file_name``,
    ``file_type``, ``metadata``, ``raw_text`` and ``processed_text`` plus a
    per-unit breakdown (``pages`` for PDF, ``lines`` for TXT).
    """

    def __init__(self, config: Dict):
        """Keep the configuration; only ``config["text_processing"]`` is read."""
        self.config = config

    def extract(self, file_path: Path) -> Optional[Dict]:
        """Extract text from ``file_path`` based on its (lower-cased) suffix.

        Args:
            file_path: Document to read.

        Returns:
            The extraction dict, or ``None`` when the file is missing, the
            suffix is unsupported, or the extractor raised. Each failure is
            reported through ``justsdk.print_error`` rather than raised.
        """
        if not file_path.exists():
            justsdk.print_error(f"File not found: {file_path}")
            return None

        extension = file_path.suffix.lower()
        handlers = {
            ".pdf": self._extract_pdf,
            # ".docx": self._extract_docx,  # disabled together with the docx import
            ".txt": self._extract_txt,
        }
        handler = handlers.get(extension)
        if handler is None:
            justsdk.print_error(f"Unsupported file format: {extension}")
            return None

        try:
            return handler(file_path)
        except Exception as e:
            # Notebook boundary: report the failure and let the caller decide.
            justsdk.print_error(f"Extraction failed for {file_path.name}: {e}")
            return None

    def _extract_pdf(self, file_path: Path) -> Dict:
        """Read every page of a PDF and collect per-page text statistics.

        Pages are joined with a newline to form ``raw_text``. The document is
        always closed, even when reading a page fails.
        """
        doc = pymupdf.open(file_path)
        try:
            # PyMuPDF reports missing fields as None or "" (and the whole
            # mapping as None for locked files), so fall back with `or`
            # instead of relying on a dict.get default.
            raw_meta = doc.metadata if isinstance(doc.metadata, dict) else {}
            metadata = {
                field: raw_meta.get(field) or "Unknown"
                for field in ("title", "author", "subject", "creator")
            }
            metadata["page_count"] = len(doc)

            pages_text = []
            for page_number, page in enumerate(doc, start=1):
                page_text = page.get_text()
                pages_text.append(
                    {
                        "page_number": page_number,
                        "text": page_text,
                        "char_count": len(page_text),
                        "word_count": len(page_text.split()),
                    }
                )
        finally:
            doc.close()

        raw_text = "\n".join(page["text"] for page in pages_text)

        return {
            "file_name": file_path.name,
            "file_type": "PDF",
            "metadata": metadata,
            "pages": pages_text,
            "raw_text": raw_text,
            "processed_text": self._process_text(raw_text),
        }

    # NOTE: DOCX extraction is disabled while the docx dependency is not imported.
    # def _extract_docx(self, file_path: Path) -> Dict:
    #     doc = docx.Document(file_path)
    #     paragraphs = []

    #     for i, para in enumerate(doc.paragraphs):
    #         paragraphs.append(
    #             {
    #                 "paragraph_number": i + 1,
    #                 "text": para.text,
    #                 "char_count": len(para.text),
    #                 "word_count": len(para.text.split()),
    #             }
    #         )

    #     raw_text = "\n".join([para["text"] for para in paragraphs])

    #     return {
    #         "file_name": file_path.name,
    #         "file_type": "DOCX",
    #         "metadata": {"paragraph_count": len(paragraphs)},
    #         "paragraphs": paragraphs,
    #         "raw_text": raw_text,
    #         "processed_text": self._process_text(raw_text),
    #     }

    def _extract_txt(self, file_path: Path) -> Dict:
        """Read a UTF-8 text file (undecodable bytes are dropped) line by line."""
        raw_text = file_path.read_text(encoding="utf-8", errors="ignore")

        lines = raw_text.split("\n")
        line_data = [
            {
                "line_number": line_number,
                "text": line,
                "char_count": len(line),
                "word_count": len(line.split()),
            }
            for line_number, line in enumerate(lines, start=1)
        ]

        return {
            "file_name": file_path.name,
            "file_type": "TXT",
            "metadata": {"line_count": len(lines)},
            "lines": line_data,
            "raw_text": raw_text,
            "processed_text": self._process_text(raw_text),
        }

    def _process_text(self, text: str) -> str:
        """Clean extracted text according to ``config["text_processing"]``.

        * ``preserve_paragraphs`` off: newlines are flattened to spaces, and
          with ``remove_extra_whitespace`` every whitespace run becomes one space.
        * ``preserve_paragraphs`` on: blank-line separated paragraphs are kept
          apart by exactly one empty line. With ``remove_extra_whitespace`` the
          whitespace *inside* each paragraph (including single line breaks) is
          collapsed to one space; without it only runs of spaces/tabs are.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        options = self.config["text_processing"]
        collapse = options["remove_extra_whitespace"]

        if not options["preserve_paragraphs"]:
            if collapse:
                flattened = re.sub(r"\s+", " ", text)
            else:
                flattened = text.replace("\n", " ").replace("\r", " ")
            return flattened.strip()

        if collapse:
            # Split on paragraph breaks *before* collapsing whitespace;
            # collapsing first would erase the breaks we are asked to keep.
            paragraphs = (
                re.sub(r"\s+", " ", chunk).strip()
                for chunk in re.split(r"\n\s*\n", text)
            )
            return "\n\n".join(paragraph for paragraph in paragraphs if paragraph)

        processed = re.sub(r"\n\s*\n", "\n\n", text)
        processed = re.sub(r"[ \t]+", " ", processed)
        return processed.strip()


# Shared extractor instance built from the notebook-wide CONFIG; used by the
# extraction cell of the processing pipeline.
extractor = DocumentExtractor(CONFIG)

### 4. Text Analysis Engine

In [None]:
class TextAnalyzer:
    """Compute simple word, sentence and readability statistics for a text.

    Sentences are approximated by splitting on runs of ``.``, ``!`` and ``?``;
    words are either whitespace-separated tokens (counts, reading metrics) or
    ``\\w+`` matches on the lower-cased text (frequency analysis).
    """

    _SENTENCE_BOUNDARY = re.compile(r"[.!?]+")
    _WORD_PATTERN = re.compile(r"\b\w+\b")
    # Reading speed assumed when estimating the reading time.
    _WORDS_PER_MINUTE = 200

    _DEFAULT_STOP_WORDS = frozenset(
        {
            "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
            "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
            "to", "was", "were", "will", "with", "you", "your", "this", "they",
            "have", "had", "what", "when", "where", "who", "which", "why", "how",
        }
    )

    def __init__(self, config: Dict):
        """Store the configuration and a per-instance, mutable stop-word set.

        Reads ``config["text_processing"]["min_word_length"]`` and
        ``config["analysis"]`` (``top_words_count``, ``sentence_sample_size``)
        when analysing.
        """
        self.config = config
        self.stop_words = set(self._DEFAULT_STOP_WORDS)

    def analyze(self, text: str) -> Dict:
        """Run every analysis and return the results grouped by category.

        Returns:
            Dict with ``basic_stats``, ``word_analysis``, ``sentence_analysis``
            and ``reading_metrics``. Empty text yields zeroed statistics.
        """
        return {
            "basic_stats": self._calculate_basic_stats(text),
            "word_analysis": self._analyze_words(text),
            "sentence_analysis": self._analyze_sentences(text),
            "reading_metrics": self._calculate_reading_metrics(text),
        }

    def _split_sentences(self, text: str) -> list:
        """Return the stripped, non-empty fragments between sentence terminators."""
        fragments = (part.strip() for part in self._SENTENCE_BOUNDARY.split(text))
        return [fragment for fragment in fragments if fragment]

    def _calculate_basic_stats(self, text: str) -> Dict:
        """Count characters, words, sentences and paragraphs.

        Paragraphs are blank-line (``"\\n\\n"``) separated chunks. Averages use
        a denominator of at least 1, so empty text yields 0 instead of raising.
        """
        words = text.split()
        sentence_count = len(self._split_sentences(text))
        paragraph_count = sum(1 for chunk in text.split("\n\n") if chunk.strip())
        # Sum the tokens themselves so newlines/tabs are not counted as
        # characters belonging to a word.
        word_char_total = sum(len(word) for word in words)

        return {
            "char_count": len(text),
            "char_count_no_spaces": len(text.replace(" ", "")),
            "word_count": len(words),
            "sentence_count": sentence_count,
            "paragraph_count": paragraph_count,
            "avg_words_per_sentence": len(words) / max(sentence_count, 1),
            "avg_chars_per_word": word_char_total / max(len(words), 1),
        }

    def _analyze_words(self, text: str) -> Dict:
        """Compute vocabulary statistics and the most frequent non-stop words.

        Frequencies ignore stop words and words shorter than
        ``min_word_length``; length statistics and vocabulary richness use
        every word.
        """
        words = self._WORD_PATTERN.findall(text.lower())
        unique_words = set(words)

        min_length = self.config["text_processing"]["min_word_length"]
        filtered_words = [
            word
            for word in words
            if len(word) >= min_length and word not in self.stop_words
        ]

        word_freq = Counter(filtered_words)
        word_lengths = [len(word) for word in words]

        return {
            "unique_words": len(unique_words),
            "unique_words_filtered": len(set(filtered_words)),
            "most_common_words": word_freq.most_common(
                self.config["analysis"]["top_words_count"]
            ),
            "avg_word_length": np.mean(word_lengths) if word_lengths else 0,
            "word_length_distribution": Counter(word_lengths),
            "vocabulary_richness": len(unique_words) / max(len(words), 1),
        }

    def _analyze_sentences(self, text: str) -> Dict:
        """Summarise sentence lengths (in words) and keep a leading sample."""
        sentences = self._split_sentences(text)
        sentence_lengths = [len(sentence.split()) for sentence in sentences]
        has_sentences = bool(sentence_lengths)

        return {
            "sentence_count": len(sentences),
            "avg_sentence_length": np.mean(sentence_lengths) if has_sentences else 0,
            "median_sentence_length": np.median(sentence_lengths)
            if has_sentences
            else 0,
            "sentence_length_std": np.std(sentence_lengths) if has_sentences else 0,
            "shortest_sentence": min(sentence_lengths) if has_sentences else 0,
            "longest_sentence": max(sentence_lengths) if has_sentences else 0,
            "sample_sentences": sentences[
                : self.config["analysis"]["sentence_sample_size"]
            ],
        }

    def _calculate_reading_metrics(self, text: str) -> Dict:
        """Estimate reading time and a coarse reading level.

        The complexity score is ``0.4 * avg words per sentence + 1.5 * avg
        characters per word`` and is bucketed at 8, 12 and 16.

        Returns:
            Dict with ``reading_time_minutes``, ``reading_level`` and
            ``complexity_score``. All three keys are present even for text
            without words or sentences, so consumers can index them safely.
        """
        words = text.split()
        sentences = self._split_sentences(text)

        if not words or not sentences:
            return {
                "reading_time_minutes": 0,
                "reading_level": "Unknown",
                "complexity_score": 0.0,
            }

        reading_time = len(words) / self._WORDS_PER_MINUTE

        avg_sentence_length = len(words) / len(sentences)
        avg_word_length = np.mean([len(word) for word in words])

        complexity_score = (avg_sentence_length * 0.4) + (avg_word_length * 1.5)

        if complexity_score < 8:
            reading_level = "Elementary"
        elif complexity_score < 12:
            reading_level = "Middle School"
        elif complexity_score < 16:
            reading_level = "High School"
        else:
            reading_level = "College+"

        return {
            "reading_time_minutes": round(reading_time, 1),
            "reading_level": reading_level,
            "complexity_score": round(complexity_score, 2),
        }


# Shared analyzer instance built from the notebook-wide CONFIG; used by the
# analysis cell of the processing pipeline.
analyzer = TextAnalyzer(CONFIG)

### 5. Document Processing Pipeline

In [None]:
# Resolve the document to process. When the configured file was not
# discovered, fall back to the first discovered document (updating CONFIG),
# or abort if nothing was found at all.
if CONFIG["target_file"] not in available_docs:
    justsdk.print_error(f"Target file '{CONFIG['target_file']}' not found")
    if not available_docs:
        raise FileNotFoundError("No documents found")
    CONFIG["target_file"] = next(iter(available_docs))
    justsdk.print_info(f"Using first available file: {CONFIG['target_file']}")

selected_name = CONFIG["target_file"]
target_doc = available_docs[selected_name]
justsdk.print_info(
    f"Processing: {target_doc['path'].name} ({target_doc['size_mb']} MB)"
)

In [None]:
justsdk.print_info("Extracting text...")
extraction_result = extractor.extract(target_doc["path"])

# A falsy result means the extractor already reported a failure; stop here.
if not extraction_result:
    raise RuntimeError("Text extraction failed")

justsdk.print_success("Text extraction completed")
print(f"File type: {extraction_result['file_type']}")
print(f"Raw text length: {len(extraction_result['raw_text']):,} characters")
print(
    f"Processed text length: {len(extraction_result['processed_text']):,} characters"
)

# Metadata is optional; print one "Key: value" line per entry when present.
doc_metadata = extraction_result["metadata"]
if doc_metadata:
    print("\nDocument metadata:")
    for field_name, field_value in doc_metadata.items():
        print(f"  {field_name.title()}: {field_value}")

In [None]:
# Analyse the cleaned text and print a summary of the headline numbers.
justsdk.print_info("Analyzing text...")
analysis_result = analyzer.analyze(extraction_result["processed_text"])

justsdk.print_success("Text analysis completed")
stats = analysis_result["basic_stats"]
reading = analysis_result["reading_metrics"]
# The reading metrics omit "complexity_score" for text without words or
# sentences (e.g. a scanned PDF with no text layer); fall back instead of
# failing with a KeyError.
complexity_display = reading.get("complexity_score", "n/a")

print(f"""\nText Statistics:
  Characters: {stats["char_count"]:,} (including spaces)
  Words: {stats["word_count"]:,}
  Sentences: {stats["sentence_count"]:,}
  Paragraphs: {stats["paragraph_count"]:,}
  Average words per sentence: {stats["avg_words_per_sentence"]:.1f}
  Average characters per word: {stats["avg_chars_per_word"]:.1f}

Reading Metrics:
  Estimated reading time: {reading["reading_time_minutes"]} minutes
  Reading level: {reading["reading_level"]}
  Complexity score: {complexity_display}
""")

### 6. Text Content Preview

In [None]:
# Show the first 500 characters of the raw and the cleaned text, re-wrapped
# to 80 columns, followed by up to five example sentences.
justsdk.print_info("Text Content Preview:")
separator = "=" * 80
raw_preview = extraction_result["raw_text"][:500] + "..."
processed_preview = extraction_result["processed_text"][:500] + "..."

print(separator)
print("RAW TEXT (first 500 characters):")
print(textwrap.fill(raw_preview, width=80))
print("\n" + separator)
print("PROCESSED TEXT (first 500 characters):")
print(textwrap.fill(processed_preview, width=80))
print(separator)

sample_sentences = analysis_result["sentence_analysis"]["sample_sentences"]
if sample_sentences:
    print("\nSample sentences:")
    for position, sentence_text in enumerate(sample_sentences[:5], start=1):
        # Hanging indent keeps wrapped lines aligned under the sentence text.
        print(
            textwrap.fill(
                sentence_text,
                width=75,
                initial_indent=f"  {position}. ",
                subsequent_indent="     ",
            )
        )

### 7. Text Analysis Visualizations

In [None]:
# Four-panel overview: top words, word-length distribution, sentence-length
# histogram (sample only) and headline complexity metrics.
word_data = analysis_result["word_analysis"]

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Panel 1: horizontal bars for the 15 most frequent non-stop words.
if word_data["most_common_words"]:
    words, counts = zip(*word_data["most_common_words"][:15])
    ax1.barh(range(len(words)), counts, color="skyblue")
    ax1.set_yticks(range(len(words)))
    ax1.set_yticklabels(words)
    ax1.set_xlabel("Frequency")
    ax1.set_title("Most Common Words (excluding stop words)")
    # Most frequent word at the top.
    ax1.invert_yaxis()

# Panel 2: how many words there are of each length.
length_dist = word_data["word_length_distribution"]
if length_dist:
    lengths = sorted(length_dist.keys())
    counts = [length_dist[length] for length in lengths]
    ax2.bar(lengths, counts, color="lightcoral", alpha=0.7)
    ax2.set_xlabel("Word Length (characters)")
    ax2.set_ylabel("Frequency")
    ax2.set_title("Word Length Distribution")
    ax2.grid(True, alpha=0.3)

# Panel 3: histogram over the *sample* sentences only; the dashed line is the
# average over all sentences.
sentence_data = analysis_result["sentence_analysis"]
sample_sentences = sentence_data["sample_sentences"]
sentence_lengths = (
    [len(s.split()) for s in sample_sentences] if sample_sentences else []
)

if sentence_lengths:
    ax3.hist(
        sentence_lengths,
        # Never ask for more bins than there are distinct lengths.
        bins=min(10, len(set(sentence_lengths))),
        color="lightgreen",
        alpha=0.7,
        edgecolor="black",
    )
    ax3.axvline(
        sentence_data["avg_sentence_length"],
        color="red",
        linestyle="--",
        label=f"Average: {sentence_data['avg_sentence_length']:.1f}",
    )
    ax3.set_xlabel("Sentence Length (words)")
    ax3.set_ylabel("Frequency")
    ax3.set_title("Sentence Length Distribution (sample)")
    ax3.legend()
    ax3.grid(True, alpha=0.3)

# Panel 4: headline metrics side by side (note: they are on different scales).
# "complexity_score" is absent from the reading metrics when the text has no
# words or sentences, so default to 0 rather than raising a KeyError.
complexity_score = analysis_result["reading_metrics"].get("complexity_score", 0)
metrics = [
    ("Vocabulary\nRichness", word_data["vocabulary_richness"]),
    ("Avg Word\nLength", word_data["avg_word_length"]),
    ("Avg Sentence\nLength", sentence_data["avg_sentence_length"]),
    ("Complexity\nScore", complexity_score),
]

metric_names, metric_values = zip(*metrics)
colors = ["gold", "lightblue", "lightgreen", "salmon"]
ax4.bar(metric_names, metric_values, color=colors, alpha=0.7)
ax4.set_ylabel("Score")
ax4.set_title("Text Complexity Metrics")
ax4.grid(True, alpha=0.3)

# Value labels just above each bar (offset is 1% of the tallest bar).
for i, v in enumerate(metric_values):
    ax4.text(
        i,
        v + max(metric_values) * 0.01,
        f"{v:.2f}",
        ha="center",
        va="bottom",
        fontweight="bold",
    )

plt.tight_layout()
plt.show()

In [None]:
# Two-panel figure: the left panel is reserved for the (currently disabled)
# word cloud, the right panel shows the document structure as a pie chart.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
# Hide the empty frame of the reserved panel while the word cloud is disabled.
ax1.axis("off")

# NOTE: WordCloud is purely cosmetic; the dependency is removed for now.
# if word_data["most_common_words"]:
#     word_freq_dict = dict(
#         word_data["most_common_words"][: CONFIG["analysis"]["wordcloud_max_words"]]
#     )

#     wordcloud = WordCloud(
#         width=800,
#         height=400,
#         background_color="white",
#         max_words=CONFIG["analysis"]["wordcloud_max_words"],
#         colormap="viridis",
#     ).generate_from_frequencies(word_freq_dict)

#     ax1.imshow(wordcloud, interpolation="bilinear")
#     ax1.axis("off")
#     ax1.set_title("Word Cloud (Most Frequent Words)", fontsize=14, fontweight="bold")

stats = analysis_result["basic_stats"]
structure_data = [
    ("Characters", stats["char_count"]),
    ("Words", stats["word_count"]),
    ("Sentences", stats["sentence_count"]),
    ("Paragraphs", stats["paragraph_count"]),
]

labels, sizes = zip(*structure_data)
colors = plt.cm.Set3(np.linspace(0, 1, len(labels)))

# Wedge sizes are the raw counts; the percentage labels come from autopct.
wedges, texts, autotexts = ax2.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct=lambda pct: f"{pct:.1f}%",
    startangle=90,
)

# Annotate each wedge with its absolute count. Wedge angles are in degrees,
# while annotations are positioned in data coordinates (pie centred on the
# origin, radius 1), so convert the mid-angle to an (x, y) point inside the
# wedge instead of using the angle itself as an x-coordinate.
count_label_radius = 0.8
for wedge, (_label, size) in zip(wedges, structure_data):
    mid_angle = np.deg2rad((wedge.theta1 + wedge.theta2) / 2)
    ax2.annotate(
        f"{size:,}",
        xy=(
            count_label_radius * np.cos(mid_angle),
            count_label_radius * np.sin(mid_angle),
        ),
        xycoords="data",
        ha="center",
        va="center",
        fontweight="bold",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
    )

ax2.set_title("Document Structure", fontsize=14, fontweight="bold")

plt.tight_layout()
plt.show()