## Extract from text-based docs

This notebook demonstrates a pipeline that:

- Extracts text from text-based documents using `pymupdf`.

### 1. Importing libraries

In [None]:
import warnings
import justsdk
import pymupdf

from _constants import SAMPLE_DIR

### 2. Configurations

In [None]:
# Silence every warning for the rest of the session so cell output stays clean.
warnings.filterwarnings("ignore")

# Directory that is scanned for sample documents: the "text" entry under SAMPLE_DIR.
TARGET_DIR = SAMPLE_DIR / "text"

# File name of the sample to extract text from; used as the lookup key into
# the sample listing in the extraction step.
TARGET_SAMPLE = "agile-method.pdf"

### 3. Get sample files

In [None]:
def get_sample_files() -> dict:
    """Collect the ``.pdf`` and ``.docx`` entries found directly in ``TARGET_DIR``.

    Returns:
        dict: Maps each entry's file name to a dict with two keys:
            ``"path"`` (the path object yielded by ``TARGET_DIR.glob``) and
            ``"size_mb"`` (its size in bytes divided by 1024 * 1024, as a float).
            PDF matches are inserted before DOCX matches.
    """
    bytes_per_mb = 1024 * 1024
    found = {}

    # One glob pass per supported extension, PDFs first.
    for pattern in ("*.pdf", "*.docx"):
        for file_path in TARGET_DIR.glob(pattern):
            found[file_path.name] = {
                "path": file_path,
                "size_mb": file_path.stat().st_size / bytes_per_mb,
            }

    return found


# Build the name -> {"path", "size_mb"} listing once; the extraction step
# below looks the target sample up in it.
sample_input = get_sample_files()

justsdk.print_info("Sample files found:")
for name, info in sample_input.items():
    # Pull the two fields out first so the format string stays readable.
    size_mb, path = info["size_mb"], info["path"]
    print(f"  {name} ({size_mb:.2f} MB): {path}")

### 4. Extract text

In [None]:
# Open the target sample inside a context manager so the underlying file
# handle is released as soon as the text has been read, instead of staying
# open for the rest of the session. Raises KeyError if TARGET_SAMPLE is not
# among the discovered sample files.
with pymupdf.open(sample_input[TARGET_SAMPLE]["path"]) as target_doc:
    # Iterating the document yields its pages in order; concatenate the
    # text of every page into one string.
    text_target_doc = "".join(page.get_text() for page in target_doc)
# NOTE: `target_doc` is closed from here on; reopen the file if page-level
# access is needed again.

# Flatten the text to a single line by turning each newline / carriage
# return character into a space.
text_clean_target_doc = text_target_doc.replace("\n", " ").replace("\r", " ")

# Preview only the beginning of each variant to keep the output short.
justsdk.print_info("Extracted text (raw):")
print(text_target_doc[:500] + "...")

justsdk.print_info("Extracted text (cleaned):")
print(text_clean_target_doc[:100] + "...")