# Stage 2: Document Dataset Quality Check

In [1]:
import shutil
from di_parser import Document, document_from_xml, UnsupportedDocumentType
from pathlib import Path
from IPython.display import Markdown, display

## Load the Dataset

In [2]:
# Parse every *.di file under the dataset root, keeping each parsed document
# paired with the path it was loaded from.
root = Path("./resources/2024-Oct-unique")
if not root.is_dir():
    # Fail fast on a wrong path instead of reporting zero parsed documents.
    raise FileNotFoundError(f"Dataset directory not found: {root}")

all_docfiles: list[tuple[Path, Document]] = []
doc_error = 0
# Sort the paths so the processing order is the same on every run.
for file in sorted(root.rglob("*.di")):
    try:
        doc = document_from_xml(file)
    except UnsupportedDocumentType:
        # TODO: consider supporting more document types, e.g. petition cases.
        doc_error += 1
        continue

    all_docfiles.append((file, doc))

print(f"Parsed {len(all_docfiles)} documents. Ignored {doc_error} unsupported documents.")

Parsed 17180 documents. Ignored 1456 unsupported documents.


## Quality Check

If **subject** or **description** is empty, then the document is invalid.

If the document type is **簽** and the **act** is empty, then the document is invalid.

In [3]:
def is_valid_document(doc: Document) -> bool:
    """Return True if ``doc`` passes the dataset quality rules.

    A document is rejected when its subject or its description is falsy
    (empty), or when its document type is "簽" and its act is falsy.
    """
    if not doc.subject or not doc.description:
        return False
    # Documents of type "簽" must additionally carry a non-empty act.
    return not (doc.document_type == "簽" and not doc.act)


# Keep only the (path, document) pairs whose document passes the checks.
verified_docs: list[tuple[Path, Document]] = [
    (file, doc) for file, doc in all_docfiles if is_valid_document(doc)
]

print(f"Verified {len(verified_docs)} documents.")

Verified 16939 documents.


## Write Out Verified Documents

In [4]:
# Copy every verified document into one flat output directory.
# NOTE: exist_ok=True reuses an existing directory, and nothing here removes
# files left over from an earlier run.
output_root = Path("./resources/2024-Oct-cleaned")
output_root.mkdir(parents=True, exist_ok=True)

# The inputs were collected recursively but are written flat by file name, so
# two documents with the same name in different folders would overwrite each
# other. Detect that before copying anything.
first_seen: dict[str, Path] = {}
collisions: list[tuple[Path, Path]] = []
for file, _ in verified_docs:
    earlier = first_seen.setdefault(file.name, file)
    if earlier != file:
        collisions.append((earlier, file))

if collisions:
    examples = ", ".join(f"{a} vs {b}" for a, b in collisions[:5])
    raise ValueError(
        f"{len(collisions)} verified documents share a file name and would "
        f"overwrite each other in {output_root}: {examples}"
    )

for file, _ in verified_docs:
    shutil.copy(file, output_root / file.name)