# Data Hygiene & Noise


In [None]:
import fitz
import re
from typing import List, Dict
from collections import Counter
from IPython.display import display, Markdown

class DocumentCleaner:
    """
    Remove boilerplate, headers, footers, and noise from documents.
    """
    
    def __init__(self):
        self.common_patterns = [
            r'Page \d+ of \d+',
            r'^\d+\s*$',  # Page numbers alone
            r'Confidential',
            r'Internal Use Only',
            r'©.*?\d{4}',  # Copyright notices
            r'All Rights Reserved',
            r'DRAFT',
            r'For Internal Use Only'
        ]
    
    def clean_text(self, text: str) -> str:
        """
        Remove common boilerplate patterns.
        """
        cleaned = text
        
        # Remove common patterns
        for pattern in self.common_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        
        # Remove excessive whitespace
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
        cleaned = re.sub(r' {2,}', ' ', cleaned)
        
        # Remove lines that are only symbols/dashes
        lines = cleaned.split('\n')
        lines = [line for line in lines if not re.match(r'^[-=_*]{3,}$', line.strip())]
        cleaned = '\n'.join(lines)
        
        return cleaned.strip()
    
    def detect_headers_footers(self, pages: List[str]) -> Dict:
        """
        Detect repeated headers/footers across pages.
        """
        if len(pages) < 3:
            return {'headers': [], 'footers': []}
        
        # Get first/last 3 lines of each page
        first_lines = ['\n'.join(page.split('\n')[:3]) for page in pages]
        last_lines = ['\n'.join(page.split('\n')[-3:]) for page in pages]
        
        # Find most common patterns
        header_counter = Counter(first_lines)
        footer_counter = Counter(last_lines)
        
        # A header/footer should appear on at least 30% of pages
        threshold = len(pages) * 0.3
        
        headers = [text for text, count in header_counter.items() if count >= threshold]
        footers = [text for text, count in footer_counter.items() if count >= threshold]
        
        return {'headers': headers, 'footers': footers}
    
    def remove_headers_footers(self, pages: List[str]) -> List[str]:
        """
        Remove detected headers and footers from all pages.
        """
        boilerplate = self.detect_headers_footers(pages)
        
        cleaned_pages = []
        for page in pages:
            page_text = page
            
            # Remove headers
            for header in boilerplate['headers']:
                page_text = page_text.replace(header, '', 1)  # Only first occurrence
            
            # Remove footers
            for footer in boilerplate['footers']:
                # Remove from end
                if page_text.endswith(footer):
                    page_text = page_text[:-len(footer)]
            
            cleaned_pages.append(page_text.strip())
        
        return cleaned_pages


# Extract PDF pages as text
def extract_pdf_pages(pdf_path: str) -> List[str]:
    """
    Return a list where each element is the text of one PDF page.
    """
    doc = fitz.open(pdf_path)
    pages = []
    for page in doc:
        pages.append(page.get_text("text"))
    doc.close()
    return pages


In [None]:
# Clean and display PDF pages
pdf_pages = extract_pdf_pages("RAG_BENCHMARK.pdf")
cleaner = DocumentCleaner()
cleaned_pages = cleaner.remove_headers_footers(pdf_pages)

# Display all cleaned pages as Markdown
for i, page_text in enumerate(cleaned_pages):
    display(Markdown(f"### Cleaned Page {i+1}\n\n{page_text}"))

### Cleaned Page 1

ACME CORPORATION — INTERNAL USE ONLY
Page 1
Internal Operations & Knowledge Consolidation 2024

### Cleaned Page 2

ACME CORPORATION — INTERNAL USE ONLY
Page 2
Operational Overview
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval. Tables
contained numeric sequences that, when extracted incorrectly, reversed intended meaning. Certain
dependencies introduced latency that could not be isolated to a single functional unit. Internal
coordination benefited from informal escalation paths that were not formally documented.
Certain dependencies introduced latency that could not be isolated to a single functional unit. A
multi-step procedure required careful cross-checks between related sections to maintain consistency.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Some sections referenced visual data with ambiguous labels,
complicating automated retrieval. Internal coordination benefited from informal escalation paths that
were not formally documented.
Tables contained numeric sequences that, when extracted incorrectly, reversed intended meaning.
Certain dependencies introduced latency that could not be isolated to a single functional unit. Minor
inconsistencies in notation arose across the different submodules, impacting interpretation.
Unit
Score
Rank
A1
78.4
3
B2
91.2
1
C7
66.9
5
Some sections referenced visual data with ambiguous labels, complicating automated retrieval. These
conditions persisted without materially altering aggregate outcomes. Minor inconsistencies in notation
arose across the different submodules, impacting interpretation.

### Cleaned Page 3

ACME CORPORATION — INTERNAL USE ONLY
Page 3

### Cleaned Page 4

ACME CORPORATION — INTERNAL USE ONLY
Page 4
Regional Observations
Tables contained numeric sequences that, when extracted incorrectly, reversed intended meaning.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. Internal coordination benefited from informal escalation paths that were not formally
documented. Operational throughput exhibited non-linear adjustments over the observation window,
influenced by regional scheduling constraints.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Internal coordination benefited from informal escalation paths that were not formally documented.
Certain dependencies introduced latency that could not be isolated to a single functional unit. Minor
inconsistencies in notation arose across the different submodules, impacting interpretation. Some
sections referenced visual data with ambiguous labels, complicating automated retrieval. A multi-step
procedure required careful cross-checks between related sections to maintain consistency.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. Internal coordination benefited from informal escalation paths that were not formally
documented. A multi-step procedure required careful cross-checks between related sections to
maintain consistency. Embedded diagrams provided contextual information not easily referenced in the
surrounding prose. These conditions persisted without materially altering aggregate outcomes.
Region
Index α
Index β
Index γ
Status
North
0.82
1.14
0.77
Open
East
0.64
1.02
0.69
Limited

### Cleaned Page 5

ACME CORPORATION — INTERNAL USE ONLY
Page 5
Infrastructure Summary
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Certain dependencies introduced latency that could not be isolated to a single functional unit. Tables
contained numeric sequences that, when extracted incorrectly, reversed intended meaning. These
conditions persisted without materially altering aggregate outcomes. Some sections referenced visual
data with ambiguous labels, complicating automated retrieval.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. A multi-step procedure required careful cross-checks between related
sections to maintain consistency.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. Minor inconsistencies in notation arose across the different submodules, impacting
interpretation. A multi-step procedure required careful cross-checks between related sections to
maintain consistency.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval. Certain
dependencies introduced latency that could not be isolated to a single functional unit. Embedded
diagrams provided contextual information not easily referenced in the surrounding prose. Operational
throughput exhibited non-linear adjustments over the observation window, influenced by regional
scheduling constraints.
Certain dependencies introduced latency that could not be isolated to a single functional unit.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. A multi-step procedure required careful cross-checks between related sections to maintain
consistency. Tables contained numeric sequences that, when extracted incorrectly, reversed intended
meaning.
Region
Index α
Index β
Index γ
Status
North
0.82
1.14
0.77
Open
East
0.64
1.02
0.69
Limited

### Cleaned Page 6

ACME CORPORATION — INTERNAL USE ONLY
Page 6
Certain dependencies introduced latency that could not be isolated to a single functional unit.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Internal coordination benefited from informal escalation paths that were
not formally documented.

### Cleaned Page 7

ACME CORPORATION — INTERNAL USE ONLY
Page 7
Extended Records
Certain dependencies introduced latency that could not be isolated to a single functional unit.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Internal coordination benefited from informal escalation paths that were not formally documented.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction.
A multi-step procedure required careful cross-checks between related sections to maintain consistency.
Internal coordination benefited from informal escalation paths that were not formally documented.
Certain dependencies introduced latency that could not be isolated to a single functional unit.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. These conditions persisted without materially altering aggregate outcomes.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Certain dependencies introduced latency that could not be isolated to a single functional unit. Internal
coordination benefited from informal escalation paths that were not formally documented.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. These conditions persisted without materially altering aggregate
outcomes.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose. A
multi-step procedure required careful cross-checks between related sections to maintain consistency.
These conditions persisted without materially altering aggregate outcomes.
Region
Index α
Index β
Index γ
Status
North
0.82
1.14
0.77
Open
East
0.64
1.02
0.69
Limited

### Cleaned Page 8

ACME CORPORATION — INTERNAL USE ONLY
Page 8

### Cleaned Page 9

ACME CORPORATION — INTERNAL USE ONLY
Page 9
Financial Notes
Internal coordination benefited from informal escalation paths that were not formally documented.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. A multi-step procedure required careful cross-checks between related
sections to maintain consistency.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. Some sections referenced visual data with ambiguous labels, complicating automated
retrieval. Operational throughput exhibited non-linear adjustments over the observation window,
influenced by regional scheduling constraints. Tables contained numeric sequences that, when
extracted incorrectly, reversed intended meaning. These conditions persisted without materially altering
aggregate outcomes.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
These conditions persisted without materially altering aggregate outcomes.
Region
Index α
Index β
Index γ
Status
North
0.82
1.14
0.77
Open
East
0.64
1.02
0.69
Limited
Tables contained numeric sequences that, when extracted incorrectly, reversed intended meaning.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Some sections referenced visual data with ambiguous labels,
complicating automated retrieval. Embedded diagrams provided contextual information not easily

### Cleaned Page 10

ACME CORPORATION — INTERNAL USE ONLY
Page 10
referenced in the surrounding prose.

### Cleaned Page 11

ACME CORPORATION — INTERNAL USE ONLY
Page 11
Analytics Highlights
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Internal coordination benefited from informal escalation paths that were
not formally documented. Some sections referenced visual data with ambiguous labels, complicating
automated retrieval. Certain dependencies introduced latency that could not be isolated to a single
functional unit. Tables contained numeric sequences that, when extracted incorrectly, reversed
intended meaning.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval.
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. These conditions persisted without materially altering aggregate
outcomes. Tables contained numeric sequences that, when extracted incorrectly, reversed intended
meaning. Internal coordination benefited from informal escalation paths that were not formally
documented.
These conditions persisted without materially altering aggregate outcomes. Some sections referenced
visual data with ambiguous labels, complicating automated retrieval. Operational throughput exhibited
non-linear adjustments over the observation window, influenced by regional scheduling constraints.
Unit
Score
Rank
A1
78.4
3
B2
91.2
1
C7
66.9
5

### Cleaned Page 12

ACME CORPORATION — INTERNAL USE ONLY
Page 12

### Cleaned Page 13

ACME CORPORATION — INTERNAL USE ONLY
Page 13
Supplementary Data
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Comparative analysis against prior intervals suggests gradual
stabilization rather than abrupt correction. Internal coordination benefited from informal escalation paths
that were not formally documented. Some sections referenced visual data with ambiguous labels,
complicating automated retrieval. These conditions persisted without materially altering aggregate
outcomes.
Minor inconsistencies in notation arose across the different submodules, impacting interpretation.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Some sections referenced visual data with ambiguous labels,
complicating automated retrieval. These conditions persisted without materially altering aggregate
outcomes.
A multi-step procedure required careful cross-checks between related sections to maintain consistency.
Some sections referenced visual data with ambiguous labels, complicating automated retrieval. Minor
inconsistencies in notation arose across the different submodules, impacting interpretation.
Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt
correction. A multi-step procedure required careful cross-checks between related sections to maintain
consistency. Tables contained numeric sequences that, when extracted incorrectly, reversed intended
meaning. Minor inconsistencies in notation arose across the different submodules, impacting
interpretation.
Unit
Score
Rank
A1
78.4
3
B2
91.2
1
C7
66.9
5

### Cleaned Page 14

ACME CORPORATION — INTERNAL USE ONLY
Page 14
Embedded diagrams provided contextual information not easily referenced in the surrounding prose.
Certain dependencies introduced latency that could not be isolated to a single functional unit. Some
sections referenced visual data with ambiguous labels, complicating automated retrieval.