# In [None]:
"""
Robust OCR-cleaner 
- Conservative heuristics + audit CSV for every automatic correction.
- Jupyter-friendly: call the functions directly.
"""

import re
import csv
from pathlib import Path
import docx
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from collections import Counter
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.enum.text import WD_COLOR_INDEX
from wordfreq import zipf_frequency
class SafeDict:
    """Frequency-based stand-in for a spell-check dictionary.

    A word is accepted when it is listed in ``custom_words`` or when
    ``zipf_frequency(word, "en")`` is at least ``threshold``.
    """

    def __init__(self, threshold=2.5, custom_words=None):
        self.custom_words = set(custom_words) if custom_words else set()
        self.threshold = threshold

    def check(self, word):
        """Return True when *word* is whitelisted or frequent enough.

        The word is lower-cased and surrounding ``.,;:()[]{}`` characters
        are stripped before the lookup. Empty/None input yields False.
        """
        if not word:
            return False

        punctuation = ".,;:()[]{}"
        normalized = word.lower().strip(punctuation)

        whitelisted = normalized in self.custom_words
        if whitelisted:
            return True
        return zipf_frequency(normalized, "en") >= self.threshold

    def suggest(self, word):
        """Return an empty list; present only so callers expecting it do not crash."""
        return []


# Column names written as the first row of the audit CSV by process_docx_file.
AUDIT_HEADER = ["file","para_index","action","pattern","before_preview","after_preview"]

# Utilities
# Shared SafeDict instance with a few economics terms whitelisted.
# NOTE(review): no function visible in this file references EN_DICT; the
# spelling checks below call is_word() directly — confirm whether it is still needed.
EN_DICT = SafeDict(
    threshold=2.5,
    custom_words={
        "econometrics",
        "macroeconomics",
        "keynesian",
        "neoclassical",
    }
)

# An ASCII letter followed by one or more ASCII letters/apostrophes (i.e. at least two characters).
EN_WORD_RE = re.compile(r"^[A-Za-z][A-Za-z']+$")
# One or more characters from the U+0600–U+06FF or U+0750–U+077F ranges.
UR_WORD_RE = re.compile(r"^[\u0600-\u06FF\u0750-\u077F]+$")

def is_valid_urdu(word):
    """Return True when the Urdu ("ur") Zipf frequency of *word* is at least 2.0."""
    frequency = zipf_frequency(word, "ur")
    return frequency >= 2.0
def is_word(w, threshold=2.5):
    """Return True when the English Zipf frequency of lower-cased *w* reaches *threshold*."""
    frequency = zipf_frequency(w.lower(), "en")
    return frequency >= threshold

# Whole-token match for characters in the U+0600–U+06FF and U+0750–U+077F ranges.
URDU_SCRIPT_RE = re.compile(r'^[\u0600-\u06FF\u0750-\u077F]+$')


def is_urdu_word(token: str) -> bool:
    """Return True when *token* consists solely of characters matched by URDU_SCRIPT_RE."""
    found = URDU_SCRIPT_RE.match(token)
    return found is not None


def _safe_preview(s, n=140):
    """Return *s* unchanged when it has at most *n* characters, else its first *n* plus "..."."""
    if len(s) <= n:
        return s
    return s[:n] + "..."
# Character class matching one Unicode hyphen/dash variant (U+2010–U+2015) or a soft hyphen (U+00AD).
# NOTE(review): not referenced by any function visible in this file.
UNICODE_HYPHEN_PATTERN = re.compile(
    "[" +
    "\u2010"  # hyphen
    "\u2011"  # non-breaking hyphen
    "\u2012"  # figure dash
    "\u2013"  # en dash
    "\u2014"  # em dash
    "\u2015"  # horizontal bar
    "\u00AD"  # soft hyphen
    "]"
)

def highlight_misspellings_in_paragraph(paragraph):
    """
    Highlight (in yellow) English and Urdu words that fail the frequency check.

    Tokens shorter than four characters and all-uppercase tokens (acronyms)
    are never flagged. English tokens are checked with is_word(), Urdu tokens
    with is_valid_urdu(); mixed-script tokens are ignored.

    When nothing is flagged the paragraph is left untouched. Otherwise the
    paragraph content is cleared and rebuilt from its plain text, one run per
    token, so any formatting the previous runs carried (bold, size, RTL, ...)
    is discarded. Callers that need run formatting should apply it after
    calling this function.

    Args:
        paragraph: a python-docx paragraph; modified in place.

    Returns:
        None.
    """
    token_re = re.compile(r"\b[A-Za-z\u0600-\u06FF\u0750-\u077F']+\b")
    split_re = re.compile(r"(\b[A-Za-z\u0600-\u06FF\u0750-\u077F']+\b)")

    text = paragraph.text
    misspelled = set()

    for w in token_re.findall(text):
        # Skip short tokens and acronyms.
        if len(w) < 4 or w.isupper():
            continue

        if EN_WORD_RE.match(w):
            if not is_word(w):
                misspelled.add(w)
        elif UR_WORD_RE.match(w):
            if not is_valid_urdu(w):
                misspelled.add(w)
        # Mixed / garbage tokens -> ignored on purpose.

    if not misspelled:
        return

    # Clear the paragraph and rebuild it token by token.
    paragraph._p.clear_content()

    for tok in split_re.split(text):
        # re.split yields empty strings at the edges; adding them would
        # only litter the document with empty runs.
        if not tok:
            continue
        run = paragraph.add_run(tok)
        if tok in misspelled:
            run.font.highlight_color = WD_COLOR_INDEX.YELLOW

def normalize_unicode_and_spaces(text: str) -> str:
    """Normalize spacing characters and line breaks in *text*.

    Non-breaking/figure/narrow spaces become plain spaces, zero-width
    characters are removed, CRLF/CR become LF, runs of spaces/tabs collapse
    to one space, and three or more newlines collapse to exactly two (so
    paragraph separation survives). The result is stripped; None yields "".
    """
    if text is None:
        return ""

    cleaned = text
    for odd_space in ("\u00A0", "\u2007", "\u202F"):
        cleaned = cleaned.replace(odd_space, " ")

    # Zero-width characters carry no content.
    cleaned = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", cleaned)

    # CRLF must be handled before lone CR.
    for line_break in ("\r\n", "\r"):
        cleaned = cleaned.replace(line_break, "\n")

    substitutions = (
        (r"[ \t\f\v]+", " "),   # collapse spaces/tabs
        (r"\n{3,}", "\n\n"),    # keep at most one blank line
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def normalize_em_dashes(text):
    """
    Normalize OCR dash artifacts between ASCII letters into em dashes (U+2014).

    Handles:
    - word - word   (a single hyphen, en dash or em dash surrounded by
                     whitespace) -> "word \u2014 word" style, spaces kept
    - word--word, word-- word, word --word, word -- word
                    (two or more hyphens, optional whitespace) -> tight em dash
    Leaves alone:
    - hyphenated words ("well-known")
    - numeric ranges and minus signs (a letter is required on both sides)

    The dash characters are written as escapes so the patterns cannot be
    corrupted by a mis-encoded source file.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    em_dash = "\u2014"

    # Spaced single dash between words -> spaced em dash.
    text = re.sub(
        r'(?<=[A-Za-z])\s+[-\u2013\u2014]\s+(?=[A-Za-z])',
        f' {em_dash} ',
        text
    )

    # Two or more hyphens between words, with or without surrounding
    # whitespace -> tight em dash. This also covers the plain "word--word" case.
    text = re.sub(
        r'(?<=[A-Za-z])\s*-{2,}\s*(?=[A-Za-z])',
        em_dash,
        text
    )

    return text

# Regex strings for repetitive OCR headers (specific to this book or similar structures).
# Starts empty; process_docx_file appends anchored patterns for the headers it detects.
HEADER_PATTERNS = []

def detect_repetitive_headers(paragraphs, min_repeats=3):
    """
    Find lines that recur often enough to look like running headers/footers.

    A stripped line is counted only if it is non-empty, is not a bare page
    number, and contains at least three ASCII letters. A counted line is
    reported when it occurs at least *min_repeats* times, is not entirely
    lowercase, and does not start with a digit.

    Returns the set of such lines.
    """
    page_number_re = re.compile(r"^[-‚Äì‚Äî]?\s*\(?\d{1,3}\)?\s*[-‚Äì‚Äî]?$")

    def _countable(line):
        if not line:
            return False
        if page_number_re.match(line):
            return False
        # Roman numerals and very short tokens have two letters or fewer.
        return len(re.findall(r"[A-Za-z]", line)) > 2

    stripped_lines = (p.strip() for p in paragraphs)
    counts = Counter(line for line in stripped_lines if _countable(line))

    repetitive = set()
    for line, freq in counts.items():
        if freq < min_repeats:
            continue
        if line.islower() or line[0].isdigit():
            continue
        repetitive.add(line)

    return repetitive

def remove_known_headers(paragraphs, detected_headers, audit_writer=None, filename=None):
    """
    Drop repeated occurrences of the lines in *detected_headers*.

    The first paragraph whose stripped text equals a detected header is kept;
    later ones are dropped and, when *audit_writer* is truthy, logged as a
    "remove_header" row carrying the paragraph index within *paragraphs*.
    All other paragraphs pass through unchanged.
    """
    kept = []
    already_kept = set()

    for index, para in enumerate(paragraphs):
        key = para.strip()
        is_header = key in detected_headers

        if is_header and key in already_kept:
            # Repeated header: drop it and record the removal.
            if audit_writer:
                audit_writer.writerow([
                    filename or "",
                    index,
                    "remove_header",
                    "repetitive_header_duplicate",
                    _safe_preview(key),
                    ""
                ])
            continue

        if is_header:
            already_kept.add(key)
        kept.append(para)

    return kept


def fix_ocr_hyphenated_words(text):
    """
    Repair words that OCR broke as "<letters>-<whitespace><letters>".

    For every such pair the two halves are joined; if is_word() accepts the
    joined form the hyphen is dropped, otherwise the pair is written as
    "<first>-<second>" (whitespace removed, hyphen kept). The substitution is
    applied three times so chained breaks are resolved.
    """
    broken_word = re.compile(r'([A-Za-z]+)-\s+([A-Za-z]+)')

    def _rejoin(match):
        head, tail = match.groups()
        joined = head + tail
        if is_word(joined.lower()):
            return joined
        return head + "-" + tail

    remaining_passes = 3
    while remaining_passes:
        text = broken_word.sub(_rejoin, text)
        remaining_passes -= 1
    return text



def enhanced_ocr_preclean(text):
    """
    Apply the OCR clean-up pipeline for academic texts to *text*.

    Steps, in order: whitespace/Unicode normalization, re-joining of
    hyphen-broken words, hyphen spacing, removal of academic markers,
    removal of page-number-only paragraphs, removal of numeric/footnote
    artifacts, quote normalization, and final whitespace tidy-up.

    Args:
        text: raw text; paragraphs are separated by blank lines.

    Returns:
        The cleaned text. Falsy input (None or "") is returned unchanged.
    """
    if not text:
        return text

    # 1. Initial normalization
    text = normalize_unicode_and_spaces(text)

    # 2. Re-join words broken by "hyphen + whitespace"
    text = fix_ocr_hyphenated_words(text)

    # 3. Normalize spaced hyphens
    # NOTE(review): \s also matches newlines, so a hyphen at a line or
    # paragraph edge pulls the neighbouring lines together — confirm intended.
    text = re.sub(r'\s*-\s*', "-", text)

    # 4. Clean academic-specific markers
    text = re.sub(r'(?m)^\s*\[?[ivxIVX]+\]?\s*$', '', text)  # Roman-numeral-only lines like [i], [ii]
    text = re.sub(r"(?m)^\*\s?.*\n?", "", text)              # any line starting with an asterisk (*)
    text = re.sub(r'\s*\[\d{1,3}\]\s*', ' ', text)           # bracketed references like [123]

    # 5. Paragraph segmentation: drop empty and page-number-only paragraphs
    #    ("12", "- 3 -", "(4)", also with en/em dashes).
    page_number_re = re.compile(r'^[-\u2013\u2014]?\s*\(?\d{1,3}\)?\s*[-\u2013\u2014]?$')
    cleaned_paragraphs = []
    for para in text.split('\n\n'):
        para = para.strip()
        if not para:
            continue
        if para.isdigit() or page_number_re.match(para):
            continue
        cleaned_paragraphs.append(para)

    # 6. Join cleaned paragraphs back
    text = '\n\n'.join(cleaned_paragraphs)

    # Rewrite times like "11.40 A.M" as "11:40 A.M" so ".40" is not treated
    # as a footnote marker below.
    text = re.sub(r'\b(\d{1,2})\.(\d{2})\s*(A\.M|P\.M)\b', r'\1:\2 \3', text, flags=re.IGNORECASE)

    # 7. Remove numeric/footnote artifacts
    # Lines that start with digits, optional spaces and then a letter
    # ("12 Chapter ...", "3A Stuff ...") are blanked.
    text = re.sub(r'^\d+\s*[A-Za-z].*?(?=\n|$)', '', text, flags=re.MULTILINE)
    # "Page 12" alone on a line.
    text = re.sub(r'^(?:Page\s*\d+)\s*$', '', text, flags=re.MULTILINE)
    # Digits glued directly after a lowercase letter: "word23" -> "word".
    text = re.sub(r'(?<=[a-z])\d+', '', text)
    # Lines starting with 1-2 digits followed by whitespace and a word.
    text = re.sub(r'^\s*\d{1,2}\s+[A-Za-z]+\b.*$', '', text, flags=re.MULTILINE)
    # Trailing block of lines that each start with 1-2 digits (index-like tail).
    text = re.sub(r'(\n\s*\d{1,2}\s+.*)+\Z', '', text)
    # "(pg. 12)" style footnotes.
    text = re.sub(r'\(\s*pg\.?\s*\d+\s*\)', '', text, flags=re.IGNORECASE)
    # A dot followed by alphanumerics up to whitespace/end (".12", ".a", ".X12") -> ".".
    # NOTE(review): this also turns a trailing "A.M" / "P.M" into "A." / "P."
    # and drops a word glued to a full stop ("end.The" -> "end.") — verify.
    text = re.sub(r'(?<!\d)\.[0-9A-Za-z]+(?=\s|$)', '.', text)
    # ".*" -> "."
    text = re.sub(r'\.(\*)', '.', text)
    # ". 12 Something" -> ". Something"
    text = re.sub(r'\.\s*\d+\s+(?=[A-Z])', '. ', text)
    # Numbers right after a straight double quote.
    text = re.sub(r'"\s*(\d+)(?=\s|[A-Z])', '"', text)
    # 1-3 digits glued after a comma when followed by whitespace and a letter.
    text = re.sub(r'(?<!\d),(\d{1,3})(?=\s+[A-Za-z\u0600-\u06FF])', ',', text)
    # Numbers after "full stop + space" when followed by a capital/Urdu letter or end of text.
    text = re.sub(r'\.\s+\d+(?=\s+[A-Z\u0600-\u06FF]|$)', '. ', text)

    # 8. Standardise quotation marks (curly -> straight). Written as escapes
    #    so the character classes cannot degrade into a no-op.
    text = re.sub(r'[\u201C\u201D]', '"', text)
    text = re.sub(r'[\u2018\u2019]', "'", text)

    # 9. Final normalization
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'(?<=[.!?])\s{2,}(?=\S)', ' ', text)
    return text.strip()


def merge_soft_split_paragraphs_text(text):
    """
    Re-join paragraphs that were split in the middle of a sentence.

    Two neighbouring paragraphs (separated by a blank line) are merged with a
    single space when the first ends with a lowercase letter and the second
    starts with a lowercase letter. Whitespace inside every resulting
    paragraph is collapsed to single spaces.
    """
    chunks = [piece.strip() for piece in text.split("\n\n") if piece.strip()]

    def _flatten(paragraph):
        return re.sub(r"\s+", " ", paragraph).strip()

    merged_paras = []
    pending = None
    for chunk in chunks:
        if pending is None:
            pending = chunk
            continue

        last_char = pending[-1]
        continues_sentence = (
            last_char.islower()
            and last_char not in ".!?"
            and chunk[0].islower()
        )
        if continues_sentence:
            pending = pending + " " + chunk
        else:
            merged_paras.append(_flatten(pending))
            pending = chunk

    if pending is not None:
        merged_paras.append(_flatten(pending))

    return "\n\n".join(merged_paras)

def merge_paragraphs_split_by_hyphen(text):
    """
    Join paragraphs that are separated by a paragraph consisting only of "-".

    "A", "-", "B" becomes the single paragraph "A - B" (the hyphen is kept);
    a trailing lone "-" yields "A -". Whitespace inside every resulting
    paragraph is collapsed; other paragraphs are otherwise untouched.
    """
    chunks = [piece.strip() for piece in text.split("\n\n") if piece.strip()]
    total = len(chunks)
    result = []

    pos = 0
    while pos < total:
        current = chunks[pos]

        # Keep absorbing "<hyphen paragraph> + <following paragraph>" pairs.
        while pos + 1 < total and chunks[pos + 1].strip() == "-":
            follower = ""
            if pos + 2 < total:
                follower = chunks[pos + 2].strip()
            if follower:
                current = f"{current} - {follower}"
            else:
                current = f"{current} -"
            pos += 2  # skip the hyphen paragraph and the absorbed one

        result.append(re.sub(r"\s+", " ", current).strip())
        pos += 1

    return "\n\n".join(result)

def merge_broken_words(paragraphs, audit_writer=None, filename=None):
    """
    Merge paragraphs that were split in the middle of a hyphen-broken word.

    When a paragraph ends with "<letters>-", the next paragraph that is not
    junk (a bare number, "[n]", or a run of dashes) is appended to it: the
    fragment is joined with the first word of that paragraph without the
    hyphen, and the rest follows after a space. Junk paragraphs skipped on
    the way are dropped. The check repeats, so chains of broken words are
    merged too.

    Args:
        paragraphs: list of paragraph strings.
        audit_writer: optional object with ``writerow``; receives one
            "merge_broken_word_multi" row per merge, with the text before and
            after the merge (first 50 characters each).
        filename: value for the audit "file" column.

    Returns:
        A new list of paragraphs.
    """
    trailing_fragment_re = re.compile(r'([A-Za-z]+)-\s*$')
    # Page numbers, "[12]" markers and dash-only lines (hyphen, en dash, em dash).
    junk_re = re.compile(r'^(\[\d+\]|\d+|[-\u2013\u2014]+)$')

    merged = []
    i = 0
    while i < len(paragraphs):
        cur = paragraphs[i].rstrip()

        while True:
            m = trailing_fragment_re.search(cur)
            if not m:
                break

            # Look ahead for the next non-junk paragraph.
            j = i + 1
            next_para = ""
            while j < len(paragraphs):
                candidate = paragraphs[j].strip()
                if junk_re.match(candidate):
                    j += 1
                    continue
                next_para = candidate
                break

            if not next_para:
                break  # nothing to merge with

            before = cur
            words = next_para.split()
            full_word = m.group(1) + words[0]
            rest = " ".join(words[1:])
            # rstrip: avoid a dangling space when the next paragraph is a single word.
            cur = (cur[:m.start()] + full_word + " " + rest).rstrip()

            if audit_writer:
                audit_writer.writerow([
                    filename or "",
                    i,
                    "merge_broken_word_multi",
                    f"{m.group(1)}- + {next_para[:30]}...",
                    before[:50] + "...",
                    cur[:50] + "..."
                ])

            # Continue from the paragraph that was just absorbed.
            i = j

        merged.append(cur)
        i += 1
    return merged

def format_urdu_run(run):
    """
    Apply Urdu formatting to a single python-docx run (never the paragraph).

    Sets right-to-left, language ur-PK, the "Jameel Noori Nastaleeq" font for
    all four font slots, 14 pt size and non-bold. Safe to call repeatedly on
    the same run: existing elements are reused instead of being appended again.

    Args:
        run: a python-docx Run; modified in place.
    """
    urdu_font = "Jameel Noori Nastaleeq"
    rPr = run._element.get_or_add_rPr()

    # RTL — via the Font API so <w:rtl/> is inserted in schema order.
    run.font.rtl = True

    # Language: w:val as before, plus w:bidi, the complex-script language
    # attribute that applies to right-to-left text.
    lang = rPr.find(qn("w:lang"))
    if lang is None:
        lang = OxmlElement("w:lang")
        rPr.append(lang)
    lang.set(qn("w:val"), "ur-PK")
    lang.set(qn("w:bidi"), "ur-PK")

    # Font: font.name creates <w:rFonts> (ascii + hAnsi) in schema order;
    # the complex-script and East Asian slots are then set on that element.
    # The East Asian attribute is w:eastAsia ("w:fareast" is not a WordprocessingML attribute).
    run.font.name = urdu_font
    rFonts = rPr.find(qn("w:rFonts"))
    rFonts.set(qn("w:cs"), urdu_font)
    rFonts.set(qn("w:eastAsia"), urdu_font)

    # Style
    run.font.size = Pt(14)
    run.bold = False

def ocr_preclean(text):
    """Run the OCR-specific clean-up (enhanced_ocr_preclean) on *text* and return the result."""
    return enhanced_ocr_preclean(text)

def is_heading(text, max_len=60):
    """
    Heuristically decide whether *text* is a heading.

    Args:
        text: paragraph text.
        max_len: exclusive upper bound on the stripped length; defaults to
            the 60 characters this check has always used.

    Returns:
        True when the stripped text is shorter than *max_len*.
    """
    return len(text.strip()) < max_len

# -----------------------
# Full-document processing and I/O
# -----------------------
def process_docx_file(input_path, output_docx, audit_csv_path=None, audit=False, merge_paragraphs=False):
    """
    Clean a single .docx file and write:
      - cleaned .docx (output_docx)
      - audit CSV listing automatic corrections (audit_csv_path), only when audit=True

    Args:
        input_path: path of the source .docx.
        output_docx: path the cleaned document is saved to.
        audit_csv_path: path of the audit CSV; required when audit=True,
            ignored otherwise.
        audit: write the audit CSV when True.
        merge_paragraphs: accepted for compatibility; currently not used.

    Returns:
        {"cleaned_docx": str(output_docx)}

    Raises:
        FileNotFoundError: input_path does not exist.
        ValueError: audit=True but no audit_csv_path was given.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(input_path)
    if audit and not audit_csv_path:
        raise ValueError("audit=True requires audit_csv_path")

    # Read docx paragraphs, concatenating the runs inside each one.
    doc = docx.Document(str(input_path))
    paras = ["".join(run.text for run in p.runs) for p in doc.paragraphs]

    audit_rows_file = (
        open(audit_csv_path, "w", newline="", encoding="utf-8") if audit else None
    )
    try:
        # The helpers treat a falsy writer as "no audit".
        audit_writer = None
        if audit_rows_file is not None:
            audit_writer = csv.writer(audit_rows_file)
            audit_writer.writerow(AUDIT_HEADER)

        # --- Detect repetitive headers automatically ---
        detected_headers = detect_repetitive_headers(paras, min_repeats=3)
        if detected_headers:
            print("\nüìò Detected possible repetitive headers:")
            for h in detected_headers:
                print("   ‚Ä¢", h)
            # Record anchored patterns for the detected headers, without
            # piling up duplicates when several files share a header.
            for h in detected_headers:
                header_pattern = fr'^\s*{re.escape(h)}\s*$'
                if header_pattern not in HEADER_PATTERNS:
                    HEADER_PATTERNS.append(header_pattern)

        # Clean each paragraph. Headers were detected on the raw text, so
        # remember what each one looks like after cleaning; otherwise a
        # header altered by the clean-up would never match during removal.
        cleaned_paras = []
        cleaned_headers = set()
        for i, p in enumerate(paras):
            if not p or p.isspace():
                continue
            cleaned = ocr_preclean(p)
            if cleaned != p and audit_writer:
                audit_writer.writerow([input_path.name, i, "ocr_preclean", "", _safe_preview(p), _safe_preview(cleaned)])
            cleaned_paras.append(cleaned)
            if p.strip() in detected_headers and cleaned.strip():
                cleaned_headers.add(cleaned.strip())

        cleaned_paras = remove_known_headers(
            cleaned_paras, cleaned_headers,
            audit_writer=audit_writer, filename=input_path.name,
        )

        # Paragraph-level merges operate on blank-line separated text.
        cleaned_text = "\n\n".join(cleaned_paras)
        cleaned_text = merge_soft_split_paragraphs_text(cleaned_text)
        cleaned_text = merge_paragraphs_split_by_hyphen(cleaned_text)
        cleaned_paras = [p for p in cleaned_text.split("\n\n") if p.strip()]
        cleaned_paras = merge_broken_words(cleaned_paras, audit_writer=audit_writer, filename=input_path.name)

        # Build the new docx.
        new = docx.Document()
        for para_text in cleaned_paras:
            p = new.add_paragraph()

            # One run per word / whitespace chunk (punctuation stays attached).
            for tok in re.split(r"(\s+)", para_text):
                if tok:
                    p.add_run(tok)

            p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

            # Highlight first: it may rebuild the paragraph's runs, which
            # would discard any formatting applied before it.
            highlight_misspellings_in_paragraph(p)

            current_is_heading = is_heading(para_text)
            for run in p.runs:
                if is_urdu_word(run.text):
                    format_urdu_run(run)
                if current_is_heading:
                    run.bold = True
                    run.font.size = Pt(14)

        for sec in new.sections:
            sec.top_margin = Inches(1)
            sec.bottom_margin = Inches(1)
            sec.left_margin = Inches(1.25)
            sec.right_margin = Inches(1.25)
        new.save(output_docx)
        print("‚úÖ Cleaned DOCX:", output_docx)
    finally:
        # Close the audit file even when cleaning or saving fails.
        if audit_rows_file is not None:
            audit_rows_file.close()

    if audit_rows_file is not None:
        print("‚úÖ Audit log:", audit_csv_path)

    return {
        "cleaned_docx": str(output_docx)
    }


# -----------------------
# Batch helper
# -----------------------
def process_folder(root_dir, overwrite=False, audit=False):
    """
    Clean every .docx found directly inside each sub-directory of *root_dir*.

    Word lock files ("~$...") and files whose stem already ends in ".cleaned"
    are ignored. Output goes next to the source as "<stem>.cleaned.docx"
    (plus "<stem>.audit.csv" when *audit* is True). Existing outputs are
    skipped unless *overwrite* is True. A failure on one file is printed and
    does not stop the batch.

    Returns the list of result dicts from process_docx_file.
    Raises FileNotFoundError when *root_dir* does not exist.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(root)

    results = []

    def _is_source_docx(candidate):
        if candidate.name.startswith("~$"):
            return False
        return not candidate.stem.endswith(".cleaned")

    for subdir in root.iterdir():
        if not subdir.is_dir():
            continue

        for docx_file in [f for f in subdir.glob("*.docx") if _is_source_docx(f)]:
            output_docx = subdir / f"{docx_file.stem}.cleaned.docx"
            audit_csv_path = str(subdir / f"{docx_file.stem}.audit.csv") if audit else None

            if output_docx.exists() and not overwrite:
                print("Skipping (exists):", output_docx)
                continue

            print(f"üìÇ Processing: {docx_file.name}")

            try:
                outcome = process_docx_file(
                    input_path=str(docx_file),
                    output_docx=str(output_docx),
                    audit_csv_path=audit_csv_path,
                    audit=audit
                )
            except Exception as e:
                # Batch boundary: report and move on to the next file.
                print(f"‚ùå Failed: {docx_file} ‚Üí {e}")
            else:
                results.append(outcome)

    return results

# Example usage

# Clean a single document. Auditing is off, so audit_csv_path is not used.
# The path is a raw string: its backslashes are Windows separators, and in a
# normal string literal "\I", "\A" etc. are invalid escape sequences.
process_docx_file(r"D:\IPS assignments\Assignment 5\ŸÇÿ±ÿ¢ŸÜ ÿ≠⁄©€åŸÖ ÿßÿ±ÿ™ŸÇÿß€å ÿπŸÑ€å ÿ®ŸÜÿØ⁄Ø€å.docx",
                 "cleaned2.docx",
                 audit_csv_path="none",
                 audit=False, merge_paragraphs=False)

# process_folder("D:\\IPS assignments\\Assignment 9", audit=False)


# Notebook output from the cell above:
#   process_docx_file("D:\IPS assignments\Assignment 5\قرآن حکیم ارتقای علی بندگی.docx",
#
# ✅ Cleaned DOCX: cleaned2.docx
#
# {'cleaned_docx': 'cleaned2.docx'}

"D:\IPS assignments\Automation_program\Project Folder"

# In [1]:
import yake
import re

def extract_keywords_yake(text, top_n=25):
    """
    Extract up to *top_n* keyword phrases from *text* with YAKE.

    Text that is empty or shorter than 100 characters (stripped) yields [].
    Everything except ASCII letters and whitespace is blanked out before
    extraction. Candidates (twice *top_n* are requested) are filtered for
    length, digits and case-insensitive duplicates, keeping YAKE's order and
    the original casing.
    """
    if not text or len(text.strip()) < 100:
        return []

    letters_only = re.sub(r"[^A-Za-z\s]", " ", text)
    letters_only = re.sub(r"\s+", " ", letters_only).strip()

    extractor = yake.KeywordExtractor(
        lan="en",
        n=3,
        dedupLim=0.9,
        top=top_n * 2
    )

    picked = []
    seen = set()
    for phrase, _score in extractor.extract_keywords(letters_only):
        key = phrase.lower().strip()

        too_short = len(key) < 3
        has_digit = any(ch.isdigit() for ch in key)
        if too_short or has_digit or key in seen:
            continue

        seen.add(key)
        picked.append(phrase)
        if len(picked) == top_n:
            break

    return picked


import pandas as pd
from pathlib import Path
import docx

def collect_keywords_matrix_yake(root_dir, excel_path, top_n=15):
    """
    Build a document-by-keyword matrix with YAKE and save it as an Excel file.

    Every "*.cleaned.docx" directly inside each sub-directory of *root_dir*
    is read, its keywords are extracted with extract_keywords_yake, and each
    document becomes one column (padded with "" to equal length). An empty
    "keywords" column comes first.

    Word lock files ("~$...") are ignored and files that are not valid docx
    archives are skipped with a message, so one bad file does not abort the run.

    Args:
        root_dir: directory whose sub-directories hold the cleaned documents.
        excel_path: destination .xlsx path.
        top_n: maximum number of keywords per document.

    Returns:
        None. Nothing is written when no keywords were extracted.
    """
    # Local import: the block needs it and it is not imported in this cell.
    from zipfile import BadZipFile

    root = Path(root_dir)

    keyword_map = {}  # document name -> [keywords]

    for subdir in root.iterdir():
        if not subdir.is_dir():
            continue

        for docx_file in subdir.glob("*.cleaned.docx"):
            # Lock files Word leaves next to open documents are not real docx files.
            if docx_file.name.startswith("~$"):
                continue

            try:
                doc = docx.Document(docx_file)
            except BadZipFile:
                print(f"Skipping invalid DOCX: {docx_file}")
                continue

            text = "\n".join(
                p.text for p in doc.paragraphs if p.text.strip()
            )

            keywords = extract_keywords_yake(text, top_n=top_n)
            doc_name = docx_file.stem.replace(".cleaned", "")
            keyword_map[doc_name] = keywords

    if not keyword_map:
        print("‚ö†Ô∏è No keywords extracted.")
        return

    # Pad every column to the longest keyword list.
    max_len = max(len(v) for v in keyword_map.values())

    data = {"keywords": [""] * max_len}
    for doc_name, kws in keyword_map.items():
        data[doc_name] = kws + [""] * (max_len - len(kws))

    df = pd.DataFrame(data)
    df.to_excel(excel_path, index=False)

    print("‚úÖ Keyword matrix saved to:", excel_path)



# Run the YAKE collector over the Assignment 8 folder, keeping up to 20 keywords per document.
collect_keywords_matrix_yake(
    root_dir="D:/IPS assignments/Assignment 8",
    excel_path="D:/IPS assignments/Assignment 8/keywords_yake.xlsx", top_n=20
)



# Output: ✅ Keyword matrix saved to: D:/IPS assignments/Assignment 8/keywords_yake.xlsx


# In [None]:
from keybert import KeyBERT
import re
from pathlib import Path
import docx
import pandas as pd
from zipfile import BadZipFile
from collections import Counter
import hashlib
import spacy

# --------------------------------------------------
# 1. Load models ONCE
# --------------------------------------------------
# Module-level singletons: the KeyBERT model used by extract_keywords_keybert
# and the spaCy pipeline used by extract_noun_phrases.
kw_model = KeyBERT("sentence-transformers/all-MiniLM-L6-v2")
nlp = spacy.load("en_core_web_sm")

# --------------------------------------------------
# 2. Normalization (STRICT)
# --------------------------------------------------
def normalize_text(text: str) -> str:
    """Keep only ASCII letters, collapse whitespace to single spaces, strip and lower-case."""
    letters = re.sub(r"[^A-Za-z\s]", " ", text)
    collapsed = re.sub(r"\s+", " ", letters)
    return collapsed.strip().lower()

# --------------------------------------------------
# 3. Structural DOCX reader (weighted)
# --------------------------------------------------
def read_docx_weighted(path: Path) -> str:
    """
    Read a .docx and return its normalized text with headings over-weighted.

    Non-empty paragraphs are repeated three times when their style name
    starts with "heading" (case-insensitive), twice when their text is all
    uppercase, and once otherwise. The pieces are joined with spaces and
    passed through normalize_text.
    """
    weighted = []

    for para in docx.Document(path).paragraphs:
        content = para.text.strip()
        if not content:
            continue

        style_name = para.style.name.lower()
        if style_name.startswith("heading"):
            repeat = 3
        elif content.isupper():
            repeat = 2
        else:
            repeat = 1
        weighted.extend([content] * repeat)

    return normalize_text(" ".join(weighted))

# --------------------------------------------------
# 4. Fingerprint (duplicate detection)
# --------------------------------------------------
def fingerprint(text: str) -> str:
    """Return the hexadecimal MD5 digest of *text* (default str encoding)."""
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()

# --------------------------------------------------
# 5. Adaptive keyword count
# --------------------------------------------------
def adaptive_top_n(word_count: int) -> int:
    """Return how many keywords to extract: 8 below 500 words, 12 below 1500, otherwise 18."""
    tiers = ((500, 8), (1500, 12))
    for upper_bound, keyword_count in tiers:
        if word_count < upper_bound:
            return keyword_count
    return 18

# --------------------------------------------------
# 6. Phrase control rules (CRITICAL)
# --------------------------------------------------
# Noun phrases whose last token is one of these are rejected by extract_noun_phrases.
BAD_ENDINGS = {
    "current", "selective", "general", "various",
    "major", "minor", "differences", "notwithstanding"
}

# Phrases exactly equal to one of these are rejected by extract_noun_phrases.
BANNED_SINGLE_WORDS = {
    "year", "focus", "change", "error", "demands",
    "comparability", "indicators", "reduction",
    "growth", "finance", "agriculture", "institutions",
    "pakistan", "prosperity"
}

# The only single-word phrases that extract_noun_phrases and
# extract_keywords_keybert let through.
WHITELIST_SINGLE_WORDS = {
    "poverty", "inflation", "democracy"
}

# --------------------------------------------------
# 7. Extract noun-phrase candidates ONLY
# --------------------------------------------------
def extract_noun_phrases(text: str):
    """
    Return the distinct lower-cased noun chunks of *text* that pass the phrase rules.

    A chunk is dropped when it is shorter than four characters, contains a
    digit, is a single word outside WHITELIST_SINGLE_WORDS, equals an entry
    of BANNED_SINGLE_WORDS, or ends with a token from BAD_ENDINGS.
    The result is a list built from a set, so its order is unspecified.
    """
    collected = set()

    for chunk in nlp(text).noun_chunks:
        candidate = chunk.text.lower().strip()
        words = candidate.split()

        rejected = (
            len(candidate) < 4
            or any(ch.isdigit() for ch in candidate)
            # Single words are allowed only when whitelisted.
            or (len(words) == 1 and candidate not in WHITELIST_SINGLE_WORDS)
            or candidate in BANNED_SINGLE_WORDS
            or words[-1] in BAD_ENDINGS
        )
        if not rejected:
            collected.add(candidate)

    return list(collected)

# --------------------------------------------------
# 8. Keyword extraction (STRICT KeyBERT)
# --------------------------------------------------
def extract_keywords_keybert(text: str):
    """
    Rank noun-phrase candidates of *text* with KeyBERT (MMR, diversity 0.45).

    Returns [] when the text has fewer than 200 words or fewer than 10
    candidate phrases. The number of keywords requested comes from
    adaptive_top_n. Single-word results are kept only when they are in
    WHITELIST_SINGLE_WORDS.
    """
    words = text.split()
    if len(words) < 200:
        return []

    candidates = extract_noun_phrases(text)
    if len(candidates) < 10:
        return []

    top_n = adaptive_top_n(len(words))

    ranked = kw_model.extract_keywords(
        text,
        candidates=candidates,
        use_mmr=True,
        diversity=0.45,
        top_n=top_n
    )

    # Final enforcement: prefer multi-word concepts.
    stripped = (phrase.strip() for phrase, _score in ranked)
    accepted = [
        phrase for phrase in stripped
        if len(phrase.split()) >= 2 or phrase in WHITELIST_SINGLE_WORDS
    ]
    return accepted[:top_n]

# --------------------------------------------------
# 9. Cross-document IDF reweighting
# --------------------------------------------------
def apply_corpus_idf(keyword_map: dict):
    """
    Re-rank each document's keywords so corpus-rare ones come first.

    The score of a keyword is ``number of documents / number of documents
    containing it``. Each keyword list is sorted by descending score (ties
    keep their original order).

    Returns:
        (reranked, idf_table): the re-ordered mapping and a DataFrame with
        columns keyword, doc_frequency, idf_score sorted by idf_score descending.
    """
    total_docs = len(keyword_map)

    # Document frequency: each keyword counts once per document.
    df_counter = Counter(
        kw for kws in keyword_map.values() for kw in set(kws)
    )

    idf = {}
    for kw, doc_freq in df_counter.items():
        idf[kw] = total_docs / doc_freq

    def _descending_idf(kw):
        return -idf.get(kw, 0)

    reranked = {}
    for doc, kws in keyword_map.items():
        reranked[doc] = sorted(kws, key=_descending_idf)

    rows = [(kw, df_counter[kw], idf[kw]) for kw in df_counter]
    idf_table = pd.DataFrame(
        rows,
        columns=["keyword", "doc_frequency", "idf_score"]
    ).sort_values("idf_score", ascending=False)

    return reranked, idf_table

# --------------------------------------------------
# 10. Corpus processor
# --------------------------------------------------
def collect_keywords_matrix_keybert(root_dir: str, excel_path: str):
    """
    Extract KeyBERT keywords for every "*.cleaned.docx" under *root_dir*
    (recursively) and save them to an Excel workbook.

    Files that raise BadZipFile and documents whose normalized text was
    already seen (same fingerprint) are skipped, as are documents that yield
    no keywords. Keywords are re-ranked with apply_corpus_idf. The workbook
    gets a "Keywords" sheet (one column per document, index named "Rank")
    and a "Corpus_IDF" sheet. Nothing is written when no keywords were found.
    """
    keyword_map = {}
    seen_fingerprints = set()

    for docx_file in Path(root_dir).rglob("*.cleaned.docx"):
        print(f"üîÑ Processing: {docx_file.name}")

        try:
            text = read_docx_weighted(docx_file)
        except BadZipFile:
            print(f"‚ö†Ô∏è Skipping invalid DOCX: {docx_file}")
            continue

        fp = fingerprint(text)
        if fp in seen_fingerprints:
            print(f"‚ö†Ô∏è Duplicate skipped: {docx_file.name}")
            continue
        seen_fingerprints.add(fp)

        keywords = extract_keywords_keybert(text)
        if keywords:
            keyword_map[docx_file.stem.replace(".cleaned", "")] = keywords

    if not keyword_map:
        print("‚ö†Ô∏è No keywords extracted.")
        return

    keyword_map, idf_table = apply_corpus_idf(keyword_map)

    # Pad every column to the longest keyword list.
    longest = max(len(kws) for kws in keyword_map.values())
    columns = {}
    for doc, kws in keyword_map.items():
        columns[doc] = kws + [""] * (longest - len(kws))

    df_keywords = pd.DataFrame(columns)
    df_keywords.index.name = "Rank"

    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df_keywords.to_excel(writer, sheet_name="Keywords", index=True)
        idf_table.to_excel(writer, sheet_name="Corpus_IDF", index=False)

    print("‚úÖ FINAL strict keyword matrix saved to:", excel_path)

# -----------------------------------
# 10. Run
# -----------------------------------
# Run the KeyBERT collector over one document folder and write the workbook next to Assignment 8.
collect_keywords_matrix_keybert(
    root_dir="D:\\IPS assignments\\Assignment 8\\5-106 Post Budget Economy- Issues and Challenges",
    excel_path="D:/IPS assignments/Assignment 8/keywords_KeyBERT_enhanced.xlsx"
)

# Notebook output (recorded from an earlier run of this cell):
# 🔄 post-budget economy- issues and challenges.cleaned.docx
# ✅ Keyword extraction complete: D:/IPS assignments/Assignment 8/keywords_KeyBERT_fenhanced.xlsx


# In [1]:
from pathlib import Path
import shutil

def collect_and_rename_cleaned_files(
    source_root: str,
    output_dir: str
):
    """
    Copy every file under *source_root* whose stem contains ".cleaned" into
    *output_dir*, dropping ".cleaned" from the file name.

    *output_dir* is created if needed. When the target name already exists,
    "_1", "_2", ... is appended to the stem until a free name is found.
    Files are copied with shutil.copy2.
    """
    source_root = Path(source_root)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    cleaned_files = (
        candidate for candidate in source_root.rglob("*")
        if candidate.is_file() and ".cleaned" in candidate.stem
    )

    for file_path in cleaned_files:
        target_path = output_dir / file_path.name.replace(".cleaned", "")

        # Name collision: try numbered variants until one is free.
        attempt = 1
        while target_path.exists():
            target_path = output_dir / f"{file_path.stem.replace('.cleaned','')}_{attempt}{file_path.suffix}"
            attempt += 1

        shutil.copy2(file_path, target_path)

    print("‚úÖ All .cleaned files collected and renamed successfully.")


# =========================
# USAGE
# =========================
# Gather all cleaned files from Assignment 8 into a single flat folder.
collect_and_rename_cleaned_files(
    source_root=r"D:\IPS assignments\Assignment 8",
    output_dir=r"D:\IPS assignments\cleaned_files"
    )


# Output: ✅ All .cleaned files collected and renamed successfully.
