In [4]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTO-DETECT WORKING DIRECTORY
# ==========================
# Anchor every path on the script's own folder when this runs as a file.
# In an interactive session __file__ is undefined, so fall back to the
# current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Input questionnaires, generated workbooks and the question lookup all
# live directly under BASE_DIR.
INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")

# Create the output folder up front; an already existing folder is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
# lookup.json is read as {theme: {master_question: [variant, ...]}} -- the
# nesting below relies on exactly that shape.
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# Theme headings are the top-level keys of the mapping.
THEMES = list(QUESTION_MAPPING)

# Flat list of every known question wording: each master question is
# followed immediately by its variants, theme by theme.
ALL_QUESTIONS = []
for q_map in QUESTION_MAPPING.values():
    for q_main, q_variants in q_map.items():
        ALL_QUESTIONS += [q_main, *q_variants]

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Normalize raw document text for matching and output.

    Check marks (✔ / ✓) are replaced by the word "Selected", bullet glyphs
    (•, ■, ▪) are removed, and every run of whitespace is collapsed to a
    single space.

    Args:
        text: Raw paragraph or cell text. Any falsy value (None, "") yields "".

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if not text:
        return ""
    text = str(text)
    text = text.replace("✔", " Selected ").replace("✓", " Selected ")
    text = re.sub(r'[•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Strip last: the check-mark padding and the removal of a leading bullet
    # both leave a space at the edges that an earlier strip would miss.
    return text.strip()

# ==========================
# 4️⃣ QUESTION/THEME DETECTOR
# ==========================
def is_question_or_theme(line):
    """Tell whether a line looks like a question or a theme heading.

    A line qualifies when it fuzzy-matches (partial ratio >= 85, ignoring
    case) any entry of ALL_QUESTIONS or THEMES, or when it starts with a
    number followed by "." or ")". Blank lines never qualify.

    NOTE(review): partial matching may rate very short lines (e.g. "No")
    highly against longer questions -- confirm whether a minimum length
    guard is wanted.
    """
    if not line.strip():
        return False

    lowered = line.lower()

    # Known question wordings are checked first, then theme headings.
    for known_phrases in (ALL_QUESTIONS, THEMES):
        for phrase in known_phrases:
            if fuzz.partial_ratio(lowered, phrase.lower()) >= 85:
                return True

    # Fallback: numbered items such as "3." or "12)".
    return re.match(r'^\d+[\.\)]', line) is not None

# ==========================
# 5️⃣ PROCESS DOCX FILE CONTENT (Enhanced for Table Responses)
# ==========================
def _table_to_text(table):
    """Flatten one table into a single string, one sentence per non-empty row."""
    sentences = []
    for row in table.rows:
        cells = []
        for cell in row.cells:
            cell_text = clean_text(cell.text)
            if cell_text:
                cells.append(cell_text)
        if not cells:
            continue
        if len(cells) == 2:
            # Two filled cells are read as a label/value pair.
            sentences.append(f"{cells[0]} has {cells[1]}.")
        else:
            sentences.append(" ".join(cells))
    return " ".join(sentences)


def process_docx(filepath):
    """
    Extract cleaned text entries from a DOCX file in document order.

    Paragraphs and tables are read in the order they appear in the document
    body, so a table that follows a question lands directly after that
    question in the returned list and is read as (part of) its response.

    Args:
        filepath: Path to the .docx file to read.

    Returns:
        A list of strings: one entry per non-empty paragraph and one entry
        per non-empty table (its rows flattened into sentences).
    """
    # Local imports keep this function self-contained; these are the
    # python-docx classes needed to wrap raw body elements.
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = Document(filepath)
    text_data = []

    # doc.paragraphs and doc.tables are separate lists that lose the relative
    # position of tables, so reading them one after the other would tie every
    # table to the last question in the file. Walking the body children keeps
    # each table where it really sits.
    for child in doc.element.body.iterchildren():
        if isinstance(child, CT_P):
            txt = clean_text(Paragraph(child, doc).text)
            if txt:
                text_data.append(txt)
        elif isinstance(child, CT_Tbl):
            # Keep the table as its own entry rather than gluing it onto the
            # question line: text on the question line itself is not part of
            # what follows the question.
            table_text = _table_to_text(Table(child, doc))
            if table_text:
                text_data.append(table_text)

    return text_data

# ==========================
# 6️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """Locate the line that best matches a question and return its answer.

    Every variant is compared (case-insensitively, partial ratio) against
    every line of all_text; the first line reaching the highest score wins.
    The answer is the text of the lines after that line, up to the next
    blank line or the next question/theme heading.

    Args:
        all_text: List of text lines extracted from a document.
        variants: Wordings of the question to look for.
        threshold: Minimum score the best match must reach.

    Returns:
        The joined answer text, or "No response provided" when no line
        scores at least threshold or nothing follows the matched line.
    """
    lowered_lines = [entry.lower() for entry in all_text]

    top_score, top_pos = 0, -1
    for phrasing in variants:
        needle = phrasing.lower()
        for pos, haystack in enumerate(lowered_lines):
            similarity = fuzz.partial_ratio(needle, haystack)
            # Strictly greater: on ties the earliest match is kept.
            if similarity > top_score:
                top_score, top_pos = similarity, pos

    if top_pos == -1 or top_score < threshold:
        return "No response provided"

    # Gather everything after the matched line until the answer ends.
    collected = []
    for candidate in all_text[top_pos + 1:]:
        candidate = candidate.strip()
        if not candidate or is_question_or_theme(candidate):
            break
        collected.append(candidate)

    answer = " ".join(collected).strip()
    return answer or "No response provided"

# ==========================
# 7️⃣ MAIN PROCESSING LOOP
# ==========================
# One Excel workbook is written per questionnaire found in INPUT_DIR, with
# one row per master question in QUESTION_MAPPING.
for file in os.listdir(INPUT_DIR):
    # Skip anything that is not a DOCX (extension compared case-insensitively)
    # and the "~$..." lock files Word leaves next to documents it has open:
    # those are not real DOCX packages and cannot be opened as documents.
    if file.startswith("~$") or not file.lower().endswith(".docx"):
        continue

    # County is the part of the file name before the first underscore; the
    # extension is dropped first so a name without underscores stays clean.
    county = os.path.splitext(file)[0].split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    text_data = process_docx(filepath)

    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            # NOTE(review): only the variants are searched for, not the
            # master question text itself -- confirm this is intended.
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed with full table and next-question handling.")


✅ Processed: Bomet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
🎯 Enhanced extraction completed with full table and next-question handling.
