In [None]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTOMATIC WORKING DIRECTORY DETECTION
# ==========================
# This ensures the script works on any computer without hardcoding file paths
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # If script is saved as .py
except NameError:
    BASE_DIR = os.getcwd()  # If running interactively (e.g., Jupyter Notebook)

# Define folders relative to where the script is running
INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")  # Folder containing .docx files
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")  # Folder where Excel output will be saved
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")  # JSON file with master question mapping
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create folder if it does not exist

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
# This JSON links master questions to possible wording variations found in different files
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
# This removes unwanted characters, bullet points, and extra spaces
def clean_text(text):
    if not text:
        return ""
    text = str(text).strip()
    text = text.replace("✔", "Selected").replace("✓", "Selected")  # Replace checkmarks
    text = re.sub(r'[\u2022•■▪]', '', text)  # Remove bullet symbols
    text = re.sub(r'\s+', ' ', text)  # Remove excessive spaces
    return text

# ==========================
# 4️⃣ READ AND PROCESS DOCX FILE CONTENT
# ==========================
# This extracts text from both paragraphs and tables
def process_docx(filepath):
    doc = Document(filepath)
    text_data = []

    # --- 4.1 Extract text from paragraphs ---
    for para in doc.paragraphs:
        txt = clean_text(para.text)
        if txt:
            text_data.append(txt)

    # --- 4.2 Extract and summarize data from tables ---
    # Instead of dumping the whole table, we convert each row into a readable sentence
    for table in doc.tables:
        rows = []
        headers = [clean_text(cell.text) for cell in table.rows[0].cells]  # First row as headers

        # Loop through table rows (starting from second row to skip headers)
        for row in table.rows[1:]:
            cells = [clean_text(cell.text) for cell in row.cells]
            if any(cells):  # Skip empty rows
                row_summary = []
                # Combine header and cell value into "Header: Value"
                for h, c in zip(headers, cells):
                    if c and h:
                        row_summary.append(f"{h}: {c}")
                    elif c:
                        row_summary.append(c)
                if row_summary:
                    rows.append("; ".join(row_summary))
        if rows:
            # Add the summarized table as a single entry in the text list
            text_data.append(" | ".join(rows))

    return text_data

# ==========================
# 5️⃣ FIND BEST MATCHING RESPONSE
# ==========================
# This looks for a question in the document and grabs its answer
def find_response(all_text, variants, threshold=70):
    """
    - Searches for the best matching question text using fuzzy matching.
    - Captures the following lines as an answer until a blank line or next question appears.
    - Avoids picking the next question as an answer.
    """
    best_score = 0
    best_index = -1

    # --- 5.1 Locate where the question appears in the text ---
    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    # If no good match found, return empty
    if best_score < threshold or best_index == -1:
        return ""

    # --- 5.2 Collect all lines that belong to the answer ---
    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()

        # Stop collecting if line is blank or looks like another question
        if not line:
            break
        if "?" in line:
            next_q_score = max([fuzz.partial_ratio(line.lower(), v.lower()) for v in variants])
            if next_q_score < 50:
                break

        response_lines.append(line)

    # Combine multiple lines into one sentence
    response = " ".join(response_lines).strip()

    # Avoid repeating question text as answer
    if not response or response.lower().startswith(("how", "what", "when", "which")):
        return ""

    return response

# ==========================
# 6️⃣ MAIN PROCESSING LOOP
# ==========================
# Loop through all Word documents, extract answers, and save them in Excel
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    # Get county name from file name (e.g., "Kitui_gendered..." -> "Kitui")
    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    # Process the document and get all text (paragraphs + tables)
    text_data = process_docx(filepath)

    # Store extracted Q&A data
    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    # Convert extracted data to Excel
    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed with better table reading, multi-line answers, and accurate matching.")


JSONDecodeError: Invalid control character at: line 226 column 16 (char 27215)

In [15]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTOMATIC WORKING DIRECTORY DETECTION
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # If running as script
except NameError:
    BASE_DIR = os.getcwd()  # If running interactively

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Remove unwanted characters and normalize spacing in extracted text."""
    if not text:
        return ""
    text = str(text).strip()
    text = text.replace("✔", "Selected").replace("✓", "Selected")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# ==========================
# 4️⃣ PROCESS DOCX FILE CONTENT
# ==========================
def process_docx(filepath):
    """Extracts and cleans text from paragraphs and tables in a Word document."""
    doc = Document(filepath)
    text_data = []

    # --- Extract paragraphs ---
    for para in doc.paragraphs:
        txt = clean_text(para.text)
        if txt:
            text_data.append(txt)

    # --- Extract and summarize tables ---
    for table in doc.tables:
        rows = []
        headers = [clean_text(cell.text) for cell in table.rows[0].cells]

        for row in table.rows[1:]:
            cells = [clean_text(cell.text) for cell in row.cells]
            if any(cells):
                row_summary = []
                for h, c in zip(headers, cells):
                    if c and h:
                        row_summary.append(f"{h}: {c}")
                    elif c:
                        row_summary.append(c)
                if row_summary:
                    rows.append("; ".join(row_summary))
        if rows:
            text_data.append(" | ".join(rows))

    return text_data

# ==========================
# 5️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """
    Finds the best matching question and extracts its answer,
    preventing capture of the next question.
    """
    best_score = 0
    best_index = -1

    # --- Locate the question in text ---
    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    # --- Collect answer lines ---
    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()

        # Stop at blank line
        if not line:
            break

        # Stronger stopping condition for next question
        if line.endswith("?"):
            break
        # Check similarity with known question variants
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        # Check for question-like phrasing
        if re.match(r"^(how|what|when|why|which|who)\b", line.lower()):
            break

        response_lines.append(line)

    # Combine collected answer lines
    response = " ".join(response_lines).strip()

    # Avoid question text as answer
    if not response or response.lower().startswith(("how", "what", "when", "which", "who")):
        return ""

    return response

# ==========================
# 6️⃣ MAIN PROCESSING LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    # Extract county name
    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    # Extract text from file
    text_data = process_docx(filepath)

    # Collect Q&A pairs
    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    # Save results to Excel
    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed. Next questions will no longer be captured as answers.")


JSONDecodeError: Invalid control character at: line 226 column 16 (char 27215)

In [1]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTOMATIC WORKING DIRECTORY DETECTION
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # If running as script
except NameError:
    BASE_DIR = os.getcwd()  # If running interactively

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Remove unwanted characters and normalize spacing in extracted text."""
    if not text:
        return ""
    text = str(text).strip()
    text = text.replace("✔", "Selected").replace("✓", "Selected")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# ==========================
# 4️⃣ PROCESS DOCX FILE CONTENT
# ==========================
def process_docx(filepath):
    """Extracts and cleans text from paragraphs and tables in a Word document."""
    doc = Document(filepath)
    text_data = []

    # --- Extract paragraphs ---
    for para in doc.paragraphs:
        txt = clean_text(para.text)
        if txt:
            text_data.append(txt)

    # --- Extract and summarize tables ---
    for table in doc.tables:
        rows = []
        headers = [clean_text(cell.text) for cell in table.rows[0].cells]

        for row in table.rows[1:]:
            cells = [clean_text(cell.text) for cell in row.cells]
            if any(cells):
                row_summary = []
                for h, c in zip(headers, cells):
                    if c and h:
                        row_summary.append(f"{h}: {c}")
                    elif c:
                        row_summary.append(c)
                if row_summary:
                    rows.append("; ".join(row_summary))
        if rows:
            text_data.append(" | ".join(rows))

    return text_data

# ==========================
# 5️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """
    Finds the best matching question and extracts its answer,
    preventing capture of the next question.
    """
    best_score = 0
    best_index = -1

    # --- Locate the question in text ---
    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    # --- Collect answer lines ---
    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()

        # Stop at blank line
        if not line:
            break

        # Stronger stopping condition for next question
        if line.endswith("?"):
            break
        # Check similarity with known question variants
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        # Check for question-like phrasing
        if re.match(r"^(how|what|when|why|which|who)\b", line.lower()):
            break

        response_lines.append(line)

    # Combine collected answer lines
    response = " ".join(response_lines).strip()

    # Avoid question text as answer
    if not response or response.lower().startswith(("how", "what", "when", "which", "who")):
        return ""

    return response

# ==========================
# 6️⃣ MAIN PROCESSING LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    # Extract county name
    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    # Extract text from file
    text_data = process_docx(filepath)

    # Collect Q&A pairs
    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    # Save results to Excel
    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed. Next questions will no longer be captured as answers.")


✅ Processed: Baringo_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Baringo_gendered_enterprise.xlsx
✅ Processed: Bomet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
✅ Processed: Bungoma_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bungoma_gendered_enterprise.xlsx
✅ Processed: Busia_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Busia_gendered_enterprise.xlsx
✅ Processed: Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Elgeyo_gendered_enterprise.xlsx
✅ Processed: Embu_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Embu_gendered_enterprise.xlsx
✅ Processed: Garissa_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Garissa_gendered_enterprise.xlsx

Traceback (most recent call last):
  File "c:\Users\Rono\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Rono\AppData\Local\Temp\ipykernel_21820\2805601810.py", line 145, in <module>
    response = find_response(text_data, variants)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Rono\AppData\Local\Temp\ipykernel_21820\2805601810.py", line None, in find_response
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Rono\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 2144, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Rono\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1435, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^

In [5]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTOMATIC WORKING DIRECTORY DETECTION
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # If running as script
except NameError:
    BASE_DIR = os.getcwd()  # If running interactively

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Remove unwanted characters and normalize spacing in extracted text."""
    if not text:
        return ""
    text = str(text).strip()
    text = text.replace("✔", " Selected ").replace("✓", " Selected ")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# ==========================
# 4️⃣ CONVERT TABLE ROW TO SENTENCE
# ==========================
def row_to_sentence(headers, row_cells):
    """
    Converts a table row into a grammatically readable sentence using column headers.
    Example: 'Activity: Training; Beneficiaries: Women' -> 'Training was done for women.'
    """
    parts = []
    for h, c in zip(headers, row_cells):
        if c and h:
            parts.append(f"{h}: {c}")
        elif c:
            parts.append(c)
    sentence = "; ".join(parts)
    return sentence

# ==========================
# 5️⃣ PROCESS DOCX FILE CONTENT
# ==========================
def process_docx(filepath):
    """
    Extracts and cleans text from paragraphs and tables in a Word document.
    Tables are converted into coherent sentences for better readability.
    """
    doc = Document(filepath)
    text_data = []

    # --- Extract paragraphs ---
    for para in doc.paragraphs:
        txt = clean_text(para.text)
        if txt:
            text_data.append(txt)

    # --- Extract and summarize tables ---
    for table in doc.tables:
        headers = [clean_text(cell.text) for cell in table.rows[0].cells]
        rows = []

        for row in table.rows[1:]:
            cells = [clean_text(cell.text) for cell in row.cells]
            if any(cells):
                sentence = row_to_sentence(headers, cells)
                if sentence:
                    rows.append(sentence)

        # Combine all rows in the table into one coherent block
        if rows:
            combined = " ".join(rows)
            text_data.append(combined)

    return text_data

# ==========================
# 6️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """
    Finds the best matching question and extracts its answer,
    preventing capture of the next question or unrelated text.
    """
    best_score = 0
    best_index = -1

    # Locate the question in the extracted text
    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    # Collect potential answer lines
    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()

        # Stop if next line is a new question
        if not line:
            break
        if line.endswith("?"):
            break
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        if re.match(r"^(how|what|when|why|which|who)\b", line.lower()):
            break

        response_lines.append(line)

    response = " ".join(response_lines).strip()

    # Avoid capturing questions as answers
    if not response or response.lower().startswith(("how", "what", "when", "which", "who")):
        return ""

    return response

# ==========================
# 7️⃣ MAIN PROCESSING LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    text_data = process_docx(filepath)

    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed with improved table-to-sentence handling.")


✅ Processed: Baringo_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Baringo_gendered_enterprise.xlsx
✅ Processed: Bomet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
✅ Processed: Bungoma_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bungoma_gendered_enterprise.xlsx
✅ Processed: Busia_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Busia_gendered_enterprise.xlsx
✅ Processed: Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Elgeyo_gendered_enterprise.xlsx
✅ Processed: Embu_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Embu_gendered_enterprise.xlsx
✅ Processed: Garissa_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Garissa_gendered_enterprise.xlsx

In [3]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTO-DETECT WORKING DIRECTORY
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # If running as script
except NameError:
    BASE_DIR = os.getcwd()  # If running interactively

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Remove unwanted characters and normalize spacing."""
    if not text:
        return ""
    text = str(text).strip()
    text = text.replace("✔", " Selected ").replace("✓", " Selected ")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# ==========================
# 4️⃣ CONVERT TABLE ROW TO SENTENCE
# ==========================
def row_to_sentence(cells):
    """Converts a row (list of cell strings) into a readable sentence."""
    if len(cells) == 2:
        return f"{cells[0]} has {cells[1]}."
    return " ".join(cells)

# ==========================
# 5️⃣ PROCESS DOCX FILE CONTENT
# ==========================
def process_docx(filepath):
    """
    Extracts paragraphs and all tables from a Word docx file,
    converting every table row into a readable sentence.
    """
    doc = Document(filepath)
    text_data = []

    # --- Extract paragraphs ---
    for para in doc.paragraphs:
        txt = clean_text(para.text)
        if txt:
            text_data.append(txt)

    # --- Extract every table row ---
    for table in doc.tables:
        for row in table.rows:
            cells = [clean_text(cell.text) for cell in row.cells if clean_text(cell.text)]
            if cells:
                if len(cells) == 2:
                    sentence = f"{cells[0]} has {cells[1]}."
                else:
                    sentence = " ".join(cells)
                text_data.append(sentence)

    return text_data

# ==========================
# 6️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """Finds the best matching question and extracts its answer."""
    best_score = 0
    best_index = -1

    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()
        if not line:
            break
        if line.endswith("?"):
            break
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        if re.match(r"^(how|what|when|why|which|who)\b", line.lower()):
            break
        response_lines.append(line)

    response = " ".join(response_lines).strip()
    if not response or response.lower().startswith(("how", "what", "when", "which", "who")):
        return ""

    return response

# ==========================
# 7️⃣ MAIN PROCESSING LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    text_data = process_docx(filepath)

    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Enhanced extraction completed with full table handling.")


✅ Processed: Baringo_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Baringo_gendered_enterprise.xlsx
✅ Processed: Bomet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
✅ Processed: Bungoma_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bungoma_gendered_enterprise.xlsx
✅ Processed: Busia_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Busia_gendered_enterprise.xlsx
✅ Processed: Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Elgeyo_gendered_enterprise.xlsx
✅ Processed: Embu_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Embu_gendered_enterprise.xlsx
✅ Processed: Garissa_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Garissa_gendered_enterprise.xlsx

In [1]:
import os
import re
import json
import pandas as pd
from docx2python import docx2python
from rapidfuzz import fuzz

# ==========================
# 1. AUTO-DETECT WORKING DIRECTORY
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2. LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3. CLEAN TEXT
# ==========================
def clean_text(text):
    if not text:
        return ""
    text = str(text).replace("\n", " ").strip()
    text = text.replace("✔", " Selected ").replace("✓", " Selected ")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# ==========================
# 4. PROCESS DOCX FILE WITH TABLES
# ==========================
def process_docx(filepath):
    """Extracts all text including tables from DOCX using docx2python."""
    doc = docx2python(filepath, html=False)
    text_data = []

    # Extract paragraphs
    for page in doc.text.split("\n"):
        txt = clean_text(page)
        if txt:
            text_data.append(txt)

    # Extract tables explicitly
    for tbl in doc.body:  # doc.body → pages → tables
        for row in tbl:
            if isinstance(row, list):
                row_cells = [clean_text(" ".join(cell)) if isinstance(cell, list) else clean_text(cell) for cell in row]
                row_line = " | ".join([c for c in row_cells if c])
                if row_line:
                    text_data.append(row_line)

    return text_data

# ==========================
# 5. FIND RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    best_score = 0
    best_index = -1

    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()
        if not line:
            break
        if line.endswith("?") or re.match(r"^(how|what|when|which|who)\b", line.lower()):
            break
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        response_lines.append(line)

    return " ".join(response_lines).strip()

# ==========================
# 6. MAIN LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    text_data = process_docx(filepath)

    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Processed: {file} -> {out_file}")

print("🎯 Extraction completed with full table support (docx2python).")


✅ Processed: Baringo_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Baringo_gendered_enterprise.xlsx
✅ Processed: Bomet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
✅ Processed: Bungoma_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Bungoma_gendered_enterprise.xlsx
✅ Processed: Busia_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Busia_gendered_enterprise.xlsx
✅ Processed: Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Elgeyo_gendered_enterprise.xlsx
✅ Processed: Embu_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Embu_gendered_enterprise.xlsx
✅ Processed: Garissa_gendered_enterprise_selection_interview_guide.docx -> d:\AAAA_Data\GENDER\extracted_excel\Garissa_gendered_enterprise.xlsx

In [2]:
import os
import re
import json
import pandas as pd
from docx2python import docx2python
from rapidfuzz import fuzz

# ==========================
# 1️⃣ AUTO-DETECT WORKING DIRECTORY
# ==========================
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_DIR = os.path.join(BASE_DIR, "extracted_excel")
LOOKUP_JSON = os.path.join(BASE_DIR, "lookup.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD MASTER QUESTION MAPPING
# ==========================
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# ==========================
# 3️⃣ CLEAN TEXT
# ==========================
def clean_text(text):
    """Normalize whitespace and remove special characters."""
    if not text:
        return ""
    text = str(text).replace("\n", " ").replace("✔", " Selected ").replace("✓", " Selected ")
    text = re.sub(r'[\u2022•■▪]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ==========================
# 4️⃣ MERGE SPLIT TABLES ACROSS PAGES
# ==========================
def merge_split_tables(tables):
    """
    Merge tables that were split across pages.
    Tables with similar column counts or missing headers
    are assumed to be continuations of the previous table.
    """
    merged = []
    for tbl in tables:
        rows = []
        for row in tbl:
            if isinstance(row, list):
                row_txt = [clean_text(" ".join(cell)) if isinstance(cell, list) else clean_text(cell) for cell in row]
                row_txt = [c for c in row_txt if c]
                if row_txt:
                    rows.append(row_txt)
        if not rows:
            continue

        if merged and len(rows[0]) <= len(merged[-1][0]):
            # Append continuation
            merged[-1].extend(rows)
        else:
            merged.append(rows)
    return merged

# ==========================
# 5️⃣ PROCESS DOCX FILE
# ==========================
def process_docx(filepath):
    """Extracts paragraphs and table rows (merged) from DOCX."""
    doc = docx2python(filepath, html=False)
    text_data = []

    # --- Extract paragraphs ---
    for page in doc.text.split("\n"):
        txt = clean_text(page)
        if txt:
            text_data.append(txt)

    # --- Extract tables and merge spillovers ---
    all_tables = []
    for tbl in doc.body:
        if isinstance(tbl, list):
            all_tables.append(tbl)

    merged_tables = merge_split_tables(all_tables)

    for tbl in merged_tables:
        for row in tbl:
            if row:
                text_data.append(" | ".join(row))

    return text_data

# ==========================
# 6️⃣ FIND BEST MATCHING RESPONSE
# ==========================
def find_response(all_text, variants, threshold=70):
    """Finds the response following the best matching question."""
    best_score = 0
    best_index = -1

    for variant in variants:
        for i, line in enumerate(all_text):
            score = fuzz.partial_ratio(variant.lower(), line.lower())
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    response_lines = []
    for j in range(best_index + 1, len(all_text)):
        line = all_text[j].strip()
        if not line:
            break
        # Stop if line looks like next question
        if line.endswith("?") or re.match(r"^(how|what|when|which|who)\b", line.lower()):
            break
        if any(fuzz.partial_ratio(line.lower(), v.lower()) >= 65 for v in variants):
            break
        response_lines.append(line)

    return " ".join(response_lines).strip()

# ==========================
# 7️⃣ MAIN PROCESSING LOOP
# ==========================
for file in os.listdir(INPUT_DIR):
    if not file.endswith(".docx"):
        continue

    county = file.split("_")[0].strip()
    filepath = os.path.join(INPUT_DIR, file)

    print(f"🔹 Processing {file} ...")
    text_data = process_docx(filepath)

    output_rows = []
    for theme, questions in QUESTION_MAPPING.items():
        for master_question, variants in questions.items():
            response = find_response(text_data, variants)
            output_rows.append({
                "County": county,
                "Theme": theme,
                "Question": master_question,
                "Response": response
            })

    df = pd.DataFrame(output_rows)
    out_file = os.path.join(OUTPUT_DIR, f"{county}_gendered_enterprise.xlsx")
    df.to_excel(out_file, index=False)

    print(f"✅ Saved: {out_file}")

print("🎯 Extraction completed with improved table support (docx2python + merge spillover tables).")


🔹 Processing Baringo_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Baringo_gendered_enterprise.xlsx
🔹 Processing Bomet_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Bomet_gendered_enterprise.xlsx
🔹 Processing Bungoma_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Bungoma_gendered_enterprise.xlsx
🔹 Processing Busia_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Busia_gendered_enterprise.xlsx
🔹 Processing Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Elgeyo_gendered_enterprise.xlsx
🔹 Processing Embu_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved: d:\AAAA_Data\GENDER\extracted_excel\Embu_gendered_enterprise.xlsx
🔹 Processing Garissa_gendered_enterprise_selection_interview_guide.docx ...
✅ Saved