In [15]:
import os
import re
import json
import pandas as pd
from docx import Document
from rapidfuzz import fuzz
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# ==========================
# 1️⃣ AUTO-DETECT WORKING DIRECTORY
# ==========================
# Anchor every path on the script's own folder. In interactive sessions
# (e.g. a notebook) ``__file__`` is undefined, so fall back to the current
# working directory instead.
try:
    BASE_DIR = os.path.abspath(__file__)
except NameError:
    BASE_DIR = os.getcwd()
else:
    BASE_DIR = os.path.dirname(BASE_DIR)

# Input questionnaires, output workbooks and the question lookup file all
# live directly under BASE_DIR.
INPUT_DIR, OUTPUT_DIR, LOOKUP_JSON = (
    os.path.join(BASE_DIR, entry)
    for entry in ("Questionnaires", "extracted_excel", "lookup.json")
)

# Create the output folder up front; a pre-existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD QUESTION MAPPING
# ==========================
# The lookup file is iterated below as
# {theme: {canonical question: [variant wording, ...]}}.
with open(LOOKUP_JSON, "r", encoding="utf-8") as f:
    QUESTION_MAPPING = json.load(f)

# Theme names exactly as they appear as top-level keys in the lookup file.
ALL_THEMES = set(QUESTION_MAPPING)

# Every known wording of every question, trimmed and lower-cased so it can
# be compared directly against normalised document lines.
ALL_QUESTION_VARIANTS = set()
for question_dict in QUESTION_MAPPING.values():
    for variants in question_dict.values():
        ALL_QUESTION_VARIANTS.update(v.strip().lower() for v in variants)

# ==========================
# 3️⃣ CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Return *text* as a trimmed string with whitespace runs collapsed.

    Any falsy input (``None``, ``""``, ``0``, ...) yields an empty string.
    Other values are converted with ``str()``, stripped at both ends, and
    every run of whitespace is replaced by a single space.
    """
    if text:
        trimmed = str(text).strip()
        return re.sub(r'\s+', ' ', trimmed)
    return ""

# ==========================
# 4️⃣ CHECK IF LINE IS QUESTION OR THEME
# ==========================
def is_question_or_theme(line, threshold=90):
    """Tell whether *line* fuzzily matches a known question or theme name.

    The line is stripped and lower-cased, then compared with
    ``fuzz.partial_ratio`` against every entry of ``ALL_QUESTION_VARIANTS``
    and against every lower-cased name in ``ALL_THEMES``.

    Args:
        line: Text to classify.
        threshold: Minimum ``partial_ratio`` score that counts as a match.

    Returns:
        ``True`` as soon as any comparison reaches *threshold*; ``False``
        for blank input or when nothing matches.

    NOTE(review): ``partial_ratio`` scores the best alignment of the shorter
    string inside the longer one, so very short lines may score highly
    against long questions -- confirm that this is the intended behaviour.
    """
    candidate = line.strip().lower()
    if not candidate:
        return False

    # Question wordings are already normalised when the set is built.
    if any(
        fuzz.partial_ratio(candidate, known) >= threshold
        for known in ALL_QUESTION_VARIANTS
    ):
        return True

    # Theme names keep their original casing, so lower-case them here.
    return any(
        fuzz.partial_ratio(candidate, name.lower()) >= threshold
        for name in ALL_THEMES
    )

# ==========================
# 5️⃣ SMART FIND_RESPONSE FUNCTION
# ==========================
def find_response(all_text, variants, threshold=70):
    """Extract the answer that follows the best-matching question line.

    Every wording in *variants* is scored against every line of *all_text*
    with ``fuzz.partial_ratio``; the first line reaching the highest score
    is taken as the question line. The response is then assembled from:

    1. any text that follows the question wording on that same line, and
    2. every subsequent non-blank line up to (not including) the next line
       that ``is_question_or_theme`` recognises.

    The collected text is split into sentences with ``sent_tokenize`` and
    sentences that themselves look like a question or theme are dropped.

    Args:
        all_text: Document lines in reading order.
        variants: Alternative wordings of the question being looked up.
        threshold: Minimum ``partial_ratio`` score for the best line to be
            accepted as the question.

    Returns:
        The response as a single space-joined string, or ``""`` when no
        line reaches *threshold* or no answer text is found.
    """
    # Lower-case each line once instead of once per (variant, line) pair.
    lowered_lines = [line.lower() for line in all_text]

    best_score = 0
    best_index = -1
    for variant in variants:
        variant_lower = variant.lower()
        for i, line_lower in enumerate(lowered_lines):
            score = fuzz.partial_ratio(variant_lower, line_lower)
            # Strict ">" keeps the earliest line on ties.
            if score > best_score:
                best_score = score
                best_index = i

    if best_score < threshold or best_index == -1:
        return ""

    response_lines = []

    # An answer typed on the same line as the question: keep whatever
    # follows the first variant that occurs verbatim (case-insensitively).
    matched_line = all_text[best_index].strip()
    matched_lower = matched_line.lower()
    for variant in variants:
        pos = matched_lower.find(variant.lower())
        if pos == -1:
            continue
        tail = matched_line[pos + len(variant):].strip()
        if tail and not is_question_or_theme(tail):
            response_lines.append(tail)
        break

    # Collect the following lines until the next question/theme line.
    for raw_line in all_text[best_index + 1:]:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no answer text; appending them would only
            # inject runs of spaces into the joined response.
            continue
        if is_question_or_theme(line):
            break
        response_lines.append(line)

    full_response = " ".join(response_lines)
    if not full_response:
        return ""

    # Drop any individual sentence that still looks like a question/theme.
    kept = [
        sentence
        for sentence in sent_tokenize(full_response)
        if not is_question_or_theme(sentence)
    ]
    return " ".join(kept).strip()

# ==========================
# 6️⃣ PROCESS SINGLE DOCX FILE
# ==========================
def process_docx(filepath):
    """Read one .docx questionnaire and look up a response per question.

    Text is gathered from all body paragraphs first and then from every
    table cell (table by table, row by row), each piece normalised with
    ``clean_text``. ``find_response`` is then run over those lines for
    every question in ``QUESTION_MAPPING``.

    NOTE(review): paragraphs and tables are concatenated rather than
    interleaved, so the line order may differ from the visual order of the
    document -- confirm this suits the questionnaire layout.

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        A list of ``{"Theme", "Question", "Response"}`` dicts, one per
        canonical question, in ``QUESTION_MAPPING`` iteration order.
    """
    document = Document(filepath)

    lines = [clean_text(paragraph.text) for paragraph in document.paragraphs]
    for table in document.tables:
        for row in table.rows:
            lines.extend(clean_text(cell.text) for cell in row.cells)

    return [
        {
            "Theme": theme,
            "Question": canonical,
            "Response": find_response(lines, variants),
        }
        for theme, question_dict in QUESTION_MAPPING.items()
        for canonical, variants in question_dict.items()
    ]

# ==========================
# 7️⃣ PROCESS ALL DOCX FILES
# ==========================
# Convert every questionnaire in INPUT_DIR into one Excel workbook.
# sorted() makes the processing order deterministic; os.listdir() returns
# entries in arbitrary order.
for filename in sorted(os.listdir(INPUT_DIR)):
    # Skip anything that is not a Word document (extension compared
    # case-insensitively) as well as the "~$..." owner/lock files Word
    # leaves next to open documents, which are not readable .docx packages.
    if filename.startswith("~$") or not filename.lower().endswith(".docx"):
        continue

    filepath = os.path.join(INPUT_DIR, filename)
    print(f"Processing: {filename}")
    data = process_docx(filepath)
    df = pd.DataFrame(data)

    # File names look like "<County>_gendered_enterprise_...docx" and the
    # county itself may contain underscores ("Elgeyo_Marakwet", "Homa_Bay"),
    # so cut at the "_gendered" marker rather than at the first underscore.
    # Names without the marker fall back to the text before the first "_".
    county, marker, _rest = filename.partition("_gendered")
    if not marker:
        county = filename.split("_")[0]
    county = county.strip()

    output_name = f"{county}_gendered_enterprise.xlsx"
    output_path = os.path.join(OUTPUT_DIR, output_name)

    df.to_excel(output_path, index=False)

print("✅ Done. Extracted Excel files are saved in:", OUTPUT_DIR)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing: Baringo_gendered_enterprise_selection_interview_guide.docx
Processing: Bomet_gendered_enterprise_selection_interview_guide.docx
Processing: Bungoma_gendered_enterprise_selection_interview_guide.docx
Processing: Busia_gendered_enterprise_selection_interview_guide.docx
Processing: Elgeyo_Marakwet_gendered_enterprise_selection_interview_guide.docx
Processing: Embu_gendered_enterprise_selection_interview_guide.docx
Processing: Garissa_gendered_enterprise_selection_interview_guide.docx
Processing: Homa_Bay_gendered_enterprise_selection_interview_guide.docx
Processing: Isiolo_gendered_enterprise_selection_interview_guide.docx
Processing: Kajiado_gendered_enterprise_selection_interview_guide.docx
Processing: Kakamega_gendered_enterprise_selection_interview_guide.docx
Processing: Kericho_gendered_enterprise_selection_interview_guide.docx
Processing: Kiambu_gendered_enterprise_selection_interview_guide.docx
Processing: Kilifi_gendered_enterprise_selection_interview_guide.docx
Proces