In [2]:
import os
import re
from docx import Document

# ==========================
# AUTO-DETECT WORKING DIRECTORY
# ==========================
# When executed as a script, __file__ is defined and paths are anchored to the
# script's own folder. Interactive sessions (Jupyter, console) do not define
# __file__, so the current working directory is used instead.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Folder scanned for .docx files, and the text report that gets written.
INPUT_DIR = os.path.join(BASE_DIR, "Questionnaires")
OUTPUT_FILE = os.path.join(BASE_DIR, "unique_questions.txt")

# ==========================
# CLEAN TEXT FUNCTION
# ==========================
def clean_text(text):
    """Return *text* trimmed, with runs of spaces/tabs squeezed to one space.

    Falsy input (``None`` or an empty string) yields ``""``. Punctuation,
    digits and newline characters inside the text are left untouched; only
    leading/trailing whitespace and repeated spaces or tabs are affected.
    """
    if text:
        trimmed = text.strip()
        # Collapse each run of spaces and/or tabs into a single space.
        return re.sub(r'[ \t]+', ' ', trimmed)
    return ""

# ==========================
# EXTRACT QUESTIONS FROM DOCX
# ==========================
def extract_questions(filepath):
    """Collect the unique question lines found in a .docx file.

    Body paragraphs and table cells are both scanned. python-docx reports
    soft line breaks inside a paragraph, and the paragraph boundaries inside
    a table cell, as newline characters, so each text block is split into
    lines and every line is tested on its own. This finds a question that is
    followed by other text in the same cell, and keeps a multi-line block
    from being reported as a single "question".

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        A list of distinct cleaned lines that end with "?", sorted
        case-insensitively (ties broken by the exact text so the order is
        the same on every run).
    """
    questions = set()
    doc = Document(filepath)

    def collect(text):
        # Check line by line rather than the block as a whole.
        for line in (text or "").splitlines():
            line = clean_text(line)
            if line.endswith("?"):
                questions.add(line)

    # Paragraphs
    for para in doc.paragraphs:
        collect(para.text)

    # Tables (the set collapses any text that is seen more than once)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                collect(cell.text)

    # Sort for readability; the second key component makes the order of
    # entries that differ only by letter case deterministic.
    return sorted(questions, key=lambda q: (q.lower(), q))

# ==========================
# MAIN LOOP
# ==========================
# Report lines, in output order: a header per file, its questions (or a
# warning when there are none), then a blank separator line.
results = []

for file in sorted(os.listdir(INPUT_DIR)):
    # Skip the temporary "~$..." files Word creates while a document is open,
    # and anything that is not a .docx. The extension is compared
    # case-insensitively so files such as "SURVEY.DOCX" are not skipped.
    if file.startswith("~$") or not file.lower().endswith(".docx"):
        continue

    filepath = os.path.join(INPUT_DIR, file)
    questions = extract_questions(filepath)

    results.append(f"===== {file} =====")
    if questions:
        # No dash prefix: each question is written exactly as extracted.
        results.extend(questions)
    else:
        results.append("⚠️ No questions found")
    results.append("")  # Blank line separator

# ==========================
# WRITE TO TEXT FILE
# ==========================
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write("\n".join(results))

print(f"✅ Extraction complete. Unique questions per file saved to: {OUTPUT_FILE}")


✅ Extraction complete. Unique questions per file saved to: d:\AAAA_Data\GENDER\unique_questions.txt
