For a Single Word File

In [None]:
from docx import Document
import html
import re
import pandas as pd

# ------------------- Helpers -------------------
def is_list_item(paragraph):
    """Report whether *paragraph* carries numbering properties.

    The paragraph counts as a list item when its underlying element has a
    ``pPr`` child and that child in turn has a ``numPr`` child.
    """
    props = paragraph._element.pPr
    if props is None:
        return False
    return props.numPr is not None

def convert_run_to_html(run):
    """Render a single run as an HTML fragment.

    The run's text is HTML-escaped and then wrapped, innermost first, in
    ``<strong>``, ``<em>`` and ``<u>`` for each of ``run.bold``,
    ``run.italic`` and ``run.underline`` that is truthy.  A run without text
    yields an empty string.
    """
    fragment = html.escape(run.text)
    if fragment:
        wrappers = (
            (run.bold, "<strong>", "</strong>"),
            (run.italic, "<em>", "</em>"),
            (run.underline, "<u>", "</u>"),
        )
        for enabled, opening, closing in wrappers:
            if enabled:
                fragment = opening + fragment + closing
    return fragment

def paragraph_to_html(paragraph):
    """Convert one paragraph into a single HTML element string.

    Returns:
        ``""`` when the runs contain no visible text; ``<li>…</li>`` when
        the paragraph is a list item; ``<hN>…</hN>`` when the style name
        contains "heading"; otherwise ``<p>…</p>``.

    The heading level ``N`` is built from the decimal digits found in the
    style name (default 2 when there are none) and is clamped to 1–6, the
    only heading levels HTML defines.
    """
    text = "".join(convert_run_to_html(run) for run in paragraph.runs)
    if not text.strip():
        return ""
    if is_list_item(paragraph):
        return f"<li>{text}</li>"
    # The style object or its name can be missing/None; treat both as "".
    style = (getattr(paragraph.style, "name", "") or "").lower()
    if "heading" in style:
        # isdecimal (not isdigit) so every kept character is accepted by int().
        digits = "".join(filter(str.isdecimal, style))
        # Style names such as "Heading 7" would otherwise yield invalid <h7>.
        level = min(max(int(digits), 1), 6) if digits else 2
        return f"<h{level}>{text}</h{level}>"
    return f"<p>{text}</p>"

# ------------------- Extractors -------------------
def extract_title(docx_path):
    """Return the first non-empty paragraph after the "Report Title" line.

    Paragraphs are scanned in document order.  The first one whose text
    contains "report title" (case-insensitive) is the marker and is itself
    skipped; the stripped text of the next non-blank paragraph is returned.
    Returns ``""`` when the marker, or any text after it, is missing.
    """
    paragraphs = iter(Document(docx_path).paragraphs)

    # Phase 1: consume paragraphs up to and including the marker line.
    for para in paragraphs:
        if "report title" in para.text.strip().lower():
            break
    else:
        return ""

    # Phase 2: the same iterator now starts right after the marker.
    for para in paragraphs:
        candidate = para.text.strip()
        if candidate:
            return candidate
    return ""


# ✅ Description (Introduction → stop before Report Summary/8.)
def extract_description(docx_path):
    """Build description HTML from "Introduction" up to the report summary.

    NOTE: a second ``extract_description`` is defined later in this cell and
    rebinds the name, so this version is only in effect until that later
    definition runs.

    Capture starts at the first paragraph containing "introduction"
    (inclusive).  It stops *before* the first later paragraph that starts
    with "8." or contains "report summary"; such markers are ignored until
    capture has started.  Consecutive list items are wrapped in one
    ``<ul>…</ul>``.

    Returns:
        The captured paragraphs as HTML, one element per line.
    """
    doc = Document(docx_path)
    html_output, inside_list = [], False
    started = False

    for para in doc.paragraphs:
        low = para.text.strip().lower()

        if not started:
            if "introduction" not in low:
                continue
            started = True
        elif low.startswith("8.") or "report summary" in low:
            # Check the stop marker before emitting, so the summary heading
            # itself is left out of the description.
            break

        html_part = paragraph_to_html(para)
        if not html_part:
            continue

        is_item = html_part.startswith("<li>")
        if is_item and not inside_list:
            html_output.append("<ul>")
            inside_list = True
        elif not is_item and inside_list:
            html_output.append("</ul>")
            inside_list = False
        html_output.append(html_part)

    if inside_list:
        html_output.append("</ul>")
    return "\n".join(html_output)

# ✅ TOC (after Heading 9 / Table of Contents)
def extract_toc(docx_path):
    """Build the table-of-contents HTML.

    Capture begins after the first paragraph containing "table of contents"
    (that line is not emitted).  A paragraph containing "list of figures" is
    emitted and marks the final section; after it, the first paragraph with
    a heading style or a leading "N." / "N)" number ends the capture.
    Consecutive list items are wrapped in one ``<ul>…</ul>``.

    Returns:
        The captured paragraphs as HTML, one element per line.
    """
    doc = Document(docx_path)
    html_output, inside_list, capture = [], False, False
    end_reached = False

    for para in doc.paragraphs:
        text = para.text.strip()
        low = text.lower()

        # Start condition: the marker line itself is skipped.
        if not capture:
            if "table of contents" in low:
                capture = True
            continue

        if "list of figures" in low:
            # Keep this line and whatever items follow it; do not stop yet.
            end_reached = True
        elif end_reached:
            # Any further heading or numbered line ends the section.
            style = (getattr(para.style, "name", "") or "").lower()
            if "heading" in style or re.match(r"^\d+[\.\)]\s", text):
                break

        html_part = paragraph_to_html(para)
        if not html_part:
            continue

        # Every emitted line, including "List of Figures", goes through the
        # same <ul> bookkeeping so it cannot land inside an open list or
        # appear as a bare <li>.
        is_item = html_part.startswith("<li>")
        if is_item and not inside_list:
            html_output.append("<ul>")
            inside_list = True
        elif not is_item and inside_list:
            html_output.append("</ul>")
            inside_list = False
        html_output.append(html_part)

    if inside_list:
        html_output.append("</ul>")
    return "\n".join(html_output)


# ✅ extract description (Heading 1 → Heading 7 only)

def extract_description(docx_path):
    """Build description HTML between two fixed section headings.

    Paragraphs with empty text are skipped.  Capture starts at the first
    paragraph containing "introduction and strategic context" (inclusive)
    and ends before the first captured paragraph containing
    "report summary, faqs, and seo schema".  Consecutive list items are
    wrapped in one ``<ul>…</ul>``.  The result has one element per line.
    """
    pieces = []
    list_open = False
    capturing = False

    for para in Document(docx_path).paragraphs:
        lowered = para.text.strip().lower()
        if not lowered:
            continue

        if not capturing:
            if "introduction and strategic context" not in lowered:
                continue
            capturing = True

        if "report summary, faqs, and seo schema" in lowered:
            break

        fragment = paragraph_to_html(para)
        if not fragment:
            continue

        # Open or close the list wrapper whenever the item/non-item state flips.
        is_item = fragment.startswith("<li>")
        if is_item != list_open:
            pieces.append("<ul>" if is_item else "</ul>")
            list_open = is_item
        pieces.append(fragment)

    if list_open:
        pieces.append("</ul>")

    return "\n".join(pieces)

# ✅ Methodology (FAQ Table or Paragraphs Qn:/A: format)
def extract_methodology(docx_path):
    """Extract the FAQ block as HTML, one ``<p>`` per question/answer pair.

    Sources are tried in this order:

    1. The first table whose header row has cells reading "question" and
       "answer" (case-insensitive) and which yields at least one pair.
       The question and answer columns are located by header position.
    2. Paragraphs after a line containing "top 5 faqs", either as
       "Qn: … An: …" in a single paragraph or as a "Qn:" paragraph followed
       by an "A:" paragraph.

    Returns:
        The pairs joined by newlines, numbered Q1/A1, Q2/A2, … in the order
        found; ``""`` when nothing matches.
    """
    doc = Document(docx_path)
    faqs, q_count = [], 0

    def faq_html(number, question, answer):
        # Single place that defines the emitted markup for one pair.
        return (
            f"<p><strong>Q{number}: {html.escape(question)}</strong><br>"
            f"A{number}: {html.escape(answer)}</p>"
        )

    # --- Case 1: FAQ table ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # no header row to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "question" not in headers or "answer" not in headers:
            continue
        # Read the columns the headers actually name, not columns 0 and 1.
        q_col, a_col = headers.index("question"), headers.index("answer")
        for row in rows[1:]:
            cells = row.cells
            if len(cells) <= max(q_col, a_col):
                continue  # short row: nothing usable
            q_text = cells[q_col].text.strip()
            a_text = cells[a_col].text.strip()
            if q_text and a_text:
                q_count += 1
                faqs.append(faq_html(q_count, q_text, a_text))
        if faqs:
            return "\n".join(faqs)  # first table with content wins

    # --- Case 2 & 3: text-based FAQs ---
    both_re = re.compile(r"Q\d+[:.]\s*(.*?)\s*A\d*[:.]\s*(.*)", re.IGNORECASE)
    q_re = re.compile(r"Q\d+[:.]\s*(.*)", re.IGNORECASE)
    a_re = re.compile(r"A\d*[:.]\s*(.*)", re.IGNORECASE)

    capture = False
    current_q = None  # question waiting for its answer paragraph
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue

        # Start after the "Top 5 FAQs" heading.
        if "top 5 faqs" in text.lower():
            capture = True
            continue
        if not capture:
            continue

        # Case 3: question and answer in the same paragraph.
        both_match = both_re.match(text)
        if both_match:
            q_count += 1
            faqs.append(
                faq_html(
                    q_count,
                    both_match.group(1).strip(),
                    both_match.group(2).strip(),
                )
            )
            continue

        # Case 2: question in one paragraph, answer in a later one.
        q_match = q_re.match(text)
        if q_match:
            current_q = q_match.group(1).strip()
            continue

        a_match = a_re.match(text)
        if a_match and current_q is not None:
            q_count += 1
            faqs.append(faq_html(q_count, current_q, a_match.group(1).strip()))
            current_q = None  # an answer is consumed by exactly one question

    return "\n".join(faqs)



# ✅ Meta Description (first para after Introduction)
def extract_meta_description(docx_path):
    """Return the first non-empty paragraph after the "introduction" line.

    The marker is the first paragraph whose text contains "introduction"
    (case-insensitive); it is not returned itself.  Returns ``""`` when the
    marker, or any non-blank text after it, is missing.
    """
    texts = [para.text.strip() for para in Document(docx_path).paragraphs]
    marker_at = next(
        (pos for pos, value in enumerate(texts) if "introduction" in value.lower()),
        None,
    )
    if marker_at is None:
        return ""
    return next((value for value in texts[marker_at + 1:] if value), "")

# ✅ Detect new subheading (for SeoTitle/BreadcrumbText)
def _is_new_subheading(text: str) -> bool:
    """Tell whether *text* looks like the start of a new sub-section.

    After stripping, a line qualifies when it begins with one of the marker
    emoji, with a lettered label such as "A. " or "A.3. ", or with a
    numbered label such as "12. ".  Blank input never qualifies.
    """
    candidate = text.strip()
    if candidate == "":
        return False
    has_emoji_marker = candidate.startswith(("📊", "❓", "🧩"))
    has_letter_label = re.match(r"^[A-Z](?:\.\d+)?\.\s", candidate) is not None
    has_number_label = re.match(r"^\d+\.\s", candidate) is not None
    return has_emoji_marker or has_letter_label or has_number_label

# ✅ SeoTitle & BreadcrumbText (A.3. Headline block)
def extract_seo_title_and_breadcrumb(docx_path):
    """Return ``(seo_title, breadcrumb_text)`` from the headline block.

    The block starts after the first paragraph that contains
    "a.3. headline" or begins with "headline" (case-insensitive).  Up to two
    non-blank lines are collected from it, stopping early at the first line
    that ``_is_new_subheading`` accepts.  Missing values are ``""``.
    """
    paragraphs = iter(Document(docx_path).paragraphs)

    # Skip everything up to and including the headline marker.
    for para in paragraphs:
        lowered = para.text.strip().lower()
        if lowered.startswith("headline") or "a.3. headline" in lowered:
            break

    collected = []
    for para in paragraphs:
        stripped = para.text.strip()
        if _is_new_subheading(stripped):
            break
        if not stripped:
            continue
        collected.append(stripped)
        if len(collected) == 2:
            break

    # Pad so both positions always exist.
    seo_title, breadcrumb_text = (collected + ["", ""])[:2]
    return seo_title, breadcrumb_text

# ------------------- Runner -------------------

# Input document and output workbook for the single-file run.
doc_path = r"C:\Users\Vishnu\Documents\extracted_docs\extracted_docs\Aerospace Floor Panel Market.docx"
output_path = r"C:\Users\Vishnu\Documents\extracted_docs\SingleFile_Output9.xlsx"

# Run every extractor against the same document.
title = extract_title(doc_path)
description_html = extract_description(doc_path)
toc_html = extract_toc(doc_path)
methodology_html = extract_methodology(doc_path)
meta = extract_meta_description(doc_path)
seo_title, breadcrumb_text = extract_seo_title_and_breadcrumb(doc_path)

# One spreadsheet row; the key order fixes the column order.
row = {
    "Title": title,
    "Description": description_html,
    "TOC": toc_html,
    "Methodology": methodology_html,
    "Meta Description": meta,
    "SeoTitle": seo_title,
    "BreadcrumbText": breadcrumb_text,
}

# ✅ Save single file result in Excel
df = pd.DataFrame([row])
df.to_excel(output_path, index=False)

print(f"✅ Done! Extracted data saved in {output_path}")


For Multiple Word Files


In [None]:
from docx import Document
import html
import re
import os
import pandas as pd

# ------------------- Helpers -------------------
def is_list_item(paragraph):
    """Report whether *paragraph* carries numbering properties.

    True only when the paragraph's element has a ``pPr`` child that itself
    has a ``numPr`` child.
    """
    para_props = paragraph._element.pPr
    if para_props is None:
        return False
    return para_props.numPr is not None

def convert_run_to_html(run):
    """Render a single run as an HTML fragment.

    The escaped run text is wrapped, innermost first, in ``<strong>``,
    ``<em>`` and ``<u>`` for each truthy ``bold`` / ``italic`` /
    ``underline`` attribute.  Empty text gives an empty string.
    """
    escaped = html.escape(run.text)
    if not escaped:
        return ""
    markup = escaped
    for flag, opening, closing in (
        (run.bold, "<strong>", "</strong>"),
        (run.italic, "<em>", "</em>"),
        (run.underline, "<u>", "</u>"),
    ):
        if flag:
            markup = opening + markup + closing
    return markup

def paragraph_to_html(paragraph):
    """Convert one paragraph into a single HTML element string.

    Returns:
        ``""`` when the runs contain no visible text; ``<li>…</li>`` when
        the paragraph is a list item; ``<hN>…</hN>`` when the style name
        contains "heading"; otherwise ``<p>…</p>``.

    The heading level ``N`` is built from the decimal digits found in the
    style name (default 2 when there are none) and is clamped to 1–6, the
    only heading levels HTML defines.
    """
    text = "".join(convert_run_to_html(run) for run in paragraph.runs)
    if not text.strip():
        return ""
    if is_list_item(paragraph):
        return f"<li>{text}</li>"
    # The style object or its name can be missing/None; treat both as "".
    style = (getattr(paragraph.style, "name", "") or "").lower()
    if "heading" in style:
        # isdecimal (not isdigit) so every kept character is accepted by int().
        digits = "".join(filter(str.isdecimal, style))
        # Style names such as "Heading 7" would otherwise yield invalid <h7>.
        level = min(max(int(digits), 1), 6) if digits else 2
        return f"<h{level}>{text}</h{level}>"
    return f"<p>{text}</p>"

# ------------------- Extractors -------------------

def extract_title(docx_path):
    """Return the first non-empty paragraph after the "Report Title" line.

    The marker is the first paragraph containing "report title"
    (case-insensitive) and is not returned itself.  Returns ``""`` when the
    marker, or any non-blank text after it, is missing.
    """
    texts = [para.text.strip() for para in Document(docx_path).paragraphs]
    marker_at = next(
        (pos for pos, value in enumerate(texts) if "report title" in value.lower()),
        None,
    )
    if marker_at is None:
        return ""
    return next((value for value in texts[marker_at + 1:] if value), "")

def extract_description(docx_path):
    """Build description HTML between two fixed section headings.

    Capture starts at the first paragraph containing
    "introduction and strategic context" (inclusive) and ends before the
    first captured paragraph containing
    "report summary, faqs, and seo schema".  Consecutive list items are
    wrapped in one ``<ul>…</ul>``.  The result has one element per line.
    """
    pieces, list_open, capturing = [], False, False

    for para in Document(docx_path).paragraphs:
        lowered = para.text.strip().lower()

        if not capturing:
            if "introduction and strategic context" not in lowered:
                continue
            capturing = True

        if "report summary, faqs, and seo schema" in lowered:
            break

        fragment = paragraph_to_html(para)
        if not fragment:
            continue

        # Open or close the list wrapper whenever the item/non-item state flips.
        is_item = fragment.startswith("<li>")
        if is_item != list_open:
            pieces.append("<ul>" if is_item else "</ul>")
            list_open = is_item
        pieces.append(fragment)

    if list_open:
        pieces.append("</ul>")
    return "\n".join(pieces)

def extract_toc(docx_path):
    """Build the table-of-contents HTML.

    Capture begins after the first paragraph containing "table of contents"
    (that line is not emitted).  A paragraph containing "list of figures" is
    emitted and marks the final section; after it, the first paragraph with
    a heading style or a leading "N." / "N)" number ends the capture.
    Consecutive list items are wrapped in one ``<ul>…</ul>``.

    Returns:
        The captured paragraphs as HTML, one element per line.
    """
    doc = Document(docx_path)
    html_output, inside_list, capture = [], False, False
    end_reached = False
    for para in doc.paragraphs:
        text = para.text.strip()
        low = text.lower()

        # Start condition: the marker line itself is skipped.
        if not capture:
            if "table of contents" in low:
                capture = True
            continue

        if "list of figures" in low:
            # Keep this line and whatever items follow it; do not stop yet.
            end_reached = True
        elif end_reached:
            # Any further heading or numbered line ends the section.
            style = (getattr(para.style, "name", "") or "").lower()
            if "heading" in style or re.match(r"^\d+[\.\)]\s", text):
                break

        html_part = paragraph_to_html(para)
        if not html_part:
            continue

        # Every emitted line, including "List of Figures", goes through the
        # same <ul> bookkeeping so it cannot land inside an open list or
        # appear as a bare <li>.
        is_item = html_part.startswith("<li>")
        if is_item and not inside_list:
            html_output.append("<ul>")
            inside_list = True
        elif not is_item and inside_list:
            html_output.append("</ul>")
            inside_list = False
        html_output.append(html_part)
    if inside_list:
        html_output.append("</ul>")
    return "\n".join(html_output)

def extract_methodology(docx_path):
    """Extract the FAQ block as HTML, one ``<p>`` per question/answer pair.

    Sources are tried in this order:

    1. The first table whose header row has cells reading "question" and
       "answer" (case-insensitive) and which yields at least one pair.
       The question and answer columns are located by header position.
    2. Paragraphs after a line containing "top 5 faqs", either as
       "Qn: … An: …" in a single paragraph or as a "Qn:" paragraph followed
       by an "A:" paragraph.

    Returns:
        The pairs joined by newlines, numbered Q1/A1, Q2/A2, … in the order
        found; ``""`` when nothing matches.
    """
    doc = Document(docx_path)
    faqs, q_count = [], 0

    def faq_html(number, question, answer):
        # Single place that defines the emitted markup for one pair.
        return (
            f"<p><strong>Q{number}: {html.escape(question)}</strong><br>"
            f"A{number}: {html.escape(answer)}</p>"
        )

    # --- Case 1: FAQ table ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # no header row to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "question" not in headers or "answer" not in headers:
            continue
        # Read the columns the headers actually name, not columns 0 and 1.
        q_col, a_col = headers.index("question"), headers.index("answer")
        for row in rows[1:]:
            cells = row.cells
            if len(cells) <= max(q_col, a_col):
                continue  # short row: nothing usable
            q_text = cells[q_col].text.strip()
            a_text = cells[a_col].text.strip()
            if q_text and a_text:
                q_count += 1
                faqs.append(faq_html(q_count, q_text, a_text))
        if faqs:
            return "\n".join(faqs)  # first table with content wins

    # --- Case 2 & 3: text-based FAQs ---
    both_re = re.compile(r"Q\d+[:.]\s*(.*?)\s*A\d*[:.]\s*(.*)", re.IGNORECASE)
    q_re = re.compile(r"Q\d+[:.]\s*(.*)", re.IGNORECASE)
    a_re = re.compile(r"A\d*[:.]\s*(.*)", re.IGNORECASE)

    capture = False
    current_q = None  # question waiting for its answer paragraph
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if "top 5 faqs" in text.lower():
            capture = True
            continue
        if not capture:
            continue

        # Case 3: question and answer in the same paragraph.
        both_match = both_re.match(text)
        if both_match:
            q_count += 1
            faqs.append(
                faq_html(
                    q_count,
                    both_match.group(1).strip(),
                    both_match.group(2).strip(),
                )
            )
            continue

        # Case 2: question in one paragraph, answer in a later one.
        q_match = q_re.match(text)
        if q_match:
            current_q = q_match.group(1).strip()
            continue

        a_match = a_re.match(text)
        if a_match and current_q is not None:
            q_count += 1
            faqs.append(faq_html(q_count, current_q, a_match.group(1).strip()))
            current_q = None  # an answer is consumed by exactly one question
    return "\n".join(faqs)

def extract_meta_description(docx_path):
    """Return the first non-empty paragraph after the "introduction" line.

    The first paragraph containing "introduction" (case-insensitive) is the
    marker and is skipped; the stripped text of the next non-blank
    paragraph is returned, or ``""`` when there is none.
    """
    paragraphs = iter(Document(docx_path).paragraphs)

    # Consume paragraphs up to and including the marker line.
    for para in paragraphs:
        if "introduction" in para.text.strip().lower():
            break
    else:
        return ""

    # The same iterator now continues right after the marker.
    for para in paragraphs:
        candidate = para.text.strip()
        if candidate:
            return candidate
    return ""

def _is_new_subheading(text: str) -> bool:
    """Tell whether *text* looks like the start of a new sub-section.

    After stripping, a line qualifies when it begins with one of the marker
    emoji, with a lettered label such as "A. " or "A.3. ", or with a
    numbered label such as "12. ".  Blank input never qualifies.
    """
    candidate = text.strip()
    if candidate == "":
        return False
    if candidate.startswith(("📊", "❓", "🧩")):
        return True
    label_patterns = (r"^[A-Z](?:\.\d+)?\.\s", r"^\d+\.\s")
    return any(re.match(pattern, candidate) for pattern in label_patterns)

def extract_seo_title_and_breadcrumb(docx_path):
    """Return ``(seo_title, breadcrumb_text)`` from the headline block.

    The block starts after the first paragraph that contains
    "a.3. headline" or begins with "headline" (case-insensitive).  Up to two
    non-blank lines are collected from it, stopping early at the first line
    that ``_is_new_subheading`` accepts.  Missing values are ``""``.
    """
    paragraphs = iter(Document(docx_path).paragraphs)

    # Skip everything up to and including the headline marker.
    for para in paragraphs:
        lowered = para.text.strip().lower()
        if lowered.startswith("headline") or "a.3. headline" in lowered:
            break

    collected = []
    for para in paragraphs:
        stripped = para.text.strip()
        if _is_new_subheading(stripped):
            break
        if not stripped:
            continue
        collected.append(stripped)
        if len(collected) == 2:
            break

    # Pad so both positions always exist.
    seo_title, breadcrumb_text = (collected + ["", ""])[:2]
    return seo_title, breadcrumb_text

# ------------------- Runner for All Files -------------------

# Folder scanned for .docx reports and the workbook that receives one row
# per report.
folder_path = r"C:\Users\Vishnu\Documents\extracted_docs\extracted_docs"
output_path = r"C:\Users\Vishnu\Documents\extracted_docs\AllFiles_Output.xlsx"

all_data = []
# os.listdir returns names in arbitrary order; sorting makes the row order
# of the output reproducible between runs.
for file in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so "REPORT.DOCX" is not
    # silently skipped.  Names starting with "~$" are excluded (presumably
    # Word's temporary owner files, which are not readable documents).
    if not file.lower().endswith(".docx") or file.startswith("~$"):
        continue
    doc_path = os.path.join(folder_path, file)
    print(f"Processing: {file}")

    title = extract_title(doc_path)
    description_html = extract_description(doc_path)
    toc_html = extract_toc(doc_path)
    methodology_html = extract_methodology(doc_path)
    meta = extract_meta_description(doc_path)
    seo_title, breadcrumb_text = extract_seo_title_and_breadcrumb(doc_path)

    # Key order fixes the spreadsheet column order.
    all_data.append({
        "File": file,
        "Title": title,
        "Description": description_html,
        "TOC": toc_html,
        "Methodology": methodology_html,
        "Meta Description": meta,
        "SeoTitle": seo_title,
        "BreadcrumbText": breadcrumb_text
    })

df = pd.DataFrame(all_data)
df.to_excel(output_path, index=False)
print(f"✅ Done! Extracted data saved in {output_path}")
