For Single file

In [12]:
from docx import Document
import html
import re
import pandas as pd
import os

# ------------------- Helpers -------------------
def is_list_item(paragraph):
    """Tell whether a paragraph carries direct numbering properties.

    Args:
        paragraph: A python-docx paragraph (anything exposing
            ``_element.pPr`` works).

    Returns:
        True when the paragraph has a ``pPr`` element whose ``numPr`` child
        is present, False otherwise.
    """
    properties = paragraph._element.pPr
    if properties is None:
        # No paragraph properties at all, so there cannot be numbering.
        return False
    return properties.numPr is not None

def convert_run_to_html(run):
    """Convert one run of text to HTML-escaped inline markup.

    Args:
        run: Object with ``text``, ``bold``, ``italic`` and ``underline``
            attributes (a python-docx run).

    Returns:
        "" for a run without text; otherwise the escaped text wrapped, from
        the inside out, in <strong>, <em> and <u> for each truthy flag.
    """
    markup = html.escape(run.text)
    if not markup:
        return ""
    # Order matters: bold is innermost, underline outermost.
    for enabled, tag in ((run.bold, "strong"), (run.italic, "em"), (run.underline, "u")):
        if enabled:
            markup = f"<{tag}>{markup}</{tag}>"
    return markup

def paragraph_to_html(paragraph):
    """Render one paragraph as an HTML fragment.

    Args:
        paragraph: A python-docx paragraph.

    Returns:
        "" when the runs contain no visible text, "<li>...</li>" for list
        items, "<hN>...</hN>" when the style name contains "heading", and
        "<p>...</p>" otherwise.
    """
    text = "".join(convert_run_to_html(run) for run in paragraph.runs)
    if not text.strip():
        return ""
    if is_list_item(paragraph):
        return f"<li>{text}</li>"
    style = getattr(paragraph.style, "name", "").lower()
    if "heading" in style:
        digits = "".join(filter(str.isdecimal, style))
        # HTML only defines <h1>..<h6>. Clamp so deeper heading styles (or
        # names carrying extra digits) never produce an invalid tag such as
        # <h7> or <h21>. A heading style without a number stays <h2>.
        level = min(max(int(digits), 1), 6) if digits else 2
        return f"<h{level}>{text}</h{level}>"
    return f"<p>{text}</p>"
# ------------------- Extractors -------------------
def extract_title(docx_path):
    """Return the report title stored in a .docx file.

    The title is the first non-empty paragraph that follows a marker
    paragraph containing "report title" or "full title" (case-insensitive).

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped title text, or "" when no marker is found or nothing
        non-empty follows it.
    """
    marker_seen = False

    # Single pass over the paragraphs. Both marker formats mean "take the
    # next non-empty paragraph", so one flag covers them and the paragraph
    # list is read only once instead of being re-fetched in an inner scan.
    for para in Document(docx_path).paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if marker_seen:
            return text
        low = text.lower()
        if "report title" in low or "full title" in low:
            marker_seen = True

    return ""
# ✅ Extract description (Heading 1 → Heading 7 only, i.e. from "Introduction and Strategic Context" up to "Report Summary, FAQs, and SEO Schema")
def extract_description(docx_path):
    """Build the HTML description from the body of a report.

    Capture starts at the paragraph containing "introduction and strategic
    context" (that paragraph is included) and stops before the paragraph
    containing "report summary, faqs, and seo schema". Consecutive list
    items are wrapped in a single <ul>.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The HTML fragments joined with newlines ("" if nothing captured).
    """
    start_marker = "introduction and strategic context"
    stop_marker = "report summary, faqs, and seo schema"

    pieces = []
    list_open = False
    started = False

    for para in Document(docx_path).paragraphs:
        lowered = para.text.strip().lower()
        if not lowered:
            continue

        if not started:
            if start_marker not in lowered:
                continue
            started = True

        if stop_marker in lowered:
            break

        fragment = paragraph_to_html(para)
        if not fragment:
            continue

        # Open or close the <ul> wrapper whenever we cross a list boundary.
        is_item = fragment.startswith("<li>")
        if is_item and not list_open:
            pieces.append("<ul>")
        elif list_open and not is_item:
            pieces.append("</ul>")
        list_open = is_item
        pieces.append(fragment)

    if list_open:
        pieces.append("</ul>")

    return "\n".join(pieces)
# ✅ TOC (after Heading 9 / Table of Contents)
def extract_toc(docx_path):
    """Extract the table of contents of a report as HTML.

    Capture starts after the paragraph containing "table of contents". The
    "List of Figures" paragraph and the entries below it are included;
    capture then stops at the next heading-styled or numbered ("1. " /
    "1) ") paragraph. Consecutive list items are wrapped in a single <ul>.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The concatenated HTML (no separators), stripped; "" if no TOC found.
    """
    doc = Document(docx_path)
    html_output = []
    inside_list = False
    capture = False
    end_reached = False

    def emit(fragment):
        """Append a fragment, opening/closing the <ul> wrapper as needed."""
        nonlocal inside_list
        is_item = fragment.startswith("<li>")
        if is_item and not inside_list:
            html_output.append("<ul>")
        elif inside_list and not is_item:
            html_output.append("</ul>")
        inside_list = is_item
        html_output.append(fragment)

    for para in doc.paragraphs:
        text = para.text.strip()
        low = text.lower()

        if not capture:
            # The "Table of Contents" paragraph itself is not emitted.
            if "table of contents" in low:
                capture = True
            continue

        if "list of figures" in low:
            # Keep this paragraph and whatever entries follow it; the real
            # end is detected on a later paragraph.
            end_reached = True
        elif end_reached:
            # Any further heading / numbered sub-heading ends the TOC.
            style = getattr(para.style, "name", "").lower()
            if "heading" in style or re.match(r"^\d+[\.\)]\s", text):
                break

        html_part = paragraph_to_html(para)
        if html_part:
            # Routing "List of Figures" through emit() closes a still-open
            # <ul> first, so the heading is never nested inside the list.
            emit(html_part)

    if inside_list:
        html_output.append("</ul>")
    return "".join(html_output).strip()

# ✅ Methodology (FAQ Table or Paragraphs Qn:/A: format)
def extract_methodology(docx_path):
    """Extract the report's FAQ section as HTML "Qn / An" paragraphs.

    Two layouts are supported, tried in this order:

    1. A table whose header row contains "question" and "answer" columns.
    2. Body paragraphs following a paragraph that mentions "faqs" (or
       starting at the first "Q1..." / "How ..." line), written as
       "Q: ... A: ..." on one line, as a "Q:" line followed by an answer
       paragraph, or as a plain question (how/what/which/why/when/who)
       followed by an answer paragraph. Capture stops at the first
       paragraph mentioning "schema" or "json", or starting with "{".

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        Newline-joined "<p><strong>Qn: ...</strong><br>An: ...</p>" blocks,
        or "" when no question/answer pair is found.
    """
    from docx import Document
    import html
    import re

    doc = Document(docx_path)
    faqs, q_count = [], 0

    def add_pair(question, answer):
        """Append one numbered, HTML-escaped question/answer block."""
        nonlocal q_count
        q_count += 1
        faqs.append(
            f"<p><strong>Q{q_count}: {html.escape(question)}</strong><br>"
            f"A{q_count}: {html.escape(answer)}</p>"
        )

    # --- Case 1: FAQ table ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # a table without rows has no header to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "question" not in headers or "answer" not in headers:
            continue
        # Read the columns by header position rather than assuming that
        # "question" and "answer" are the first two columns.
        q_idx = headers.index("question")
        a_idx = headers.index("answer")
        for row in rows[1:]:
            cells = row.cells
            if max(q_idx, a_idx) >= len(cells):
                continue  # short row: nothing usable
            q_text = cells[q_idx].text.strip()
            a_text = cells[a_idx].text.strip()
            if q_text and a_text:
                add_pair(q_text, a_text)
        if faqs:
            return "\n".join(faqs)

    # --- Case 2: text-based FAQs ---
    capture, current_q = False, None
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        low = text.lower()

        # --- Start condition
        if not capture:
            if low.startswith(("q1", "how ")):
                # The trigger line is itself the first question: fall
                # through and parse it instead of discarding it.
                capture = True
            elif "faqs" in low:
                capture = True  # heading line, nothing to parse
                continue
            else:
                continue

        # --- Stop condition (don't eat the schemas that follow the FAQs)
        if "schema" in low or "json" in low or text.startswith("{"):
            break

        # Inline "Q... A..." on one line. The answer marker must not be the
        # tail of a word or abbreviation ("Asia.", "U.S.A.") and must be
        # followed by actual answer text.
        both_match = re.match(
            r"Q\d*[:.]\s*(.*?)\s*(?<![\w.])A\d*[:.]\s*(\S.*)", text, re.IGNORECASE
        )
        if both_match:
            add_pair(both_match.group(1).strip(), both_match.group(2).strip())
            continue

        # "Q:" line; the answer is expected in a following paragraph.
        q_match = re.match(r"Q\d*[:.]\s*(.*)", text, re.IGNORECASE)
        if q_match:
            # A bare "Q1:" carries no question; treat it as "none pending"
            # so that the next paragraph can still be read as the question.
            current_q = q_match.group(1).strip() or None
            continue

        # Natural-language question without a "Q:" prefix.
        if current_q is None and re.match(r"^(how|what|which|why|when|who)\b", low):
            current_q = text
            continue

        # Anything else answers the pending question, if there is one.
        if current_q:
            add_pair(current_q, text)
            current_q = None

    return "\n".join(faqs)
# -------------------------------------------------------------Meta Description--------------------------------
def extract_meta_description(docx_path):
    """Return the first non-empty paragraph after the "introduction" marker.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped paragraph text, or "" when no paragraph contains
        "introduction" or nothing non-empty follows it.
    """
    # One shared generator: the second scan resumes right after the marker.
    stripped = (para.text.strip() for para in Document(docx_path).paragraphs)

    for candidate in stripped:
        if "introduction" in candidate.lower():
            break
    else:
        return ""

    return next((candidate for candidate in stripped if candidate), "")
# --------------------------------------------------------SEO Title----------------------------------------

def extract_seo_title(docx_path):
    """Build the SEO title from the file name and the 2030 revenue forecast.

    The forecast is read from a table whose header row contains "report
    attribute" and "details": the "details" cell of the first row whose
    attribute mentions "revenue forecast in 2030", with "USD" replaced
    by "$".

    Args:
        docx_path: Path of the .docx file; its base name (without
            extension) is used as the report name.

    Returns:
        "<name> Market Size (<forecast>) 2030" when a forecast is found,
        with "Market" added only if the name does not already end with it;
        otherwise just the file name.
    """
    doc = Document(docx_path)
    file_name = os.path.splitext(os.path.basename(docx_path))[0]  # File name without extension

    revenue_forecast = ""

    # --- Check tables for Report Coverage ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # a table without rows has no header to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "report attribute" not in headers or "details" not in headers:
            continue

        attr_idx = headers.index("report attribute")
        details_idx = headers.index("details")

        for row in rows[1:]:
            attr = row.cells[attr_idx].text.strip().lower()
            details = row.cells[details_idx].text.strip()

            if "revenue forecast in 2030" in attr:
                # replace USD with $
                revenue_forecast = details.replace("USD", "$").strip()
                break

    if not revenue_forecast:
        return file_name  # fallback

    # Report files are normally named "<Topic> Market.docx"; appending
    # "Market" unconditionally would yield "... Market Market Size ...".
    if file_name.strip().lower().endswith("market"):
        subject = file_name
    else:
        subject = f"{file_name} Market"
    return f"{subject} Size ({revenue_forecast}) 2030"
# --------------------------------------------------------BreadCrumb Text----------------------------------------
def extract_breadcrumb_text(docx_path):
    """Build the breadcrumb label for a report.

    Args:
        docx_path: Path of the .docx file; its base name (without
            extension) is the report name.

    Returns:
        "<name> Report 2030" when a "report attribute"/"details" table
        provides a non-empty "revenue forecast in 2030" value, otherwise
        the bare file name.
    """
    document = Document(docx_path)
    base_name, _extension = os.path.splitext(os.path.basename(docx_path))

    forecast = ""

    # Look for the report-coverage table(s); the forecast value itself is
    # only used to decide which label to return.
    for tbl in document.tables:
        header = [cell.text.strip().lower() for cell in tbl.rows[0].cells]
        if "report attribute" not in header or "details" not in header:
            continue

        attribute_col = header.index("report attribute")
        details_col = header.index("details")

        for record in tbl.rows[1:]:
            label = record.cells[attribute_col].text.strip().lower()
            value = record.cells[details_col].text.strip()
            if "revenue forecast in 2030" in label:
                forecast = value.replace("USD", "$").strip()
                break

    return f"{base_name} Report 2030" if forecast else base_name
# ---------------------------------------------BreadCrumb Schema-----------------------------------------------------------------

def extract_breadcrumb_schema(docx_path):
    """Collect the text of the first JSON block in the document.

    Collection begins at the first non-empty paragraph starting with "{"
    and ends before the first paragraph containing "json copy" or
    "faq schema" (case-insensitive), or at the end of the document.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The collected paragraphs concatenated without separators, stripped;
        "" when no paragraph starts with "{".
    """
    stripped = (para.text.strip() for para in Document(docx_path).paragraphs)
    texts = [entry for entry in stripped if entry]

    # Index of the paragraph that opens the JSON block, if any.
    first = next((pos for pos, entry in enumerate(texts) if entry.startswith("{")), None)
    if first is None:
        return ""

    collected = []
    for entry in texts[first:]:
        lowered = entry.lower()
        if "json copy" in lowered or "faq schema" in lowered:
            break
        collected.append(entry)

    return "".join(collected).strip()

# -----------------------------------------------------FaqSchema-------------------------------
def extract_faq_schema(docx_path):
    """Return the second top-level JSON object found in the document.

    Paragraphs are scanned for JSON objects: an object opens on a paragraph
    starting with "{" and closes once its braces are balanced, so nested
    objects and one-paragraph objects are kept whole. By convention the
    first object is the breadcrumb schema and the second the FAQ schema.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of the second JSON object (paragraphs concatenated without
        separators), or "" when fewer than two complete objects exist.
    """
    doc = Document(docx_path)

    def brace_delta(line):
        """Return opening minus closing braces, ignoring quoted strings."""
        delta = 0
        in_string = False
        escaped = False
        for ch in line:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "{":
                delta += 1
            elif ch == "}":
                delta -= 1
        return delta

    json_blocks = []
    current_block = []
    depth = 0

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if depth == 0 and not text.startswith("{"):
            continue  # ordinary text between JSON objects

        current_block.append(text)
        depth += brace_delta(text)
        if depth <= 0:
            # Braces balanced: the top-level object is complete. This also
            # covers an object written entirely in one paragraph.
            json_blocks.append("".join(current_block).strip())
            current_block = []
            depth = 0

    # ✅ Usually Breadcrumb = first JSON, FAQ Schema = second JSON
    if len(json_blocks) >= 2:
        return json_blocks[1]
    return ""

# ------------------- Runner -------------------

doc_path = r"C:\Users\Vishnu\Documents\extracted_docs\extracted_docs\Aerial Imaging Market.docx"
output_path = r"C:\Users\Vishnu\Documents\extracted_docs\Day1.xlsx"

# Only the two schema extractors are exported in this run. The remaining
# extractors (title, description, TOC, methodology, SEO title, breadcrumb
# text) are not called here; the meta description is computed but not
# written to the sheet.
meta = extract_meta_description(doc_path)
sche2 = extract_faq_schema(doc_path)
sche1 = extract_breadcrumb_schema(doc_path)

# ✅ Save single file result in Excel (one row).
# NOTE(review): the "BreadcrumbText" column holds the breadcrumb *schema*
# here - confirm the label is intended.
single_row = {
    'BreadcrumbText': sche1,
    'schema2': sche2,
}
df = pd.DataFrame([single_row])
df.to_excel(output_path, index=False)

print(f"✅ Done! Extracted data saved in {output_path}")

✅ Done! Extracted data saved in C:\Users\Vishnu\Documents\extracted_docs\Day1.xlsx


For Multiple Files

In [1]:
from docx import Document
import html
import re
import pandas as pd
import os

# ------------------- Helpers -------------------
def is_list_item(paragraph):
    """Tell whether a paragraph carries direct numbering properties.

    Args:
        paragraph: A python-docx paragraph (anything exposing
            ``_element.pPr`` works).

    Returns:
        True when the paragraph has a ``pPr`` element whose ``numPr`` child
        is present, False otherwise.
    """
    properties = paragraph._element.pPr
    if properties is None:
        # No paragraph properties at all, so there cannot be numbering.
        return False
    return properties.numPr is not None

def convert_run_to_html(run):
    """Convert one run of text to HTML-escaped inline markup.

    Args:
        run: Object with ``text``, ``bold``, ``italic`` and ``underline``
            attributes (a python-docx run).

    Returns:
        "" for a run without text; otherwise the escaped text wrapped, from
        the inside out, in <strong>, <em> and <u> for each truthy flag.
    """
    markup = html.escape(run.text)
    if not markup:
        return ""
    # Order matters: bold is innermost, underline outermost.
    for enabled, tag in ((run.bold, "strong"), (run.italic, "em"), (run.underline, "u")):
        if enabled:
            markup = f"<{tag}>{markup}</{tag}>"
    return markup

def paragraph_to_html(paragraph):
    """Render one paragraph as an HTML fragment.

    Args:
        paragraph: A python-docx paragraph.

    Returns:
        "" when the runs contain no visible text, "<li>...</li>" for list
        items, "<hN>...</hN>" when the style name contains "heading", and
        "<p>...</p>" otherwise.
    """
    text = "".join(convert_run_to_html(run) for run in paragraph.runs)
    if not text.strip():
        return ""
    if is_list_item(paragraph):
        return f"<li>{text}</li>"
    style = getattr(paragraph.style, "name", "").lower()
    if "heading" in style:
        digits = "".join(filter(str.isdecimal, style))
        # HTML only defines <h1>..<h6>. Clamp so deeper heading styles (or
        # names carrying extra digits) never produce an invalid tag such as
        # <h7> or <h21>. A heading style without a number stays <h2>.
        level = min(max(int(digits), 1), 6) if digits else 2
        return f"<h{level}>{text}</h{level}>"
    return f"<p>{text}</p>"
# ------------------- Extractors -------------------
def extract_title(docx_path):
    """Return the report title stored in a .docx file.

    The title is the first non-empty paragraph that follows a marker
    paragraph containing "report title" or "full title" (case-insensitive).

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped title text, or "" when no marker is found or nothing
        non-empty follows it.
    """
    marker_seen = False

    # Single pass over the paragraphs. Both marker formats mean "take the
    # next non-empty paragraph", so one flag covers them and the paragraph
    # list is read only once instead of being re-fetched in an inner scan.
    for para in Document(docx_path).paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if marker_seen:
            return text
        low = text.lower()
        if "report title" in low or "full title" in low:
            marker_seen = True

    return ""
# ✅ Extract description (Heading 1 → Heading 7 only, i.e. from "Introduction and Strategic Context" up to "Report Summary, FAQs, and SEO Schema")

def extract_description(docx_path):
    """Build the HTML description from the body of a report.

    Capture starts at the paragraph containing "introduction and strategic
    context" (that paragraph is included) and stops before the paragraph
    containing "report summary, faqs, and seo schema". Consecutive list
    items are wrapped in a single <ul>.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The HTML fragments joined with newlines ("" if nothing captured).
    """
    start_marker = "introduction and strategic context"
    stop_marker = "report summary, faqs, and seo schema"

    pieces = []
    list_open = False
    started = False

    for para in Document(docx_path).paragraphs:
        lowered = para.text.strip().lower()
        if not lowered:
            continue

        if not started:
            if start_marker not in lowered:
                continue
            started = True

        if stop_marker in lowered:
            break

        fragment = paragraph_to_html(para)
        if not fragment:
            continue

        # Open or close the <ul> wrapper whenever we cross a list boundary.
        is_item = fragment.startswith("<li>")
        if is_item and not list_open:
            pieces.append("<ul>")
        elif list_open and not is_item:
            pieces.append("</ul>")
        list_open = is_item
        pieces.append(fragment)

    if list_open:
        pieces.append("</ul>")

    return "\n".join(pieces)
# ✅ TOC (after Heading 9 / Table of Contents)
def extract_toc(docx_path):
    """Extract the table of contents of a report as HTML.

    Capture starts after the paragraph containing "table of contents". The
    "List of Figures" paragraph and the entries below it are included;
    capture then stops at the next heading-styled or numbered ("1. " /
    "1) ") paragraph. Consecutive list items are wrapped in a single <ul>.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The concatenated HTML (no separators), stripped; "" if no TOC found.
    """
    doc = Document(docx_path)
    html_output = []
    inside_list = False
    capture = False
    end_reached = False

    def emit(fragment):
        """Append a fragment, opening/closing the <ul> wrapper as needed."""
        nonlocal inside_list
        is_item = fragment.startswith("<li>")
        if is_item and not inside_list:
            html_output.append("<ul>")
        elif inside_list and not is_item:
            html_output.append("</ul>")
        inside_list = is_item
        html_output.append(fragment)

    for para in doc.paragraphs:
        text = para.text.strip()
        low = text.lower()

        if not capture:
            # The "Table of Contents" paragraph itself is not emitted.
            if "table of contents" in low:
                capture = True
            continue

        if "list of figures" in low:
            # Keep this paragraph and whatever entries follow it; the real
            # end is detected on a later paragraph.
            end_reached = True
        elif end_reached:
            # Any further heading / numbered sub-heading ends the TOC.
            style = getattr(para.style, "name", "").lower()
            if "heading" in style or re.match(r"^\d+[\.\)]\s", text):
                break

        html_part = paragraph_to_html(para)
        if html_part:
            # Routing "List of Figures" through emit() closes a still-open
            # <ul> first, so the heading is never nested inside the list.
            emit(html_part)

    if inside_list:
        html_output.append("</ul>")
    return "".join(html_output).strip()


# ✅ Methodology (FAQ Table or Paragraphs Qn:/A: format)
def extract_methodology(docx_path):
    """Extract FAQ question/answer pairs from a .docx file as HTML.

    NOTE(review): a second ``extract_methodology`` is defined immediately
    after this one in the same cell; that later definition replaces this
    one at run time, so this variant is never the one that gets called.

    A table with "question" and "answer" header cells is tried first; if it
    yields nothing, body paragraphs between a start marker ("top 5 faqs" or
    a line starting with "how big") and an end marker ("breadcrumb",
    "json-ld", "schema" or "json copy") are parsed.

    Returns the "<p><strong>Qn: ...</strong><br>An: ...</p>" blocks joined
    with newlines, or "" when no pair is found.
    """
    from docx import Document
    import re, html

    doc = Document(docx_path)
    faqs, q_count = [], 0

    # --- Case 1: FAQ Table ---
    for table in doc.tables:
        headers = [cell.text.strip().lower() for cell in table.rows[0].cells]
        if "question" in headers and "answer" in headers:
            # The first two columns are read regardless of where the
            # "question"/"answer" headers actually sit.
            for row in table.rows[1:]:
                q_text = row.cells[0].text.strip()
                a_text = row.cells[1].text.strip()
                if q_text and a_text:
                    q_count += 1
                    faqs.append(
                        f"<p><strong>Q{q_count}: {html.escape(q_text)}</strong><br>"
                        f"A{q_count}: {html.escape(a_text)}</p>"
                    )
            if faqs:
                return "\n".join(faqs)  # ✅ return if table found

    # --- Case 2 & 3: Text-based FAQs ---
    capture = False
    current_q = None  # question waiting for its answer paragraph

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        low = text.lower()

        # ✅ Start condition
        # The paragraph that triggers capture is skipped, including when it
        # is a "how big ..." question line.
        if not capture and ("top 5 faqs" in low or low.startswith("how big")):
            capture = True
            continue

        # ✅ End condition
        if capture and ("breadcrumb" in low or "json-ld" in low or "schema" in low or "json copy" in low):
            break

        if capture:
            # --- Case 1: Question: ... Answer: ...
            # (both the ASCII and the full-width colon are accepted)
            qa_match = re.match(r"question[:：]\s*(.*)", text, re.IGNORECASE)
            if qa_match:
                current_q = qa_match.group(1).strip()
                continue
            ans_match = re.match(r"answer[:：]\s*(.*)", text, re.IGNORECASE)
            if ans_match and current_q:
                q_count += 1
                faqs.append(
                    f"<p><strong>Q{q_count}: {html.escape(current_q)}</strong><br>"
                    f"A{q_count}: {html.escape(ans_match.group(1).strip())}</p>"
                )
                current_q = None
                continue

            # --- Case 2: Q1: ... (ignore numbering) ---
            # NOTE(review): an inline "Q1: How ... A1: ..." line also
            # matches here when its question starts with a question word;
            # it is then stored as a question with the answer text still
            # attached and never reaches the inline branch (Case 4) below.
            q_match = re.match(r"q?\d*[:.]\s*(.*)", text, re.IGNORECASE)
            if q_match and re.match(r"^(how|what|which|why|when|who)\b", q_match.group(1).lower()):
                current_q = q_match.group(1).strip()
                continue

            # --- Case 3: 1. How big ... ---
            q_num_match = re.match(r"\d+\.\s*(.*)", text)
            if q_num_match and re.match(r"^(how|what|which|why|when|who)\b", q_num_match.group(1).lower()):
                current_q = q_num_match.group(1).strip()
                continue

            # --- Case 4: Inline Q & A (Q... A...) ---
            both_match = re.match(r"Q\d*[:.]\s*(.*?)\s*A\d*[:.]\s*(.*)", text, re.IGNORECASE)
            if both_match:
                q_count += 1
                faqs.append(
                    f"<p><strong>Q{q_count}: {html.escape(both_match.group(1).strip())}</strong><br>"
                    f"A{q_count}: {html.escape(both_match.group(2).strip())}</p>"
                )
                continue

            # --- Answer line (if current_q is set) ---
            if current_q and text:
                q_count += 1
                faqs.append(
                    f"<p><strong>Q{q_count}: {html.escape(current_q)}</strong><br>"
                    f"A{q_count}: {html.escape(text)}</p>"
                )
                current_q = None

    return "\n".join(faqs)
def extract_methodology(docx_path):
    """Extract the report's FAQ section as HTML "Qn / An" paragraphs.

    Two layouts are supported, tried in this order:

    1. A table whose header row contains "question" and "answer" columns.
    2. Body paragraphs following a paragraph that mentions "faqs" (or
       starting at the first "Q1..." / "How ..." line), written as
       "Q: ... A: ..." on one line, as a "Q:" line followed by an answer
       paragraph, or as a plain question (how/what/which/why/when/who)
       followed by an answer paragraph. Capture stops at the first
       paragraph mentioning "schema" or "json", or starting with "{".

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        Newline-joined "<p><strong>Qn: ...</strong><br>An: ...</p>" blocks,
        or "" when no question/answer pair is found.
    """
    from docx import Document
    import html
    import re

    doc = Document(docx_path)
    faqs, q_count = [], 0

    def add_pair(question, answer):
        """Append one numbered, HTML-escaped question/answer block."""
        nonlocal q_count
        q_count += 1
        faqs.append(
            f"<p><strong>Q{q_count}: {html.escape(question)}</strong><br>"
            f"A{q_count}: {html.escape(answer)}</p>"
        )

    # --- Case 1: FAQ table ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # a table without rows has no header to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "question" not in headers or "answer" not in headers:
            continue
        # Read the columns by header position rather than assuming that
        # "question" and "answer" are the first two columns.
        q_idx = headers.index("question")
        a_idx = headers.index("answer")
        for row in rows[1:]:
            cells = row.cells
            if max(q_idx, a_idx) >= len(cells):
                continue  # short row: nothing usable
            q_text = cells[q_idx].text.strip()
            a_text = cells[a_idx].text.strip()
            if q_text and a_text:
                add_pair(q_text, a_text)
        if faqs:
            return "\n".join(faqs)

    # --- Case 2: text-based FAQs ---
    capture, current_q = False, None
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        low = text.lower()

        # --- Start condition
        if not capture:
            if low.startswith(("q1", "how ")):
                # The trigger line is itself the first question: fall
                # through and parse it instead of discarding it.
                capture = True
            elif "faqs" in low:
                capture = True  # heading line, nothing to parse
                continue
            else:
                continue

        # --- Stop condition (don't eat the schemas that follow the FAQs)
        if "schema" in low or "json" in low or text.startswith("{"):
            break

        # Inline "Q... A..." on one line. The answer marker must not be the
        # tail of a word or abbreviation ("Asia.", "U.S.A.") and must be
        # followed by actual answer text.
        both_match = re.match(
            r"Q\d*[:.]\s*(.*?)\s*(?<![\w.])A\d*[:.]\s*(\S.*)", text, re.IGNORECASE
        )
        if both_match:
            add_pair(both_match.group(1).strip(), both_match.group(2).strip())
            continue

        # "Q:" line; the answer is expected in a following paragraph.
        q_match = re.match(r"Q\d*[:.]\s*(.*)", text, re.IGNORECASE)
        if q_match:
            # A bare "Q1:" carries no question; treat it as "none pending"
            # so that the next paragraph can still be read as the question.
            current_q = q_match.group(1).strip() or None
            continue

        # Natural-language question without a "Q:" prefix.
        if current_q is None and re.match(r"^(how|what|which|why|when|who)\b", low):
            current_q = text
            continue

        # Anything else answers the pending question, if there is one.
        if current_q:
            add_pair(current_q, text)
            current_q = None

    return "\n".join(faqs)
# -----------------------------------------------------------Meta Description--------------------------------------------
def extract_meta_description(docx_path):
    """Return the first non-empty paragraph after the "introduction" marker.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped paragraph text, or "" when no paragraph contains
        "introduction" or nothing non-empty follows it.
    """
    # One shared generator: the second scan resumes right after the marker.
    stripped = (para.text.strip() for para in Document(docx_path).paragraphs)

    for candidate in stripped:
        if "introduction" in candidate.lower():
            break
    else:
        return ""

    return next((candidate for candidate in stripped if candidate), "")

# -----------------------------------------------------------------------------SEO title-------------------------------------------------------------
def extract_seo_title(docx_path):
    """Build the SEO title from the file name and the 2030 revenue forecast.

    The forecast is read from a table whose header row contains "report
    attribute" and "details": the "details" cell of the first row whose
    attribute mentions "revenue forecast in 2030", with "USD" replaced
    by "$".

    Args:
        docx_path: Path of the .docx file; its base name (without
            extension) is used as the report name.

    Returns:
        "<name> Size (<forecast>) 2030" when a forecast is found, with
        "Market" inserted before "Size" only if the name does not already
        end with it; otherwise just the file name.
    """
    doc = Document(docx_path)
    file_name = os.path.splitext(os.path.basename(docx_path))[0]  # File name without extension

    revenue_forecast = ""

    # --- Check tables for Report Coverage ---
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            continue  # a table without rows has no header to inspect
        headers = [cell.text.strip().lower() for cell in rows[0].cells]
        if "report attribute" not in headers or "details" not in headers:
            continue

        attr_idx = headers.index("report attribute")
        details_idx = headers.index("details")

        for row in rows[1:]:
            attr = row.cells[attr_idx].text.strip().lower()
            details = row.cells[details_idx].text.strip()

            if "revenue forecast in 2030" in attr:
                # replace USD with $
                revenue_forecast = details.replace("USD", "$").strip()
                break

    if not revenue_forecast:
        return file_name  # fallback

    # Most files are named "<Topic> Market.docx", so the name already
    # supplies "Market"; add it only for the few names that lack it, so
    # every title reads "... Market Size (...) 2030".
    if file_name.strip().lower().endswith("market"):
        subject = file_name
    else:
        subject = f"{file_name} Market"
    return f"{subject} Size ({revenue_forecast}) 2030"
# --------------------------------------------------------BreadCrumb Text----------------------------------------
def extract_breadcrumb_text(docx_path):
    """Build the breadcrumb label for a report.

    Args:
        docx_path: Path of the .docx file; its base name (without
            extension) is the report name.

    Returns:
        "<name> Report 2030" when a "report attribute"/"details" table
        provides a non-empty "revenue forecast in 2030" value, otherwise
        the bare file name.
    """
    document = Document(docx_path)
    base_name, _extension = os.path.splitext(os.path.basename(docx_path))

    forecast = ""

    # Look for the report-coverage table(s); the forecast value itself is
    # only used to decide which label to return.
    for tbl in document.tables:
        header = [cell.text.strip().lower() for cell in tbl.rows[0].cells]
        if "report attribute" not in header or "details" not in header:
            continue

        attribute_col = header.index("report attribute")
        details_col = header.index("details")

        for record in tbl.rows[1:]:
            label = record.cells[attribute_col].text.strip().lower()
            value = record.cells[details_col].text.strip()
            if "revenue forecast in 2030" in label:
                forecast = value.replace("USD", "$").strip()
                break

    return f"{base_name} Report 2030" if forecast else base_name

# ---------------------------------------------SkuCode-Extraction------------------------------
def extract_sku_code(docx_path):
    """Derive the SKU code from a document path.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        The file's base name without its extension, lower-cased.
    """
    stem, _extension = os.path.splitext(os.path.basename(docx_path))
    return stem.lower()
# ---------------------------------------------URLRP------------------------------
def extract_sku_url(docx_path):
    """Derive the URL key from a document path.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        The file's base name without its extension, lower-cased (the same
        value that ``extract_sku_code`` produces).
    """
    _directory, leaf = os.path.split(docx_path)
    return os.path.splitext(leaf)[0].lower()

# ---------------------------------------------BreadCrumb Schema----------------------------
def extract_breadcrumb_schema(docx_path):
    """Collect the text of the first JSON block in the document.

    Collection begins at the first non-empty paragraph starting with "{"
    and ends before the first paragraph containing "json copy" or
    "faq schema" (case-insensitive), or at the end of the document.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The collected paragraphs concatenated without separators, stripped;
        "" when no paragraph starts with "{".
    """
    stripped = (para.text.strip() for para in Document(docx_path).paragraphs)
    texts = [entry for entry in stripped if entry]

    # Index of the paragraph that opens the JSON block, if any.
    first = next((pos for pos, entry in enumerate(texts) if entry.startswith("{")), None)
    if first is None:
        return ""

    collected = []
    for entry in texts[first:]:
        lowered = entry.lower()
        if "json copy" in lowered or "faq schema" in lowered:
            break
        collected.append(entry)

    return "".join(collected).strip()

# -----------------------------------------------------FaqSchema-------------------------------
def extract_faq_schema(docx_path):
    """Return the second top-level JSON object found in the document.

    Paragraphs are scanned for JSON objects: an object opens on a paragraph
    starting with "{" and closes once its braces are balanced, so nested
    objects and one-paragraph objects are kept whole. By convention the
    first object is the breadcrumb schema and the second the FAQ schema.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of the second JSON object (paragraphs concatenated without
        separators), or "" when fewer than two complete objects exist.
    """
    doc = Document(docx_path)

    def brace_delta(line):
        """Return opening minus closing braces, ignoring quoted strings."""
        delta = 0
        in_string = False
        escaped = False
        for ch in line:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "{":
                delta += 1
            elif ch == "}":
                delta -= 1
        return delta

    json_blocks = []
    current_block = []
    depth = 0

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if depth == 0 and not text.startswith("{"):
            continue  # ordinary text between JSON objects

        current_block.append(text)
        depth += brace_delta(text)
        if depth <= 0:
            # Braces balanced: the top-level object is complete. This also
            # covers an object written entirely in one paragraph.
            json_blocks.append("".join(current_block).strip())
            current_block = []
            depth = 0

    # ✅ Usually Breadcrumb = first JSON, FAQ Schema = second JSON
    if len(json_blocks) >= 2:
        return json_blocks[1]
    return ""

# --------------------------------------------------------Runner------------------------------------------------------------

folder_path = r"C:\Users\Vishnu\Desktop\oldcontent\20-21-22-23-24-25 June Files\20-21-22-23-24-25 June Files"
output_path = r"C:\Users\Vishnu\Documents\extracted_docs\Extraction.xlsx"

all_data = []

# os.listdir() returns entries in arbitrary order; sort them so the rows of
# the spreadsheet come out in the same order on every run.
for file in sorted(os.listdir(folder_path)):
    # Skip anything that is not a Word document, plus Word's "~$" lock
    # files. The extension is compared case-insensitively so that a
    # ".DOCX" file is not silently left out.
    if not file.lower().endswith(".docx") or file.startswith("~$"):
        continue

    doc_path = os.path.join(folder_path, file)
    print(f"Processing: {file}")
    title = extract_title(doc_path)
    description_html = extract_description(doc_path)
    toc_html = extract_toc(doc_path)
    methodology_html = extract_methodology(doc_path)
    meta = extract_meta_description(doc_path)
    seo_title = extract_seo_title(doc_path)
    breadcrumb_text = extract_breadcrumb_text(doc_path)
    sku = extract_sku_code(doc_path)
    url = extract_sku_url(doc_path)
    schema1 = extract_breadcrumb_schema(doc_path)
    schema2 = extract_faq_schema(doc_path)

    # One spreadsheet row per document; the keys are the column headers.
    all_data.append({
        "File": file,
        "Title": title,
        "Description": description_html,
        "TOC": toc_html,
        "Methodology": methodology_html,
        "Meta Description": meta,
        "SeoTitle": seo_title,
        "BreatdCrumb_Text": breadcrumb_text,
        "SkuCode": sku,
        'urlrp': url,
        "Schema1": schema1,
        'schema2': schema2,
    })

df = pd.DataFrame(all_data)
df.to_excel(output_path, index=False)

print(f"✅ Done! Extracted data saved in {output_path}")


Processing: 3D Cardiac Mapping System Market.docx
Processing: 3D Laparoscopy Imaging Market.docx
Processing: 3D Medical Imaging Devices Market.docx
Processing: 3D Printed Brain Model Market.docx
Processing: 3D Ultrasound Market.docx
Processing: Achondrogenesis Market.docx
Processing: Active Phased Array Radar Market.docx
Processing: Adult Spinal Deformity Market.docx
Processing: Advanced Cardiovascular Life Support (ACLS) Market.docx
Processing: Advanced Wound Dressings Market.docx
Processing: Aeroderivative Sensor Market.docx
Processing: Aerospace Accumulator Market.docx
Processing: Aerospace And Defense C-Class Parts Market.docx
Processing: Aerospace Cold Forgings Market.docx
Processing: Aerospace Engine Vibration Monitoring System Market.docx
Processing: Aerospace Filter Market.docx
Processing: Aerospace Forging Market.docx
Processing: Aerospace Fuel Tank Inerting System.docx
Processing: Aerospace Insulation Market.docx
Processing: Aerospace Pressure Bulkhead Market.docx
Processing: