### Input Processing Module 

In [1]:
import os
import fitz  # PyMuPDF for PDF processing

def detect_file_type(file_path):
    """
    Map a file path to a document-type label using its extension.

    The extension is compared case-insensitively. Returns 'PDF' for
    ``.pdf`` and 'DOCX' for ``.docx``; any other extension (including a
    missing one) raises ValueError.
    """
    known_types = {'.pdf': 'PDF', '.docx': 'DOCX'}
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in known_types:
        raise ValueError("Unsupported file type. Please upload a PDF or DOCX file.")
    return known_types[suffix]

def extract_pdf_content(file_path):
    """
    Extract the plain text of every page of a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        RuntimeError: If the file cannot be opened or read. The original
            exception is attached as ``__cause__``.
    """
    try:
        # The context manager closes the document even when a page fails
        # to extract, so the file handle is never leaked.
        with fitz.open(file_path) as pdf_document:
            # join() avoids repeated string reallocation on long documents.
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        raise RuntimeError(f"Error processing PDF file: {e}") from e

def process_input_file(file_path):
    """
    Return the raw text content of the document at ``file_path``.

    The type is determined by detect_file_type. PDF files are read with
    extract_pdf_content; DOCX files raise RuntimeError because no DOCX
    reader exists yet; any other label raises ValueError.
    """
    kind = detect_file_type(file_path)
    if kind == 'DOCX':
        raise RuntimeError("DOCX processing not implemented yet.")
    if kind != 'PDF':
        raise ValueError("Unsupported file type.")
    return extract_pdf_content(file_path)

### Text Processing Module

In [3]:
import re
import spacy
from langdetect import detect  # For language detection

# Load SpaCy language model once at module level; tokenize_text below
# runs its input through this shared `nlp` pipeline.
nlp = spacy.load("en_core_web_sm")

def clean_text(raw_text):
    """
    Normalise extracted text by applying a fixed sequence of regex
    substitutions and then trimming the result.

    The substitutions, in order: collapse newline runs separated only by
    whitespace into a single newline, delete "Page N" / "Page N of M"
    markers (case-insensitive), and delete lines made up solely of
    digits and non-word characters.
    """
    substitutions = (
        (r"\n\s*\n", "\n", 0),                                   # blank-line runs
        (r"Page\s\d+(\s(of)\s\d+)?", "", re.IGNORECASE),         # page markers
        (r"^\s*[\d\W]+\s*$", "", re.MULTILINE),                  # digit/punctuation-only lines
    )
    text = raw_text
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()

def tokenize_text(cleaned_text):
    """
    Split text into sentences and word tokens with the module-level
    SpaCy pipeline.

    Returns a ``(sentences, words)`` tuple: sentences are the stripped
    text of each sentence span, words are the text of every token that
    is not pure whitespace.
    """
    parsed = nlp(cleaned_text)

    sentences = []
    for span in parsed.sents:
        sentences.append(span.text.strip())

    words = []
    for token in parsed:
        if token.is_space:
            continue
        words.append(token.text)

    return sentences, words

def segment_text(cleaned_text):
    """
    Break text into sections at heading lines.

    A heading is a line that starts with an uppercase letter, ends with
    a colon, and is both preceded and followed by a newline. The heading
    lines themselves are dropped; empty pieces are discarded and the
    remaining ones are returned stripped.
    """
    heading_line = r"\n[A-Z][^\n]+:\n"
    trimmed_pieces = (piece.strip() for piece in re.split(heading_line, cleaned_text))
    return [piece for piece in trimmed_pieces if piece]

def detect_language(cleaned_text):
    """
    Return the language reported by langdetect's ``detect`` for the text.

    Detection is best-effort: if ``detect`` raises for any reason the
    string "Unknown" is returned instead of propagating the error.
    """
    try:
        language_code = detect(cleaned_text)
    except Exception:
        # Deliberate fallback so a detection failure never aborts the pipeline.
        language_code = "Unknown"
    return language_code

def preprocess_text(raw_text, detect_lang=True):
    """
    Run the full preprocessing pipeline on raw extracted text.

    Steps: clean_text, tokenize_text, segment_text and, when
    ``detect_lang`` is true, detect_language. A progress line is printed
    before each step.

    Returns a dict with the keys "cleaned_text", "sentences", "words",
    "sections" and "language"; "language" is "Not Detected" when
    detection is skipped.
    """
    print("Cleaning text...")
    text = clean_text(raw_text)
    result = {"cleaned_text": text}

    print("Tokenizing text...")
    result["sentences"], result["words"] = tokenize_text(text)

    print("Segmenting text...")
    result["sections"] = segment_text(text)

    if detect_lang:
        print("Detecting language...")
        result["language"] = detect_language(text)
    else:
        result["language"] = "Not Detected"

    return result

### Document Structure Recognition Module

In [None]:
import re
import spacy

# Load SpaCy language model for NER (extract_metadata reads entities from it).
# NOTE(review): the text-processing cell already binds `nlp` to the same
# "en_core_web_sm" model; this statement loads it again and rebinds the name.
nlp = spacy.load("en_core_web_sm")

def extract_title(cleaned_text):
    """
    Guess the document title from the opening lines of the text.

    Collects the first three lines that are non-blank and not written
    entirely in uppercase, strips them, and joins them with single
    spaces. Fewer lines are returned when the text runs out first.
    """
    max_title_lines = 3
    collected = []
    for raw_line in cleaned_text.split("\n"):
        candidate = raw_line.strip()
        # Blank lines and all-caps lines (running headers and the like) are skipped.
        if not candidate or raw_line.isupper():
            continue
        collected.append(candidate)
        if len(collected) >= max_title_lines:
            break
    return " ".join(collected)

def extract_abstract(cleaned_text):
    """
    Extract the abstract, starting at the word "Abstract".

    The keyword is matched case-insensitively as a whole word. The match
    runs up to (but not including) the next newline-delimited line that
    begins with an uppercase ASCII letter, or to the end of the text if
    no such line follows.

    Case-insensitivity is scoped to the keyword only. Applying
    re.IGNORECASE to the whole pattern would make ``[A-Z]`` match
    lowercase letters too, so any following line starting with a letter
    would end the abstract. A wrapped line that starts with a capital
    letter still ends the match.

    Returns:
        The stripped matched text, or "Abstract not found" when the
        keyword does not occur.
    """
    match = re.search(
        r"\b(?i:Abstract)\b.*?(?=\n[A-Z][^\n]+\n|$)",
        cleaned_text,
        re.DOTALL,
    )
    return match.group(0).strip() if match else "Abstract not found"

def extract_metadata(cleaned_text):
    """
    Collect author and affiliation candidates via SpaCy named entities.

    Entities labelled "PERSON" become authors (only the first five are
    kept); entities labelled "ORG" become affiliations. Returns a dict
    with the keys "authors" and "affiliations".
    """
    people = []
    organisations = []
    for entity in nlp(cleaned_text).ents:
        if entity.label_ == "PERSON":
            people.append(entity.text)
        elif entity.label_ == "ORG":
            organisations.append(entity.text)
    return {
        "authors": people[:5],
        "affiliations": organisations,
    }

def segment_hierarchy(cleaned_text):
    """
    Split the document into sections keyed by common section headings.

    Footer markers ("Page N", "Page N of M", "©", "All rights reserved",
    "doi:") are deleted first. The remaining text is then cut at every
    whole-word, case-insensitive occurrence of a known heading word
    (Abstract, Introduction, Methodology, Methods, Results, Discussion,
    Conclusion, References).

    Returns:
        A dict mapping each heading, exactly as it appears in the text,
        to the stripped text between it and the next heading. Text
        before the first heading is not included. When the same heading
        text occurs more than once, the pieces are joined with a newline
        so that no text is lost.
    """
    # Strip footer markers (page numbers, copyright notices, DOI prefixes).
    footer_pattern = r"Page\s\d+(\s(of)\s\d+)?|©|All rights reserved|doi:"
    main_content = re.sub(footer_pattern, "", cleaned_text, flags=re.IGNORECASE)

    # Common headings of a research paper; a deliberately general first pass.
    section_patterns = [
        r"\bAbstract\b",
        r"\bIntroduction\b",
        r"\bMethodology\b",
        r"\bMethods\b",
        r"\bResults\b",
        r"\bDiscussion\b",
        r"\bConclusion\b",
        r"\bReferences\b"
    ]
    # Compile once and scan once, so headings and bodies come from the same pass.
    heading_regex = re.compile("|".join(section_patterns), flags=re.IGNORECASE)
    matches = list(heading_regex.finditer(main_content))

    hierarchy = {}
    for index, match in enumerate(matches):
        # A section body runs from the end of its heading to the next heading.
        if index + 1 < len(matches):
            body_end = matches[index + 1].start()
        else:
            body_end = len(main_content)
        content = main_content[match.end():body_end].strip()

        heading = match.group(0)
        if heading in hierarchy:
            # Repeated heading: append instead of overwriting the earlier text.
            content = f"{hierarchy[heading]}\n{content}".strip()
        hierarchy[heading] = content

    return hierarchy

def recognize_structure(cleaned_text):
    """
    Build a structural summary of the document.

    Runs extract_title, extract_abstract, extract_metadata and
    segment_hierarchy on the same text, printing a progress line before
    each, and returns their results under the keys "title", "abstract",
    "metadata" and "hierarchy".
    """
    steps = (
        ("title", "Extracting title...", extract_title),
        ("abstract", "Extracting abstract...", extract_abstract),
        ("metadata", "Extracting metadata...", extract_metadata),
        ("hierarchy", "Segmenting document into sections...", segment_hierarchy),
    )
    structure = {}
    for key, progress_message, extractor in steps:
        print(progress_message)
        structure[key] = extractor(cleaned_text)
    return structure

### Main Script: Integration of the Modular Functions

In [5]:
# if __name__ == "__main__":
#     # File path to the PDF document
#     file_path = r"D:\Voice Assistants\ai_assistant\modules\research_papers\precision meds for hypertension.pdf"
    
#     try:
#         # Step 1: Use Input Processing Module to extract raw text
#         from input_processing_module import process_input_file
#         raw_text = process_input_file(file_path)
        
#         # Step 2: Use Text Preprocessing Module to clean, tokenize, and segment text
#         from text_processing_module import preprocess_text
#         processed_data = preprocess_text(raw_text, detect_lang=True)
        
#         # Step 3: Use Document Structure Recognition Module to identify sections
#         from document_structure_module import recognize_structure
#         document_structure = recognize_structure(processed_data["cleaned_text"])
        
#         # Output Results
#         print("\nTitle:")
#         print(document_structure["title"])
        
#         print("\nAbstract:")
#         print(document_structure["abstract"])
        
#         print("\nMetadata:")
#         print("Authors:", document_structure["metadata"]["authors"])
#         print("Affiliations:", document_structure["metadata"]["affiliations"])
        
#         print("\nSections:")
#         for heading, content in document_structure["hierarchy"].items():
#             print(f"\n{heading}:\n{content[:500]}")  # Preview first 500 characters of each section
#     except Exception as e:
#         print(f"Error: {e}")


In [None]:
if __name__ == "__main__":
    # Location of the PDF document to analyse; replace the placeholder before running.
    file_path = r"provide path here"

    try:
        # Stage 1 (input processing): pull the raw text out of the file.
        raw_text = process_input_file(file_path)

        # Stage 2 (text preprocessing): clean, tokenize, segment, detect language.
        processed_data = preprocess_text(raw_text, detect_lang=True)

        # Stage 3 (structure recognition): title, abstract, metadata, sections.
        document_structure = recognize_structure(processed_data["cleaned_text"])

        # Report the results, one labelled block per item.
        print("\nTitle:", document_structure["title"], sep="\n")
        print("\nAbstract:", document_structure["abstract"], sep="\n")

        print("\nMetadata:")
        for label, key in (("Authors:", "authors"), ("Affiliations:", "affiliations")):
            print(label, document_structure["metadata"][key])

        print("\nSections:")
        for heading, content in document_structure["hierarchy"].items():
            # The full section text is printed, not a truncated preview.
            print(f"\n{heading}:\n{content}")
    except Exception as e:
        # Top-level boundary: report any failure instead of showing a traceback.
        print(f"Error: {e}")