In [2]:
import pdfplumber
from pathlib import Path
from docx import Document

In [3]:
def extract_text(file_path):
    """
    Extract text from a PDF, TXT, or DOCX file.

    Args:
        file_path: Location of the file, given as a ``str`` or a
            ``pathlib.Path``. The format is picked from the end of the
            file name, compared case-insensitively.

    Returns:
        A list of text chunks, or ``None`` on failure:
          * PDF  - one string per page that yielded text; ``None`` when no
            page yielded any text.
          * TXT  - a one-element list holding the whole file (read as UTF-8).
          * DOCX - a one-element list holding every paragraph joined by
            newlines.

    Errors (missing file, unsupported extension, parser or decoding
    failures) are not propagated: the message is printed and ``None`` is
    returned.
    """
    try:
        if not Path(file_path).exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Work from a plain string so both str and Path arguments are
        # accepted; calling .lower() on a Path raises AttributeError, which
        # the handler below would turn into a silent None.
        path_str = str(file_path)
        lowered = path_str.lower()

        # PDF handling
        if lowered.endswith('.pdf'):
            text = []
            with pdfplumber.open(path_str) as pdf:
                for page in pdf.pages:
                    content = page.extract_text(x_tolerance=3)  # PPT-friendly
                    # Pages without extractable text are skipped.
                    if content:
                        text.append(content)
            return text if text else None

        # TXT handling
        elif lowered.endswith('.txt'):
            with open(path_str, 'r', encoding='utf-8') as f:
                return [f.read()]

        # DOCX handling
        elif lowered.endswith('.docx'):
            doc = Document(path_str)
            return ["\n".join(para.text for para in doc.paragraphs)]

        else:
            raise ValueError("Unsupported file type. Use PDF/TXT/DOCX")

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # None instead of raising into the caller.
        print(f"❌ Error: {str(e)}")
        return None

In [4]:
# Replace with your actual file path
test_file = "/home/dibi/Documents/pdf-chatbot-metada/ppt1.pdf"  

# Which extracted section to preview (0-based) and how many characters of it.
SAMPLE_INDEX = 4
SAMPLE_CHARS = 1000

# Execute extraction
result = extract_text(test_file)

# Display results
if result:
    print(f"✅ Extracted {len(result)} sections")
    print("\nSample content:")
    # Clamp the index to the last available section: TXT/DOCX inputs yield a
    # single section and a short PDF may have fewer than SAMPLE_INDEX + 1
    # pages, in which case indexing with SAMPLE_INDEX directly raises IndexError.
    sample = result[min(SAMPLE_INDEX, len(result) - 1)]
    print(sample[:SAMPLE_CHARS] + "...")
else:
    print("❌ Extraction failed")

✅ Extracted 42 sections

Sample content:
Connectivity Verification
ICMPv4 Messages (Contd.)
Destination or Service Unreachable
• When a host or gateway receives a packet that it cannot deliver, it can use an ICMP
Destination Unreachable message to notify the source that the destination or service is
unreachable.
• The message will include a code that indicates why the packet could not be delivered. The
Destination Unreachable codes for ICMPv4 includes the following:
• 0 - Net unreachable
• 1 - Host unreachable
• 2 - Protocol unreachable
• 3 - Port unreachable
Note: ICMPv6 has slightly different codes for Destination Unreachable messages....
