In [None]:
# Jupyter only: the line below installs the required libraries. Run it once, then comment it out (it is not valid in a plain .py script):
!pip install pypdf docx2pdf pywin32 Pillow

import os
import sys
import logging
from PIL import Image

# Suppress pypdf warnings (wrong pointing object, etc.)
logging.getLogger("pypdf").setLevel(logging.ERROR)

from pypdf import PdfWriter
from docx2pdf import convert

# Import pythoncom up front: merge_files_by_prefix calls pythoncom.CoInitialize()
# before each DOCX conversion (COM initialization, required for Word in Jupyter).
# pythoncom is provided by the 'pywin32' package, per the message printed below.
try:
    import pythoncom
except ImportError:
    # Tell the user exactly which package to install before giving up.
    print("CRITICAL ERROR: 'pywin32' is not installed.")
    print("Please run: !pip install pywin32")
    # Stop execution if the library is missing; exit status 1 signals failure.
    sys.exit(1) 

def get_file_prefix(filename):
    """
    Return the grouping key for a filename: everything before its first underscore.

    The extension is stripped first. A name that contains no underscore is
    returned whole (minus its extension).
    Example: 'ahmedestiak_786_86_.pdf' -> 'ahmedestiak'
    """
    stem, _extension = os.path.splitext(filename)
    # partition() yields the full stem as its first element when no '_' is present
    head, _separator, _rest = stem.partition('_')
    return head

def _append_file_as_pdf(merger, source_folder, filename, temp_pdf_path):
    """Append one source file to *merger*, converting it to PDF first if needed.

    Args:
        merger: The PdfWriter collecting the pages of the current group.
        source_folder: Absolute path of the folder that contains *filename*.
        filename: Name of the file to append (.pdf, .docx, .jpg or .jpeg).
        temp_pdf_path: Where to write the intermediate PDF for DOCX/JPEG
            inputs. Unused for inputs that are already PDFs.
    """
    file_path = os.path.join(source_folder, filename)
    lower_filename = filename.lower()

    if lower_filename.endswith('.pdf'):
        print(f"  - Appending PDF: {filename}")
        merger.append(file_path)

    elif lower_filename.endswith('.docx'):
        print(f"  - Converting DOCX: {filename}")
        # Initialize COM for this thread (fixes Jupyter/loop errors with Word)
        # and balance it with CoUninitialize so repeated conversions do not
        # accumulate COM initializations.
        pythoncom.CoInitialize()
        try:
            convert(file_path, temp_pdf_path)
        finally:
            pythoncom.CoUninitialize()
        merger.append(temp_pdf_path)

    elif lower_filename.endswith(('.jpg', '.jpeg')):
        print(f"  - Converting Image: {filename}")
        # The context manager closes the image file handle once the PDF is saved.
        with Image.open(file_path) as image:
            # RGB is the standard colour mode for saving an image as PDF.
            image.convert('RGB').save(temp_pdf_path)
        merger.append(temp_pdf_path)


def merge_files_by_prefix(source_folder):
    """Merge the PDF, DOCX and JPEG files of a folder into one PDF per prefix.

    Files are grouped by get_file_prefix() (the part of the name before the
    first underscore). Each group is sorted by filename, non-PDF members are
    converted to PDF, and the result is written to
    '<source_folder>/Merged_Output/<prefix>.pdf'.

    Intermediate PDFs are written to a private temporary directory rather
    than to the source folder, so they can never overwrite (and then delete)
    a user file, and never show up as input on a later run.

    Args:
        source_folder: Path of the folder to scan (not recursive).

    Returns:
        None. Progress and errors are reported with print(). A failure in one
        group is reported and does not stop the remaining groups.
    """
    # Local imports keep this block self-contained.
    import shutil
    import tempfile

    # Ensure we use absolute paths for safety
    source_folder = os.path.abspath(source_folder)

    print(f"Scanning directory: {source_folder}")
    # List the folder BEFORE creating anything inside it: creating the output
    # folder first would silently create a missing source folder as well.
    try:
        all_files = os.listdir(source_folder)
    except FileNotFoundError:
        print(f"Error: The folder {source_folder} was not found.")
        return

    # Create the output folder INSIDE the source folder
    output_folder = os.path.join(source_folder, 'Merged_Output')
    os.makedirs(output_folder, exist_ok=True)

    # 1. SCAN AND GROUP FILES
    file_groups = {}
    for filename in all_files:
        lower_name = filename.lower()
        # Skip temporary lock files created by Word (~$file.docx)
        if lower_name.startswith('~$'):
            continue
        if not lower_name.endswith(('.pdf', '.docx', '.jpg', '.jpeg')):
            continue
        # Skip directories that merely look like a supported file
        if not os.path.isfile(os.path.join(source_folder, filename)):
            continue
        file_groups.setdefault(get_file_prefix(filename), []).append(filename)

    # 2. PROCESS EACH GROUP
    for prefix, files in file_groups.items():
        print(f"Processing group: {prefix} ({len(files)} files)")

        # Sort files to ensure deterministic order
        files.sort()

        temp_dir = tempfile.mkdtemp(prefix='merge_pdf_')
        merger = PdfWriter()
        try:
            for index, filename in enumerate(files):
                # Index-based names are unique within the group, so
                # 'x.docx' and 'x.jpg' cannot collide on one temp file.
                temp_pdf_path = os.path.join(temp_dir, f"{index:04d}.pdf")
                _append_file_as_pdf(merger, source_folder, filename, temp_pdf_path)

            # 3. WRITE THE FINAL MERGED PDF
            output_path = os.path.join(output_folder, f"{prefix}.pdf")
            print(f"  > Saving merged file to: {output_path}")
            merger.write(output_path)

        except Exception as e:
            # Boundary handler: report the failure and move on to the next group.
            if "pywintypes" in str(e):
                print(f"Error processing group {prefix}: dependency error. Try restarting the Jupyter Kernel.")
            else:
                print(f"Error processing group {prefix}: {e}")

        finally:
            merger.close()
            # 4. CLEANUP TEMPORARY FILES (best effort, but never silent)
            try:
                shutil.rmtree(temp_dir)
            except OSError as cleanup_error:
                print(f"  ! Could not remove temporary folder {temp_dir}: {cleanup_error}")
            print("------------------------------------------------")

# Source folder to process -- change this to your own source folder path
folder_path = r'C:\Users\eahmed\Downloads\Compressed\A'

# Entry point: validate the configured folder first, then run the merge
if __name__ == "__main__":
    if not os.path.exists(folder_path):
        print(f"Error: The path '{folder_path}' does not exist.")
    else:
        merge_files_by_prefix(folder_path)
        print("Done! Check the 'Merged_Output' folder inside your source directory.")