# Confluence Markdown Exporter

In [None]:
# Connection and export settings -- replace every placeholder before running.
CONFLUENCE_URL = "https://your-confluence-url.com"  # base URL of the Confluence site
USERNAME = "your-username"  # account the API client authenticates as
API_TOKEN = "your-api-token"  # handed to the API client as the password
SPACE_KEY = "YOUR_SPACE_KEY"  # key of the space whose pages are exported
OUTPUT_DIR = "export_directory"  # directory that receives the .md files

import os
import re
import time
import requests
from bs4 import BeautifulSoup
from atlassian import Confluence
from markdownify import markdownify as md

# Module-level Confluence client; the API token is supplied as the password.
confluence = Confluence(url=CONFLUENCE_URL, username=USERNAME, password=API_TOKEN)

def sanitize_filename(title, max_length=50):
    """Return *title* made safe for use as a file name.

    Each of the characters ``\\ / : * ? " < > |`` is replaced by an
    underscore. If the result is longer than *max_length* characters it is cut
    to *max_length* and ``"..."`` is appended, so a truncated name is
    ``max_length + 3`` characters long.
    """
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title)
    if len(cleaned) <= max_length:
        return cleaned
    return cleaned[:max_length] + "..."

def process_html_content(page_id, title, html_content):
    """Convert a page's HTML to Markdown that contains only textual content.

    Images, draw.io diagrams and attachment references are removed from the
    markup before it is handed to markdownify.

    Args:
        page_id: ID of the page. Not used by this function; kept so the
            signature stays stable for callers.
        title: Title of the page. Not used by this function either.
        html_content: HTML string of the page body (the exporter passes the
            page's ``body.storage`` value).

    Returns:
        The Markdown text produced from the stripped HTML.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # (tag name, attribute filter) for every element that must not reach the
    # Markdown output. The order matches the original removal order.
    removable = (
        ('img', {}),                                          # plain images
        ('ac:image', {}),                                     # Confluence image elements
        ('div', {'data-macro-name': 'drawio'}),               # draw.io in rendered HTML
        # The exporter feeds storage-format markup, where a draw.io diagram is
        # an ac:structured-macro rather than a div; without this entry the
        # macro's parameter values end up as stray text in the Markdown.
        ('ac:structured-macro', {'ac:name': 'drawio'}),
        ('ri:attachment', {}),                                # attachment references
        ('ac:structured-macro', {'ac:name': 'attachments'}),  # attachment list macro
    )
    for tag_name, attr_filter in removable:
        for element in soup.find_all(tag_name, attrs=attr_filter):
            element.decompose()

    # Convert the stripped HTML to Markdown.
    return md(str(soup))

def export_space_to_markdown(space_key, output_dir):
    """Export every page of a Confluence space to a text-only Markdown file.

    Pages are listed in batches through the module-level ``confluence``
    client, then each page body is fetched, stripped of non-textual elements
    and written to ``<output_dir>/<sanitized title>.md``. Pages whose target
    file already exists are skipped, so an interrupted export can be resumed.
    Failures on a single page are reported and do not stop the export.

    Args:
        space_key: Key of the Confluence space to export.
        output_dir: Directory that receives the Markdown files; created if
            it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Page through the space listing.
    start = 0
    limit = 100  # Smaller batch size for better error handling
    all_pages = []

    while True:
        batch = confluence.get_all_pages_from_space(space_key, start=start, limit=limit)
        if not batch:
            break

        all_pages.extend(batch)
        # Advance by what was actually returned and stop only on an empty
        # batch: a short batch does not prove the end of the listing if the
        # server caps the page size below the requested limit.
        start += len(batch)
        print(f"Retrieved {len(all_pages)} pages so far...")

        # Add a small delay to avoid rate limiting
        time.sleep(0.5)

    print(f"Total pages found: {len(all_pages)}")

    for page in all_pages:
        title = page['title']
        sanitized_title = sanitize_filename(title)
        markdown_file_path = os.path.join(output_dir, f"{sanitized_title}.md")

        if os.path.exists(markdown_file_path):
            print(f"⚠️  Skipped (already exported): {title}")
            continue

        if len(markdown_file_path) > 255:
            print(f"❌ Error: File path too long -> {markdown_file_path}")
            continue

        try:
            page_content = confluence.get_page_by_id(page['id'], expand='body.storage')
            html_content = page_content['body']['storage']['value']
        except Exception as e:
            print(f"❌ Error retrieving page '{title}': {e}")
            continue

        try:
            # Process HTML content to remove images and other non-textual elements
            updated_markdown = process_html_content(page['id'], title, html_content)

            # The original statement lacked the open() call and did not parse.
            with open(markdown_file_path, 'w', encoding='utf-8') as file:
                file.write(updated_markdown)
            print(f"✅ Exported: {title}")
        except Exception as e:
            print(f"❌ Error processing page '{title}': {e}")

        # One small delay per processed page to avoid rate limiting (the
        # success path previously slept twice).
        time.sleep(0.2)

# Entry point: export the configured space when run as a script.
if __name__ == "__main__":
    export_space_to_markdown(space_key=SPACE_KEY, output_dir=OUTPUT_DIR)

In [None]:
# Connection and export settings -- replace every placeholder before running.
CONFLUENCE_URL = "https://your-confluence-url.com"  # base URL of the Confluence site
USERNAME = "your-username"  # account used for API and download requests
API_TOKEN = "your-api-token"  # used as the password for that account

SPACE_KEY = "YOUR KEY"  # key of the space whose pages are exported
OUTPUT_DIR = "export_directory"  # receives the .md files and the images/ folder

import os
import re
import time
import requests
import random
import string
from bs4 import BeautifulSoup
from atlassian import Confluence
from markdownify import markdownify as md
from urllib.parse import urlparse, unquote

# Module-level Confluence client; the API token is supplied as the password.
confluence = Confluence(url=CONFLUENCE_URL, username=USERNAME, password=API_TOKEN)

def sanitize_filename(title, max_length=50):
    """Make *title* usable as a file name and cap its length.

    The characters ``\\ / : * ? " < > |`` are each replaced with ``_``. A
    result longer than *max_length* is cut to *max_length* characters and gets
    a ``"..."`` suffix (so it ends up ``max_length + 3`` characters long).
    """
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    needs_truncation = len(safe_title) > max_length
    if needs_truncation:
        return f"{safe_title[:max_length]}..."
    return safe_title

def create_image_filename(page_title, counter, max_length=30):
    """Build an extension-less image file name from a page title.

    Only the alphabetic characters of *page_title* are kept (their case is
    left as is). When *counter* is positive it is placed in front of the
    letters. The result never exceeds *max_length* characters: the letters are
    truncated first, and if the counter alone fills the budget only the
    (truncated) counter is returned.
    """
    letters = ''.join(filter(str.isalpha, page_title))

    # No counter requested: the name is just the truncated letters.
    if counter <= 0:
        return letters[:max_length]

    prefix = str(counter)
    room_for_letters = max_length - len(prefix)

    # The counter alone uses up the whole length budget.
    if room_for_letters <= 0:
        return prefix[:max_length]

    return prefix + letters[:room_for_letters]

def get_file_extension(filename):
    """Return the lower-cased extension of *filename*, including the dot.

    Falls back to ``".png"`` when the name has no extension.
    """
    suffix = os.path.splitext(filename)[1]
    # A non-empty suffix always starts with ".", so lower() keeps it truthy.
    return suffix.lower() or ".png"

def get_attachments(page_id):
    """Retrieve all attachment records of a Confluence page.

    Requests the page's ``child/attachment`` REST listing in batches of 50,
    authenticating with ``USERNAME`` / ``API_TOKEN``. The function is
    best-effort: on a non-200 response, a network error or an undecodable
    body it prints a message and returns whatever was collected so far.

    Args:
        page_id: ID of the page whose attachments are listed.

    Returns:
        A list with the entries of every batch's ``results`` array.
    """
    url = f"{CONFLUENCE_URL}/rest/api/content/{page_id}/child/attachment"
    attachments = []
    start = 0
    limit = 50

    while True:
        try:
            response = requests.get(
                url,
                params={"start": start, "limit": limit},
                auth=(USERNAME, API_TOKEN),
                headers={"Accept": "application/json"},
                timeout=30,  # without a timeout a stalled server blocks the export forever
            )
            if response.status_code != 200:
                print(f"❌ Failed to fetch attachments for page {page_id}: {response.status_code}")
                break
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            # Connection problems, timeouts and invalid JSON are handled like
            # a failed status code instead of aborting the caller.
            print(f"❌ Failed to fetch attachments for page {page_id}: {e}")
            break

        batch = result.get('results', [])
        attachments.extend(batch)

        # Stop on a short batch or when the response advertises no next
        # page; a response without '_links' counts as "no next page".
        if len(batch) < limit or 'next' not in result.get('_links', {}):
            break

        start += limit

    return attachments

def extract_diagram_id_from_filename(filename):
    """Return the digits that follow ``Diagram-`` in *filename*, or ``None``.

    Example: ``'Untitled Diagram-1742929746623'`` yields ``'1742929746623'``.
    The match is case-sensitive.
    """
    found = re.search(r'Diagram-(\d+)', filename)
    return found.group(1) if found else None

def find_png_for_drawio(diagram_id, attachments):
    """Return the first attachment that looks like the PNG of a draw.io diagram.

    An attachment matches when its ``'title'`` ends with ``.png`` (case
    ignored) and contains *diagram_id*. Returns ``None`` when *diagram_id* is
    empty/``None`` or when nothing matches.
    """
    if not diagram_id:
        return None

    candidates = (
        candidate
        for candidate in attachments
        if candidate['title'].lower().endswith('.png') and diagram_id in candidate['title']
    )
    return next(candidates, None)

def extract_image_references_from_html(html_content):
    """Collect the image references found in a page's HTML.

    Returns a dict built from three sources:

    * ``<img>`` tags: decoded base name of the ``src`` path -> original ``src``;
    * ``<div data-macro-name="drawio">`` elements: every non-empty ``data-*``
      attribute value mapped to itself, plus the first nested ``<img>``
      handled like any other image;
    * ``<ac:image>`` elements: the ``ri:filename`` of the contained
      ``<ri:attachment>`` mapped to itself.
    """
    def name_from_src(src):
        # Base name of the URL path with percent-escapes decoded.
        return os.path.basename(unquote(urlparse(src).path))

    soup = BeautifulSoup(html_content, 'html.parser')
    image_refs = {}

    # Plain <img> tags.
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if src:
            image_refs[name_from_src(src)] = src

    # draw.io macro containers.
    for drawio in soup.find_all('div', attrs={'data-macro-name': 'drawio'}):
        image_refs.update(
            (value, value)
            for attr, value in drawio.attrs.items()
            if attr.startswith('data-') and value
        )

        nested_img = drawio.find('img')
        if nested_img and nested_img.get('src'):
            nested_src = nested_img.get('src')
            image_refs[name_from_src(nested_src)] = nested_src

    # Confluence <ac:image> elements that point at an attachment.
    for ac_image in soup.find_all('ac:image'):
        ri_value = ac_image.find('ri:attachment', {'ri:filename': True})
        if not ri_value:
            continue
        filename = ri_value.get('ri:filename')
        if filename:
            image_refs[filename] = filename

    return image_refs

# Extensions that mark an attachment as an image even when the page body
# does not reference it.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp')

def _download_page_images(page_id, page_title, attachment_map, image_refs, images_dir):
    """Download a page's image attachments; return {original name: Markdown path}.

    An attachment is downloaded when its name has an image extension or when
    it is referenced in *image_refs*. Only successful downloads appear in the
    returned dict. Failures are printed and skipped.
    """
    downloaded = {}
    image_counter = 0

    for filename, attachment in attachment_map.items():
        # Skip non-image files that aren't referenced
        is_image = filename.lower().endswith(_IMAGE_EXTENSIONS)
        if not is_image and filename not in image_refs:
            continue

        image_counter += 1
        base_filename = create_image_filename(page_title, image_counter)
        # Prefix with the page ID: the base name keeps only the letters of the
        # title and the counter restarts for every page, so two pages (e.g.
        # "Sprint 1" / "Sprint 2") would otherwise write to the same file in
        # the shared images directory and overwrite each other's images.
        unique_filename = f"{page_id}_{base_filename}{get_file_extension(filename)}"
        local_path = os.path.join(images_dir, unique_filename)

        try:
            # Building the URL inside the try keeps a malformed attachment
            # record from aborting the whole page.
            full_url = f"{CONFLUENCE_URL}{attachment['_links']['download']}"
            # The context manager releases the streamed connection on every
            # path; the timeout keeps a stalled server from hanging the export.
            with requests.get(full_url, auth=(USERNAME, API_TOKEN),
                              stream=True, timeout=60) as response:
                if response.status_code == 200:
                    with open(local_path, 'wb') as file:
                        for chunk in response.iter_content(8192):
                            file.write(chunk)
                    print(f"📥 Downloaded: {filename} as {unique_filename}")
                    # Markdown links use forward slashes on every platform.
                    downloaded[filename] = f"images/{unique_filename}"
                else:
                    print(f"❌ Failed to download: {filename}")
        except Exception as e:
            print(f"❌ Error downloading {filename}: {e}")

        time.sleep(1)  # Be nice to the server

    return downloaded

def _map_drawio_ids_to_images(attachment_map, attachments, downloaded):
    """Map draw.io diagram IDs to the Markdown paths of their downloaded PNGs.

    Only PNGs present in *downloaded* are mapped, so every value is a path
    that exists on disk (never a raw attachment title).
    """
    drawio_id_to_png_map = {}

    # draw.io source attachments: look up the PNG that carries the same ID.
    for filename in attachment_map:
        lowered = filename.lower()
        is_drawio = (lowered.endswith('.drawio') or
                     'draw.io' in lowered or
                     filename.endswith('.tmp'))
        if not is_drawio:
            continue
        diagram_id = extract_diagram_id_from_filename(filename)
        png_attachment = find_png_for_drawio(diagram_id, attachments)
        if png_attachment and png_attachment['title'] in downloaded:
            drawio_id_to_png_map[diagram_id] = downloaded[png_attachment['title']]
            print(f"🔗 Found PNG for diagram ID {diagram_id}: {png_attachment['title']}")

    # Downloaded PNGs whose own name carries a diagram ID take precedence.
    for filename, rel_path in downloaded.items():
        diagram_id = extract_diagram_id_from_filename(filename)
        if diagram_id and filename.lower().endswith('.png'):
            drawio_id_to_png_map[diagram_id] = rel_path

    return drawio_id_to_png_map

def _append_image_mapping(images_dir, page_title, downloaded, drawio_id_to_png_map):
    """Append this page's original -> new file name table to _image_mapping.txt."""
    mapping_file_path = os.path.join(images_dir, "_image_mapping.txt")
    try:
        with open(mapping_file_path, 'a', encoding='utf-8') as f:
            f.write(f"\n--- Page: {page_title} ---\n")
            f.write("Original Filename -> New Filename\n")
            f.write("===========================\n")
            for original_name, local_path in downloaded.items():
                f.write(f"{original_name} -> {os.path.basename(local_path)}\n")

            # Also document draw.io ID to PNG mappings
            if drawio_id_to_png_map:
                f.write("\nDraw.io ID -> PNG Path\n")
                f.write("===========================\n")
                for diagram_id, png_path in drawio_id_to_png_map.items():
                    f.write(f"{diagram_id} -> {png_path}\n")
    except (OSError, UnicodeError) as e:
        # The mapping file is a debugging aid; failing to write it must not
        # fail the page export.
        print(f"❌ Error writing mapping file: {e}")

def _rewrite_html_image_references(html_content, downloaded, drawio_id_to_png_map):
    """Return the HTML with image references pointing at the local copies."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Replace image sources in the HTML
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if src:
            filename = os.path.basename(unquote(urlparse(src).path))
            if filename in downloaded:
                img['src'] = downloaded[filename]

    # Replace draw.io diagrams with their PNG representations
    for drawio in soup.find_all('div', attrs={'data-macro-name': 'drawio'}):
        diagram_id = None
        # First look for a long digit run in a data-*diagram* attribute ...
        for attr, value in drawio.attrs.items():
            if attr.startswith('data-') and 'diagram' in attr.lower():
                id_match = re.search(r'(\d{10,})', value)
                if id_match:
                    diagram_id = id_match.group(1)
                    break

        # ... then fall back to the element's text content.
        if not diagram_id:
            id_match = re.search(r'(\d{10,})', drawio.get_text())
            if id_match:
                diagram_id = id_match.group(1)

        if diagram_id and diagram_id in drawio_id_to_png_map:
            new_img = soup.new_tag('img')
            new_img['src'] = drawio_id_to_png_map[diagram_id]
            new_img['alt'] = f"Draw.io diagram {diagram_id}"
            drawio.replace_with(new_img)
            print(f"🔄 Replaced draw.io diagram {diagram_id} with PNG image")
        else:
            # Mark the element so unmatched diagrams can be found later.
            drawio['data-export-note'] = "Unable to find PNG representation"

    # Replace ac:image elements
    for ac_image in soup.find_all('ac:image'):
        ri_value = ac_image.find('ri:attachment', {'ri:filename': True})
        if ri_value:
            filename = ri_value.get('ri:filename')
            if filename and filename in downloaded:
                new_img = soup.new_tag('img')
                new_img['src'] = downloaded[filename]
                ac_image.replace_with(new_img)

    return str(soup)

def _fix_markdown_image_references(markdown_content, page_id, downloaded, drawio_id_to_png_map):
    """Patch image links and draw.io leftovers that survived the HTML pass."""
    # 1. Fix standard image references
    for original_name, local_path in downloaded.items():
        replacement = f"![]({local_path})"
        markdown_content = markdown_content.replace(f"![]({original_name})", replacement)
        markdown_content = markdown_content.replace(f"![{original_name}]({original_name})", replacement)

        # Also check for full URLs
        full_url_pattern = f"{CONFLUENCE_URL}/download/attachments/{page_id}/{original_name}"
        markdown_content = markdown_content.replace(f"![]({full_url_pattern})", replacement)
        markdown_content = markdown_content.replace(f"![{original_name}]({full_url_pattern})", replacement)

    # 2. Fix draw.io specific patterns
    # Look for patterns like "trueUntitled Diagram-1742929746623falseautoptrue61811"
    drawio_pattern = r'true([^f]+)false[^t]*true\d+'
    # finditer keeps scanning the string it was given, so rebinding
    # markdown_content inside the loop does not disturb the iteration.
    for match in re.finditer(drawio_pattern, markdown_content):
        full_match = match.group(0)
        diagram_text = match.group(1).strip()

        id_match = re.search(r'(\d{10,})', diagram_text)
        if id_match:
            diagram_id = id_match.group(1)
            if diagram_id in drawio_id_to_png_map:
                png_path = drawio_id_to_png_map[diagram_id]
                markdown_content = markdown_content.replace(full_match, f"![]({png_path})")
                print(f"🔍 Replaced draw.io pattern for diagram {diagram_id}")

    # 3. Remove any remaining draw.io artifacts
    return re.sub(r'true[^f]*false[^t]*true\d+', '', markdown_content)

def process_html_content(page_id, page_title, html_content, images_dir):
    """Convert a page's HTML to Markdown with images stored locally.

    Steps: list the page's attachments, download the image attachments into
    *images_dir* under unique names, append the name mapping to
    ``_image_mapping.txt``, rewrite image/draw.io references in the HTML to
    the local copies, convert to Markdown and patch leftover references.

    Args:
        page_id: ID of the page; used to list attachments, to build download
            URL patterns and as a prefix of the local image file names.
        page_title: Title of the page; its letters form the image base name.
        html_content: HTML string of the page body.
        images_dir: Directory that receives the images; created if missing.
            Markdown links are written as ``images/<file>``, i.e. relative to
            the parent directory where the exporter writes the .md files.

    Returns:
        The Markdown text of the page.
    """
    attachments = get_attachments(page_id)
    attachment_map = {att['title']: att for att in attachments}
    image_refs = extract_image_references_from_html(html_content)

    os.makedirs(images_dir, exist_ok=True)

    filename_to_unique_path = _download_page_images(
        page_id, page_title, attachment_map, image_refs, images_dir)
    drawio_id_to_png_map = _map_drawio_ids_to_images(
        attachment_map, attachments, filename_to_unique_path)
    _append_image_mapping(images_dir, page_title, filename_to_unique_path, drawio_id_to_png_map)

    modified_html = _rewrite_html_image_references(
        html_content, filename_to_unique_path, drawio_id_to_png_map)
    markdown_content = md(modified_html)

    return _fix_markdown_image_references(
        markdown_content, page_id, filename_to_unique_path, drawio_id_to_png_map)

def export_space_to_markdown(space_key, output_dir):
    """Export every page of a Confluence space to Markdown, including images.

    Each page is written to ``<output_dir>/<sanitized title>.md``; its images
    go to ``<output_dir>/images``. Pages whose target file already exists are
    skipped, so an interrupted export can be resumed. Failures on a single
    page are reported and do not stop the export.

    Args:
        space_key: Key of the Confluence space to export.
        output_dir: Directory that receives the export; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # Page through the listing instead of relying on one large request: if
    # the server returns fewer pages than the requested limit, a single call
    # would silently drop the rest of the space.
    pages = []
    start = 0
    limit = 100
    while True:
        batch = confluence.get_all_pages_from_space(space_key, start=start, limit=limit)
        if not batch:
            break
        pages.extend(batch)
        # Advance by what was actually returned so a smaller server-side page
        # size cannot cause gaps.
        start += len(batch)

    for page in pages:
        title = page['title']
        sanitized_title = sanitize_filename(title)
        markdown_file_path = os.path.join(output_dir, f"{sanitized_title}.md")

        if os.path.exists(markdown_file_path):
            print(f"⚠️  Skipped (already exported): {title}")
            continue

        if len(markdown_file_path) > 255:
            print(f"❌ Error: File path too long -> {markdown_file_path}")
            continue

        try:
            page_content = confluence.get_page_by_id(page['id'], expand='body.storage')
            html_content = page_content['body']['storage']['value']
        except Exception as e:
            print(f"❌ Error retrieving page '{title}': {e}")
            continue

        try:
            updated_markdown = process_html_content(page['id'], title, html_content, images_dir)

            with open(markdown_file_path, 'w', encoding='utf-8') as file:
                file.write(updated_markdown)
            print(f"✅ Exported: {title}")
        except Exception as e:
            print(f"❌ Error processing page '{title}': {e}")

# Entry point: export the configured space when run as a script.
if __name__ == "__main__":
    export_space_to_markdown(space_key=SPACE_KEY, output_dir=OUTPUT_DIR)