From 26f4d477aa37fc76d8f7c0962cfbebf1da427d03 Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 10:59:40 +0100 Subject: [PATCH 1/6] Support shadow DOM --- injected/src/features/page-context.js | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index 2202481119..025c254be5 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -30,6 +30,19 @@ function isHtmlElement(node) { return node.nodeType === Node.ELEMENT_NODE; } +function domToMarkdownChildren(childNodes, maxLength = Infinity, excludeSelectors) { + let children = ''; + for (const childNode of childNodes) { + const childContent = domToMarkdown(childNode, maxLength - children.length, excludeSelectors); + children += childContent; + if (children.length > maxLength) { + children = children.substring(0, maxLength) + '...'; + break; + } + } + return children; +} + /** * Convert a DOM node to markdown * @param {Node} node @@ -51,15 +64,10 @@ function domToMarkdown(node, maxLength = Infinity, excludeSelectors) { const tag = node.tagName.toLowerCase(); // Build children string incrementally to exit early when maxLength is exceeded - let children = ''; - for (const childNode of node.childNodes) { - const childContent = domToMarkdown(childNode, maxLength - children.length, excludeSelectors); - children += childContent; + let children = domToMarkdownChildren(node.childNodes, maxLength, excludeSelectors); - if (children.length > maxLength) { - children = children.substring(0, maxLength) + '...'; - break; - } + if (node.shadowRoot) { + children += domToMarkdownChildren(node.shadowRoot.childNodes, maxLength - children.length, excludeSelectors); } switch (tag) { From 2f58a64629633466449451e95402b6efbd1dc452 Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 12:13:23 +0100 Subject: [PATCH 2/6] Add depth filter and refactor to settings object --- injected/src/features/page-context.js | 37 +++++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index 025c254be5..c84942d76f 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -30,13 +30,16 @@ function isHtmlElement(node) { return node.nodeType === Node.ELEMENT_NODE; } -function domToMarkdownChildren(childNodes, maxLength = Infinity, excludeSelectors) { +function domToMarkdownChildren(childNodes, settings, depth = 0) { + if (depth > settings.maxDepth) { + return ''; + } let children = ''; for (const childNode of childNodes) { - const childContent = domToMarkdown(childNode, maxLength - children.length, excludeSelectors); + const childContent = domToMarkdown(childNode, settings, depth + 1); children += childContent; - if (children.length > maxLength) { - children = children.substring(0, maxLength) + '...'; + if (children.length > settings.maxLength) { + children = children.substring(0, settings.maxLength) + '...'; break; } } @@ -46,28 +49,34 @@ function domToMarkdownChildren(childNodes, maxLength = Infinity, excludeSelector /** * Convert a DOM node to markdown * @param {Node} node - * @param {number} maxLength - * @param {string} excludeSelectors + * @param {Object} settings - Settings object with maxLength, maxDepth, and excludeSelectors + * @param {number} settings.maxLength - Maximum length of content + * @param {number} settings.maxDepth - Maximum depth to traverse + * @param {string} settings.excludeSelectors - CSS selectors to exclude from processing + * @param {number} depth * @returns {string} */ -function domToMarkdown(node, maxLength = Infinity, excludeSelectors) { +function domToMarkdown(node, settings, depth = 0) { + if (depth > settings.maxDepth) { + return ''; + } if (node.nodeType === Node.TEXT_NODE) { return collapseWhitespace(node.textContent); } if (!isHtmlElement(node)) { return ''; } - if (!checkNodeIsVisible(node) || node.matches(excludeSelectors)) { + if (!checkNodeIsVisible(node) || node.matches(settings.excludeSelectors)) { return ''; } const tag = node.tagName.toLowerCase(); // Build children string incrementally to exit early when maxLength is exceeded - let children = domToMarkdownChildren(node.childNodes, maxLength, excludeSelectors); + let children = domToMarkdownChildren(node.childNodes, settings, depth + 1); if (node.shadowRoot) { - children += domToMarkdownChildren(node.shadowRoot.childNodes, maxLength - children.length, excludeSelectors); + children += domToMarkdownChildren(node.shadowRoot.childNodes, settings, depth + 1); } switch (tag) { @@ -363,6 +372,8 @@ export default class PageContext extends ContentFeature { const maxLength = this.getFeatureSetting('maxContentLength') || 9500; // Used to avoid large content serialization const upperLimit = this.getFeatureSetting('upperLimit') || 500000; + // We should refactor to use iteration but for now this just caps overflow. + const maxDepth = this.getFeatureSetting('maxDepth') || 5000; let excludeSelectors = this.getFeatureSetting('excludeSelectors') || ['.ad', '.sidebar', '.footer', '.nav', '.header']; const excludedInertElements = this.getFeatureSetting('excludedInertElements') || [ 'script', @@ -388,7 +399,11 @@ export default class PageContext extends ContentFeature { if (contentRoot) { this.log.info('Getting main content', contentRoot); - content += domToMarkdown(contentRoot, upperLimit, excludeSelectorsString); + content += domToMarkdown(contentRoot, { + maxLength: upperLimit, + maxDepth, + excludeSelectors: excludeSelectorsString + }); this.log.info('Content markdown', content, contentRoot); } content = content.trim(); From 35c1503b603986551912dffb6862bedac7e223d7 Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 12:04:54 +0100 Subject: [PATCH 3/6] Add typing --- injected/src/features/page-context.js | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index c84942d76f..e5eaf0b8a4 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -30,6 +30,13 @@ function isHtmlElement(node) { return node.nodeType === Node.ELEMENT_NODE; } +/** + * Stringify the children of a node to markdown + * @param {NodeListOf} childNodes + * @param {DomToMarkdownSettings} settings + * @param {number} depth + * @returns {string} + */ function domToMarkdownChildren(childNodes, settings, depth = 0) { if (depth > settings.maxDepth) { return ''; @@ -46,13 +53,17 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) { return children; } +/** + * @typedef {Object} DomToMarkdownSettings + * @property {number} maxLength - Maximum length of content + * @property {number} maxDepth - Maximum depth to traverse + * @property {string} excludeSelectors - CSS selectors to exclude from processing + */ + /** * Convert a DOM node to markdown * @param {Node} node - * @param {Object} settings - Settings object with maxLength, maxDepth, and excludeSelectors - * @param {number} settings.maxLength - Maximum length of content - * @param {number} settings.maxDepth - Maximum depth to traverse - * @param {string} settings.excludeSelectors - CSS selectors to exclude from processing + * @param {DomToMarkdownSettings} settings * @param {number} depth * @returns {string} */ From 6104b5490dd5c501811eeab070c954eacae16ab9 Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 12:45:03 +0100 Subject: [PATCH 4/6] Add setting to control iframe inclusion --- injected/src/features/page-context.js | 42 +++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index e5eaf0b8a4..65ff1d161e 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -30,6 +30,25 @@ function isHtmlElement(node) { return node.nodeType === Node.ELEMENT_NODE; } +/** + * Check if an iframe is same-origin and return its content document + * @param {HTMLIFrameElement} iframe + * @returns {Document | null} + */ +function getSameOriginIframeDocument(iframe) { + try { + // Try to access the contentDocument - this will throw if cross-origin + const doc = iframe.contentDocument; + if (doc && doc.documentElement) { + return doc; + } + } catch (e) { + // Cross-origin iframe - cannot access content + return null; + } + return null; +} + /** * Stringify the children of a node to markdown * @param {NodeListOf} childNodes @@ -58,6 +77,7 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) { * @property {number} maxLength - Maximum length of content * @property {number} maxDepth - Maximum depth to traverse * @property {string} excludeSelectors - CSS selectors to exclude from processing + * @property {boolean} includeIframes - Whether to include iframe content */ /** @@ -113,6 +133,19 @@ function domToMarkdown(node, settings, depth = 0) { return `\n- ${children.trim()}\n`; case 'a': return getLinkText(node); + case 'iframe': { + if (!settings.includeIframes) { + return children; + } + // Try to access same-origin iframe content + const iframeDoc = getSameOriginIframeDocument(/** @type {HTMLIFrameElement} */ (node)); + if (iframeDoc && iframeDoc.body) { + const iframeContent = domToMarkdown(iframeDoc.body, settings, depth + 1); + return iframeContent ? `\n\n--- Iframe Content ---\n${iframeContent}\n--- End Iframe ---\n\n` : children; + } + // If we can't access the iframe content (cross-origin), return the children or empty string + return children; + } default: return children; } @@ -412,7 +445,8 @@ export default class PageContext extends ContentFeature { this.log.info('Getting main content', contentRoot); content += domToMarkdown(contentRoot, { maxLength: upperLimit, - maxDepth, + maxDepth, + includeIframes: this.getFeatureSetting('includeIframes') || true, excludeSelectors: excludeSelectorsString }); this.log.info('Content markdown', content, contentRoot); @@ -424,7 +458,11 @@ export default class PageContext extends ContentFeature { // Limit content length if (content.length > maxLength) { - this.log.info('Truncating content', content); + this.log.info('Truncating content', { + content, + contentLength: content.length, + maxLength, + }); content = content.substring(0, maxLength) + '...'; } From 17ee18c7dd14a8b636d2e52a5ddc284e54b4a4bf Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 13:38:59 +0100 Subject: [PATCH 5/6] Lint fix --- injected/src/features/page-context.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index 65ff1d161e..8af81783cb 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -443,11 +443,11 @@ export default class PageContext extends ContentFeature { if (contentRoot) { this.log.info('Getting main content', contentRoot); - content += domToMarkdown(contentRoot, { - maxLength: upperLimit, + content += domToMarkdown(contentRoot, { + maxLength: upperLimit, maxDepth, includeIframes: this.getFeatureSetting('includeIframes') || true, - excludeSelectors: excludeSelectorsString + excludeSelectors: excludeSelectorsString, }); this.log.info('Content markdown', content, contentRoot); } From 1759698a4e1291b16711ba5947a326f07d035dd3 Mon Sep 17 00:00:00 2001 From: Jonathan Kingston Date: Fri, 10 Oct 2025 14:50:03 +0100 Subject: [PATCH 6/6] Check with feature setting enabled --- injected/src/features/page-context.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/injected/src/features/page-context.js b/injected/src/features/page-context.js index 8af81783cb..930ce104d2 100644 --- a/injected/src/features/page-context.js +++ b/injected/src/features/page-context.js @@ -446,7 +446,7 @@ export default class PageContext extends ContentFeature { content += domToMarkdown(contentRoot, { maxLength: upperLimit, maxDepth, - includeIframes: this.getFeatureSetting('includeIframes') || true, + includeIframes: this.getFeatureSettingEnabled('includeIframes', 'enabled'), excludeSelectors: excludeSelectorsString, }); this.log.info('Content markdown', content, contentRoot);