From 01a26783580d10b6f8045a4084ec6388ac52b577 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:45:11 +0530 Subject: [PATCH 01/79] wip: get html --- server/src/markdownify/get_html.ts | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 server/src/markdownify/get_html.ts diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts new file mode 100644 index 000000000..ea4c6788a --- /dev/null +++ b/server/src/markdownify/get_html.ts @@ -0,0 +1,53 @@ +import { chromium, Browser, Page } from 'playwright'; + +export interface GetPageSourceOptions { + wait?: number; + headless?: boolean; + userAgent?: string; +} + +export async function getPageSource( + url: string, + options: GetPageSourceOptions = {} +): Promise { + const { + wait = 1.5, + headless = true, + userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166" + } = options; + + let browser: Browser | null = null; + let page: Page | null = null; + + try { + browser = await chromium.launch({ + headless, + args: ['--no-sandbox', '--disable-dev-shm-usage'] + }); + + page = await browser.newPage(); + await page.setUserAgent(userAgent); + + // Convert wait time to milliseconds + const waitMs = wait * 1000; + + // Set default timeout and navigate to URL + await page.setDefaultTimeout(waitMs); + await page.goto(url, { waitUntil: 'domcontentloaded' }); + + // Wait for additional time if specified + if (waitMs > 0) { + await page.waitForTimeout(waitMs); + } + + const pageSource = await page.content(); + return pageSource; + + } catch (error) { + console.error('Error while getting page source: ', error); + return ''; + } finally { + if (page) await page.close(); + if (browser) await browser.close(); + } +} \ No newline at end of file From 994142ae403d61e94adabbecdc04b3112863ec00 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:49:38 +0530 Subject: [PATCH 02/79] fix: define browser context --- server/src/markdownify/get_html.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts index ea4c6788a..3e459383d 100644 --- a/server/src/markdownify/get_html.ts +++ b/server/src/markdownify/get_html.ts @@ -1,4 +1,4 @@ -import { chromium, Browser, Page } from 'playwright'; +import { chromium, Browser, Page, BrowserContext } from 'playwright'; export interface GetPageSourceOptions { wait?: number; @@ -17,6 +17,7 @@ export async function getPageSource( } = options; let browser: Browser | null = null; + let context: BrowserContext | null = null; let page: Page | null = null; try { @@ -25,8 +26,8 @@ export async function getPageSource( args: ['--no-sandbox', '--disable-dev-shm-usage'] }); - page = await browser.newPage(); - await page.setUserAgent(userAgent); + context = await browser.newContext({ userAgent }); + page = await context.newPage(); // Convert wait time to milliseconds const waitMs = wait * 1000; @@ -45,9 +46,9 @@ export async function getPageSource( } catch (error) { console.error('Error while getting page source: ', error); - return ''; } finally { if (page) await page.close(); + if (context) await context.close(); if (browser) await browser.close(); } -} \ No newline at end of file + } \ No newline at end of file From 0c9dc899c379c9c26603136d337875ca4893ce6f Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:50:28 +0530 Subject: [PATCH 03/79] feat: get input text for llm --- server/src/markdownify/get_llm_input_text.ts | 151 +++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 server/src/markdownify/get_llm_input_text.ts diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts new file mode 100644 index 000000000..d3846373d --- /dev/null +++ b/server/src/markdownify/get_llm_input_text.ts @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: MIT + +import * as cheerio from 'cheerio'; +import { AnyAuthClient } from 'node_modules/google-auth-library/build/src'; +import { URL } from 'url'; + +export interface ProcessTextOptions { + htmlParser?: boolean; + keepImages?: boolean; + removeSvgImage?: boolean; + removeGifImage?: boolean; + removeImageTypes?: string[]; + keepWebpageLinks?: boolean; + removeScriptTag?: boolean; + removeStyleTag?: boolean; + removeTags?: string[]; +} + +export async function getProcessedText( + pageSource: string, + baseUrl: string, + options: ProcessTextOptions = {} +): Promise { + const { + keepImages = true, + removeSvgImage = true, + removeGifImage = true, + removeImageTypes = [], + keepWebpageLinks = true, + removeScriptTag = true, + removeStyleTag = true, + removeTags = [] + } = options; + + try { + const $ = cheerio.load(pageSource); + + // Remove tags + const tagsToRemove: string[] = []; + if (removeScriptTag) tagsToRemove.push('script'); + if (removeStyleTag) tagsToRemove.push('style'); + tagsToRemove.push(...removeTags); + + const uniqueTags = [...new Set(tagsToRemove)]; + uniqueTags.forEach(tag => { + $(tag).remove(); + }); + + // Process image links + const imageTypesToRemove: string[] = []; + if (removeSvgImage) imageTypesToRemove.push('.svg'); + if (removeGifImage) imageTypesToRemove.push('.gif'); + imageTypesToRemove.push(...removeImageTypes); + + const uniqueImageTypes = [...new Set(imageTypesToRemove)]; + + $('img').each((_: any, element: any) => { + try { + const $img = $(element); + if (!keepImages) { + $img.remove(); + } else { + const imageLink = $img.attr('src'); + let typeReplaced = false; + + if (imageLink) { + if (uniqueImageTypes.length > 0) { + for (const imageType of uniqueImageTypes) { + if (!typeReplaced && imageLink.includes(imageType)) { + $img.remove(); + typeReplaced = true; + break; + } + } + } + if (!typeReplaced) { + const absoluteUrl = new URL(imageLink, baseUrl).toString(); + $img.replaceWith('\n' + absoluteUrl + ' '); + } + } + } + } catch (error) { + console.error('Error while processing image link: ', error); + } + }); + + // Process website links + $('a[href]').each((_: any, element: any) => { + try { + const $link = $(element); + if (!keepWebpageLinks) { + $link.remove(); + } else { + const href = $link.attr('href'); + if (href) { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith($link.text() + ': ' + absoluteUrl + ' '); + } + } + } catch (error) { + console.error('Error while processing webpage link: ', error); + } + }); + + // Get text content + let text: string; + const bodyContent = $('body'); + + if (bodyContent.length > 0) { + // For minification, we'll use a simple approach to clean up the HTML + const bodyHtml = bodyContent.html() || ''; + const minimizedBody = minifyHtml(bodyHtml); + text = htmlToText(minimizedBody); + } else { + text = $.text(); + } + + return text; + + } catch (error) { + console.error('Error while getting processed text: ', error); + return ''; + } +} + +// Simple HTML minification function +function minifyHtml(html: string): string { + return html + .replace(/\s+/g, ' ') + .replace(/>\s+<') + .trim(); +} + +// Convert HTML to text (simplified version of inscriptis functionality) +function htmlToText(html: string): string { + const $ = cheerio.load(html); + + // Remove elements that shouldn't contribute to text + $('script, style, noscript').remove(); + + // Get text content with basic formatting + let text = $('body').text() || $.text(); + + // Clean up the text + text = text + .replace(/\s+/g, ' ') + .replace(/\n\s*\n/g, '\n') + .trim(); + + return text; +} \ No newline at end of file From 560f5a33003bef854a3170578f562e452e4c7b32 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:51:34 +0530 Subject: [PATCH 04/79] feat: get llm ready text --- server/src/markdownify/get_llm_ready_text.ts | 29 ++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 server/src/markdownify/get_llm_ready_text.ts diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts new file mode 100644 index 000000000..4d0515c6c --- /dev/null +++ b/server/src/markdownify/get_llm_ready_text.ts @@ -0,0 +1,29 @@ +import { getPageSource, GetPageSourceOptions } from './get_html'; +import { getProcessedText, ProcessTextOptions } from './get_llm_input_text'; + +export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions { + // Combined options from both interfaces +} + +export async function urlToLlmText( + url: string, + options: UrlToLlmTextOptions = {} +): Promise { + try { + const pageSource = await getPageSource(url, options); + + if (!pageSource) { + return ''; + } + + const llmText = await getProcessedText(pageSource, url, options); + return llmText; + + } catch (error) { + console.error('Error while scraping url: ', error); + return ''; + } +} + +// Export individual functions as well +export { getPageSource, getProcessedText }; \ No newline at end of file From 9b71cfc40cc30866afd6e4a6af74a45cf457ae1a Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:54:28 +0530 Subject: [PATCH 05/79] fix: return empty empty str on error --- server/src/markdownify/get_html.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts index 3e459383d..dbf6a8a93 100644 --- a/server/src/markdownify/get_html.ts +++ b/server/src/markdownify/get_html.ts @@ -1,4 +1,4 @@ -import { chromium, Browser, Page, BrowserContext } from 'playwright'; +import { chromium, Browser, BrowserContext, Page } from 'playwright'; export interface GetPageSourceOptions { wait?: number; @@ -46,9 +46,10 @@ export async function getPageSource( } catch (error) { console.error('Error while getting page source: ', error); + return ''; // Explicitly return empty string on error } finally { if (page) await page.close(); if (context) await context.close(); if (browser) await browser.close(); } - } \ No newline at end of file +} \ No newline at end of file From 191ac52ee3516120455ebe0945eaba8b9f4251ad Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 19:55:17 +0530 Subject: [PATCH 06/79] fix: return empty empty str on error --- server/src/markdownify/get_llm_input_text.ts | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts index d3846373d..d33c582be 100644 --- a/server/src/markdownify/get_llm_input_text.ts +++ b/server/src/markdownify/get_llm_input_text.ts @@ -1,7 +1,4 @@ -// SPDX-License-Identifier: MIT - import * as cheerio from 'cheerio'; -import { AnyAuthClient } from 'node_modules/google-auth-library/build/src'; import { URL } from 'url'; export interface ProcessTextOptions { @@ -54,7 +51,7 @@ export async function getProcessedText( const uniqueImageTypes = [...new Set(imageTypesToRemove)]; - $('img').each((_: any, element: any) => { + $('img').each((_, element) => { try { const $img = $(element); if (!keepImages) { @@ -85,7 +82,7 @@ export async function getProcessedText( }); // Process website links - $('a[href]').each((_: any, element: any) => { + $('a[href]').each((_, element) => { try { const $link = $(element); if (!keepWebpageLinks) { @@ -107,7 +104,6 @@ export async function getProcessedText( const bodyContent = $('body'); if (bodyContent.length > 0) { - // For minification, we'll use a simple approach to clean up the HTML const bodyHtml = bodyContent.html() || ''; const minimizedBody = minifyHtml(bodyHtml); text = htmlToText(minimizedBody); @@ -119,11 +115,10 @@ export async function getProcessedText( } catch (error) { console.error('Error while getting processed text: ', error); - return ''; + return ''; // Explicitly return empty string on error } } -// Simple HTML minification function function minifyHtml(html: string): string { return html .replace(/\s+/g, ' ') @@ -131,17 +126,13 @@ function minifyHtml(html: string): string { .trim(); } -// Convert HTML to text (simplified version of inscriptis functionality) function htmlToText(html: string): string { const $ = cheerio.load(html); - // Remove elements that shouldn't contribute to text $('script, style, noscript').remove(); - // Get text content with basic formatting let text = $('body').text() || $.text(); - // Clean up the text text = text .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n') From af9570659f304422e5cc2c0aab30017a5b50335a Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 20:50:25 +0530 Subject: [PATCH 07/79] fix: get important content --- server/src/markdownify/get_llm_input_text.ts | 76 +++++++++++++------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts index d33c582be..c7962392c 100644 --- a/server/src/markdownify/get_llm_input_text.ts +++ b/server/src/markdownify/get_llm_input_text.ts @@ -1,3 +1,4 @@ + import * as cheerio from 'cheerio'; import { URL } from 'url'; @@ -81,17 +82,20 @@ export async function getProcessedText( } }); - // Process website links + // Process website links - Preserve the link text AND the URL $('a[href]').each((_, element) => { try { const $link = $(element); if (!keepWebpageLinks) { - $link.remove(); + // Just remove the link but keep the text + $link.replaceWith($link.text()); } else { const href = $link.attr('href'); if (href) { const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith($link.text() + ': ' + absoluteUrl + ' '); + const linkText = $link.text().trim(); + // Keep both the link text and the URL + $link.replaceWith(linkText + ' [' + absoluteUrl + '] '); } } } catch (error) { @@ -99,44 +103,64 @@ export async function getProcessedText( } }); - // Get text content + // Get text content let text: string; + + // Use a simpler approach to extract text const bodyContent = $('body'); if (bodyContent.length > 0) { - const bodyHtml = bodyContent.html() || ''; - const minimizedBody = minifyHtml(bodyHtml); - text = htmlToText(minimizedBody); + // Remove script and style tags that might have been missed + bodyContent.find('script, style, noscript').remove(); + + // Get text with proper spacing + text = bodyContent + .contents() + .map((_, el) => { + if (el.type === 'text') { + return $(el).text(); + } + if (el.type === 'tag') { + const $el = $(el); + const tagName = el.name?.toLowerCase(); + + // Add appropriate spacing for block elements + if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) { + return $el.text() + '\n'; + } + return $el.text() + ' '; + } + return ''; + }) + .get() + .join(''); } else { text = $.text(); } + // Clean up the text while preserving quotes + text = cleanText(text); + return text; } catch (error) { console.error('Error while getting processed text: ', error); - return ''; // Explicitly return empty string on error + return ''; } } -function minifyHtml(html: string): string { - return html - .replace(/\s+/g, ' ') - .replace(/>\s+<') - .trim(); -} - -function htmlToText(html: string): string { - const $ = cheerio.load(html); +// Clean up text while preserving quotes and important content +function cleanText(text: string): string { + if (!text) return ''; - $('script, style, noscript').remove(); - - let text = $('body').text() || $.text(); - - text = text - .replace(/\s+/g, ' ') - .replace(/\n\s*\n/g, '\n') + return text + // Replace multiple spaces with single space, but be careful with quotes + .replace(/[^\S\n]+/g, ' ') + // Replace multiple newlines with max 2 newlines + .replace(/\n\s*\n/g, '\n\n') + // Clean up spaces around quotes but don't remove the quotes + .replace(/\s+"/g, ' "') + .replace(/"\s+/g, '" ') + // Remove leading/trailing whitespace .trim(); - - return text; } \ No newline at end of file From a3891f6813fca50a989633beb5ab4e9787aea668 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 21:14:23 +0530 Subject: [PATCH 08/79] wip: markdown + plain text --- server/src/markdownify/get_llm_input_text.ts | 371 +++++++++++++------ 1 file changed, 266 insertions(+), 105 deletions(-) diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts index c7962392c..fa0aec6cd 100644 --- a/server/src/markdownify/get_llm_input_text.ts +++ b/server/src/markdownify/get_llm_input_text.ts @@ -1,9 +1,7 @@ - import * as cheerio from 'cheerio'; import { URL } from 'url'; export interface ProcessTextOptions { - htmlParser?: boolean; keepImages?: boolean; removeSvgImage?: boolean; removeGifImage?: boolean; @@ -12,13 +10,26 @@ export interface ProcessTextOptions { removeScriptTag?: boolean; removeStyleTag?: boolean; removeTags?: string[]; + formatAsMarkdown?: boolean; +} + +export interface ProcessedResult { + markdown: string; + plainText: string; + metadata: { + title: string; + url: string; + processedAt: string; + textLength: number; + markdownLength: number; + }; } export async function getProcessedText( pageSource: string, baseUrl: string, options: ProcessTextOptions = {} -): Promise { +): Promise { const { keepImages = true, removeSvgImage = true, @@ -27,13 +38,14 @@ export async function getProcessedText( keepWebpageLinks = true, removeScriptTag = true, removeStyleTag = true, - removeTags = [] + removeTags = [], + formatAsMarkdown = true } = options; try { const $ = cheerio.load(pageSource); - // Remove tags + // Remove unwanted tags const tagsToRemove: string[] = []; if (removeScriptTag) tagsToRemove.push('script'); if (removeStyleTag) tagsToRemove.push('style'); @@ -44,123 +56,272 @@ export async function getProcessedText( $(tag).remove(); }); - // Process image links - const imageTypesToRemove: string[] = []; - if (removeSvgImage) imageTypesToRemove.push('.svg'); - if (removeGifImage) imageTypesToRemove.push('.gif'); - imageTypesToRemove.push(...removeImageTypes); + // Extract page title + const title = $('title').text() || $('h1').first().text() || 'Untitled'; - const uniqueImageTypes = [...new Set(imageTypesToRemove)]; + // Generate both formats + const markdown = formatAsMarkdown ? + convertToMarkdown($, baseUrl, options) : + convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled - $('img').each((_, element) => { - try { - const $img = $(element); - if (!keepImages) { - $img.remove(); - } else { - const imageLink = $img.attr('src'); - let typeReplaced = false; - - if (imageLink) { - if (uniqueImageTypes.length > 0) { - for (const imageType of uniqueImageTypes) { - if (!typeReplaced && imageLink.includes(imageType)) { - $img.remove(); - typeReplaced = true; - break; - } - } - } - if (!typeReplaced) { - const absoluteUrl = new URL(imageLink, baseUrl).toString(); - $img.replaceWith('\n' + absoluteUrl + ' '); - } - } - } - } catch (error) { - console.error('Error while processing image link: ', error); + const plainText = convertToPlainText($, baseUrl, options); + + const result: ProcessedResult = { + markdown, + plainText, + metadata: { + title: title.trim(), + url: baseUrl, + processedAt: new Date().toISOString(), + textLength: plainText.length, + markdownLength: markdown.length + } + }; + + return result; + + } catch (error) { + console.error('Error while getting processed text: ', error); + // Return empty result on error + return { + markdown: '', + plainText: '', + metadata: { + title: '', + url: baseUrl, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0 + } + }; + } +} + +function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { + const { keepImages, keepWebpageLinks } = options; + + // Clone the body to avoid modifying the original + const $body = $('body').clone(); + + // Process headers + $body.find('h1').each((_, element) => { + const $el = $(element); + $el.replaceWith(`# ${$el.text().trim()}\n\n`); + }); + + $body.find('h2').each((_, element) => { + const $el = $(element); + $el.replaceWith(`## ${$el.text().trim()}\n\n`); + }); + + $body.find('h3').each((_, element) => { + const $el = $(element); + $el.replaceWith(`### ${$el.text().trim()}\n\n`); + }); + + $body.find('h4, h5, h6').each((_, element) => { + const $el = $(element); + const level = element.name?.substring(1) || '4'; + const hashes = '#'.repeat(parseInt(level)); + $el.replaceWith(`${hashes} ${$el.text().trim()}\n\n`); + }); + + // Process paragraphs + $body.find('p').each((_, element) => { + const $el = $(element); + $el.replaceWith(`${$el.text().trim()}\n\n`); + }); + + // Process lists + $body.find('li').each((_, element) => { + const $el = $(element); + const text = $el.text().trim(); + if ($el.parent().is('ol')) { + $el.replaceWith(`1. ${text}\n`); + } else { + $el.replaceWith(`- ${text}\n`); + } + }); + + $body.find('ul, ol').each((_, element) => { + const $el = $(element); + $el.replaceWith(`\n${$el.html()}\n\n`); + }); + + // Process blockquotes + $body.find('blockquote').each((_, element) => { + const $el = $(element); + const text = $el.text().trim(); + $el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`); + }); + + // Process code blocks + $body.find('pre').each((_, element) => { + const $el = $(element); + const text = $el.text().trim(); + $el.replaceWith(`\`\`\`\n${text}\n\`\`\`\n\n`); + }); + + $body.find('code').each((_, element) => { + const $el = $(element); + // Only format inline code that's not inside pre blocks + if (!$el.closest('pre').length) { + const text = $el.text().trim(); + $el.replaceWith(`\`${text}\``); + } + }); + + // Process images + if (keepImages) { + $body.find('img').each((_, element) => { + const $img = $(element); + const src = $img.attr('src'); + const alt = $img.attr('alt') || ''; + + if (src && !shouldRemoveImage(src, options)) { + const absoluteUrl = new URL(src, baseUrl).toString(); + $img.replaceWith(`![${alt}](${absoluteUrl})\n\n`); + } else { + $img.remove(); } }); + } else { + $body.find('img').remove(); + } - // Process website links - Preserve the link text AND the URL - $('a[href]').each((_, element) => { - try { - const $link = $(element); - if (!keepWebpageLinks) { - // Just remove the link but keep the text - $link.replaceWith($link.text()); - } else { - const href = $link.attr('href'); - if (href) { - const absoluteUrl = new URL(href, baseUrl).toString(); - const linkText = $link.text().trim(); - // Keep both the link text and the URL - $link.replaceWith(linkText + ' [' + absoluteUrl + '] '); - } - } - } catch (error) { - console.error('Error while processing webpage link: ', error); + // Process links + if (keepWebpageLinks) { + $body.find('a[href]').each((_, element) => { + const $link = $(element); + const href = $link.attr('href'); + const text = $link.text().trim(); + + if (href && text) { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith(`[${text}](${absoluteUrl})`); + } else if (text) { + $link.replaceWith(text); + } else { + $link.remove(); } }); + } else { + $body.find('a[href]').each((_, element) => { + const $link = $(element); + $link.replaceWith($link.text().trim()); + }); + } - // Get text content - let text: string; + // Process tables (basic support) + $body.find('table').each((_, element) => { + const $table = $(element); + let markdownTable = '\n'; - // Use a simpler approach to extract text - const bodyContent = $('body'); - - if (bodyContent.length > 0) { - // Remove script and style tags that might have been missed - bodyContent.find('script, style, noscript').remove(); + $table.find('tr').each((rowIndex, row) => { + const $row = $(row); + const cells: string[] = []; - // Get text with proper spacing - text = bodyContent - .contents() - .map((_, el) => { - if (el.type === 'text') { - return $(el).text(); - } - if (el.type === 'tag') { - const $el = $(el); - const tagName = el.name?.toLowerCase(); - - // Add appropriate spacing for block elements - if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) { - return $el.text() + '\n'; - } - return $el.text() + ' '; - } - return ''; - }) - .get() - .join(''); - } else { - text = $.text(); - } - - // Clean up the text while preserving quotes - text = cleanText(text); + $row.find('th, td').each((_, cell) => { + const $cell = $(cell); + cells.push($cell.text().trim()); + }); + + if (cells.length > 0) { + markdownTable += `| ${cells.join(' | ')} |\n`; + + // Add header separator after first row + if (rowIndex === 0) { + markdownTable += `|${cells.map(() => '---').join('|')}|\n`; + } + } + }); - return text; + $table.replaceWith(markdownTable + '\n'); + }); - } catch (error) { - console.error('Error while getting processed text: ', error); - return ''; + // Get the final text and clean it up + let markdown = $body.text(); + + // Clean up excessive whitespace while preserving structure + markdown = cleanMarkdown(markdown); + + return markdown; +} + +function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { + const { keepImages, keepWebpageLinks } = options; + + const $body = $('body').clone(); + + // Process images + if (keepImages) { + $body.find('img').each((_, element) => { + const $img = $(element); + const src = $img.attr('src'); + + if (src && !shouldRemoveImage(src, options)) { + const absoluteUrl = new URL(src, baseUrl).toString(); + $img.replaceWith(`\nImage: ${absoluteUrl}\n`); + } else { + $img.remove(); + } + }); + } else { + $body.find('img').remove(); } + + // Process links + if (keepWebpageLinks) { + $body.find('a[href]').each((_, element) => { + const $link = $(element); + const href = $link.attr('href'); + const text = $link.text().trim(); + + if (href && text) { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith(`${text}: ${absoluteUrl} `); + } + }); + } else { + $body.find('a[href]').each((_, element) => { + const $link = $(element); + $link.replaceWith($link.text().trim()); + }); + } + + let text = $body.text(); + text = cleanText(text); + + return text; } -// Clean up text while preserving quotes and important content -function cleanText(text: string): string { - if (!text) return ''; +function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean { + const { removeSvgImage, removeGifImage, removeImageTypes = [] } = options; + + const imageTypesToRemove: string[] = []; + if (removeSvgImage) imageTypesToRemove.push('.svg'); + if (removeGifImage) imageTypesToRemove.push('.gif'); + imageTypesToRemove.push(...removeImageTypes); + return imageTypesToRemove.some(type => src.includes(type)); +} + +function cleanMarkdown(markdown: string): string { + return markdown + // Replace 3+ newlines with 2 newlines + .replace(/\n{3,}/g, '\n\n') + // Remove excessive spaces + .replace(/[ ]{2,}/g, ' ') + // Clean up space around headers + .replace(/\n\s*(#+)\s*/g, '\n$1 ') + // Remove trailing whitespace + .replace(/[ \t]+$/gm, '') + .trim(); +} + +function cleanText(text: string): string { return text - // Replace multiple spaces with single space, but be careful with quotes - .replace(/[^\S\n]+/g, ' ') - // Replace multiple newlines with max 2 newlines + .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') - // Clean up spaces around quotes but don't remove the quotes - .replace(/\s+"/g, ' "') - .replace(/"\s+/g, '" ') - // Remove leading/trailing whitespace .trim(); } \ No newline at end of file From dae4e83412d8867e446841aef90022351ef6e574 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 21:18:11 +0530 Subject: [PATCH 09/79] wip: markdown + plain text --- server/src/markdownify/get_llm_ready_text.ts | 37 ++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts index 4d0515c6c..ed7849ec3 100644 --- a/server/src/markdownify/get_llm_ready_text.ts +++ b/server/src/markdownify/get_llm_ready_text.ts @@ -1,29 +1,46 @@ import { getPageSource, GetPageSourceOptions } from './get_html'; -import { getProcessedText, ProcessTextOptions } from './get_llm_input_text'; +import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text'; -export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions { - // Combined options from both interfaces -} +export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {} export async function urlToLlmText( url: string, options: UrlToLlmTextOptions = {} -): Promise { +): Promise { try { const pageSource = await getPageSource(url, options); if (!pageSource) { - return ''; + return { + markdown: '', + plainText: '', + metadata: { + title: '', + url: url, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0 + } + }; } - const llmText = await getProcessedText(pageSource, url, options); - return llmText; + const result = await getProcessedText(pageSource, url, options); + return result; } catch (error) { console.error('Error while scraping url: ', error); - return ''; + return { + markdown: '', + plainText: '', + metadata: { + title: '', + url: url, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0 + } + }; } } -// Export individual functions as well export { getPageSource, getProcessedText }; \ No newline at end of file From 28f1bf85102acca3daa65738bc3dcc2af1306b89 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 21:52:39 +0530 Subject: [PATCH 10/79] fix: better markdown output --- server/src/markdownify/get_llm_input_text.ts | 523 +++++++++++++------ 1 file changed, 363 insertions(+), 160 deletions(-) diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts index fa0aec6cd..3e600140f 100644 --- a/server/src/markdownify/get_llm_input_text.ts +++ b/server/src/markdownify/get_llm_input_text.ts @@ -11,6 +11,9 @@ export interface ProcessTextOptions { removeStyleTag?: boolean; removeTags?: string[]; formatAsMarkdown?: boolean; + maxContentLength?: number; + preserveLineBreaks?: boolean; + includeMetadata?: boolean; } export interface ProcessedResult { @@ -18,13 +21,22 @@ export interface ProcessedResult { plainText: string; metadata: { title: string; + description: string; url: string; processedAt: string; textLength: number; markdownLength: number; + hasContent: boolean; + language?: string; + wordCount: number; + linkCount: number; + imageCount: number; }; } +// Global cheerio instance for helper functions +let $: cheerio.CheerioAPI; + export async function getProcessedText( pageSource: string, baseUrl: string, @@ -39,16 +51,21 @@ export async function getProcessedText( removeScriptTag = true, removeStyleTag = true, removeTags = [], - formatAsMarkdown = true + formatAsMarkdown = true, + maxContentLength = 100000, + preserveLineBreaks = true, + includeMetadata = true } = options; try { - const $ = cheerio.load(pageSource); + // Initialize cheerio without problematic options + $ = cheerio.load(pageSource); - // Remove unwanted tags + // Remove unwanted tags completely const tagsToRemove: string[] = []; if (removeScriptTag) tagsToRemove.push('script'); if (removeStyleTag) tagsToRemove.push('style'); + if (removeScriptTag) tagsToRemove.push('noscript'); tagsToRemove.push(...removeTags); const uniqueTags = [...new Set(tagsToRemove)]; @@ -56,25 +73,45 @@ export async function getProcessedText( $(tag).remove(); }); - // Extract page title - const title = $('title').text() || $('h1').first().text() || 'Untitled'; + // Remove common unwanted elements + $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); + // Extract metadata + const title = extractTitle(); + const description = extractDescription(); + const language = extractLanguage(); + // Generate both formats const markdown = formatAsMarkdown ? - convertToMarkdown($, baseUrl, options) : - convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled + convertToMarkdown(baseUrl, options) : + ''; - const plainText = convertToPlainText($, baseUrl, options); + const plainText = convertToPlainText(baseUrl, options); + + // Truncate if necessary + const finalMarkdown = markdown.substring(0, maxContentLength); + const finalPlainText = plainText.substring(0, maxContentLength); + + // Count elements + const linkCount = $('a[href]').length; + const imageCount = $('img').length; + const wordCount = countWords(finalPlainText); const result: ProcessedResult = { - markdown, - plainText, + markdown: finalMarkdown, + plainText: finalPlainText, metadata: { - title: title.trim(), + title, + description, url: baseUrl, processedAt: new Date().toISOString(), - textLength: plainText.length, - markdownLength: markdown.length + textLength: finalPlainText.length, + markdownLength: finalMarkdown.length, + hasContent: finalPlainText.length > 0, + language, + wordCount, + linkCount, + imageCount } }; @@ -82,186 +119,321 @@ export async function getProcessedText( } catch (error) { console.error('Error while getting processed text: ', error); - // Return empty result on error - return { - markdown: '', - plainText: '', - metadata: { - title: '', - url: baseUrl, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0 - } - }; + return createEmptyResult(baseUrl); } } -function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks } = options; +function extractTitle(): string { + return $('title').text()?.trim() || + $('meta[property="og:title"]').attr('content')?.trim() || + $('h1').first().text()?.trim() || + 'Untitled'; +} + +function extractDescription(): string { + return $('meta[name="description"]').attr('content')?.trim() || + $('meta[property="og:description"]').attr('content')?.trim() || + ''; +} + +function extractLanguage(): string { + return $('html').attr('lang') || 'en'; +} + +function countWords(text: string): number { + return text.split(/\s+/).filter(word => word.length > 0).length; +} + +function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string { + const { keepImages, keepWebpageLinks, preserveLineBreaks } = options; + // Start with metadata if available + let markdown = ''; + const title = extractTitle(); + if (title && title !== 'Untitled') { + markdown += `# ${title}\n\n`; + } + + const description = extractDescription(); + if (description) { + markdown += `> ${description}\n\n`; + } + // Clone the body to avoid modifying the original const $body = $('body').clone(); - // Process headers - $body.find('h1').each((_, element) => { - const $el = $(element); - $el.replaceWith(`# ${$el.text().trim()}\n\n`); - }); - - $body.find('h2').each((_, element) => { - const $el = $(element); - $el.replaceWith(`## ${$el.text().trim()}\n\n`); - }); - - $body.find('h3').each((_, element) => { - const $el = $(element); - $el.replaceWith(`### ${$el.text().trim()}\n\n`); - }); - - $body.find('h4, h5, h6').each((_, element) => { - const $el = $(element); - const level = element.name?.substring(1) || '4'; - const hashes = '#'.repeat(parseInt(level)); - $el.replaceWith(`${hashes} ${$el.text().trim()}\n\n`); - }); + // Remove unwanted elements from the clone + $body.find('script, style, noscript, meta, link').remove(); + $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - // Process paragraphs - $body.find('p').each((_, element) => { - const $el = $(element); - $el.replaceWith(`${$el.text().trim()}\n\n`); - }); + // Process in order of importance + const sections: string[] = []; + + // Process main content areas first + const contentSelectors = [ + 'main', 'article', '[role="main"]', '.content', '.main', + '#content', '#main', '.post', '.article' + ]; - // Process lists - $body.find('li').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - if ($el.parent().is('ol')) { - $el.replaceWith(`1. ${text}\n`); - } else { - $el.replaceWith(`- ${text}\n`); + let mainContent = ''; + for (const selector of contentSelectors) { + const $content = $body.find(selector).first(); + if ($content.length > 0) { + mainContent = processElementToMarkdown($content, baseUrl, options, 0); + if (mainContent.trim().length > 100) { // Only use if substantial content + sections.push(mainContent); + $content.remove(); // Remove from body to avoid duplication + break; + } } - }); + } - $body.find('ul, ol').each((_, element) => { - const $el = $(element); - $el.replaceWith(`\n${$el.html()}\n\n`); - }); + // Process headers and structure + sections.push(processElementToMarkdown($body, baseUrl, options, 0)); - // Process blockquotes - $body.find('blockquote').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - $el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`); - }); + // Combine sections + markdown += sections.filter(s => s.trim().length > 0).join('\n\n'); - // Process code blocks - $body.find('pre').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - $el.replaceWith(`\`\`\`\n${text}\n\`\`\`\n\n`); - }); + // Final cleanup + markdown = cleanMarkdown(markdown, preserveLineBreaks); + + return markdown; +} - $body.find('code').each((_, element) => { - const $el = $(element); - // Only format inline code that's not inside pre blocks - if (!$el.closest('pre').length) { - const text = $el.text().trim(); - $el.replaceWith(`\`${text}\``); - } - }); +function processElementToMarkdown($element: cheerio.Cheerio, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string { + if (depth > 10) return ''; // Prevent infinite recursion + + const { keepImages, keepWebpageLinks } = options; + let markdown = ''; - // Process images - if (keepImages) { - $body.find('img').each((_, element) => { - const $img = $(element); - const src = $img.attr('src'); - const alt = $img.attr('alt') || ''; - - if (src && !shouldRemoveImage(src, options)) { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`![${alt}](${absoluteUrl})\n\n`); - } else { - $img.remove(); + $element.contents().each((index, node) => { + if (node.type === 'text') { + const text = $(node).text().trim(); + if (text) { + markdown += text + ' '; } - }); - } else { - $body.find('img').remove(); - } + } else if (node.type === 'tag') { + const $node = $(node); + const tagName = node.name?.toLowerCase() || ''; - // Process links - if (keepWebpageLinks) { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - const href = $link.attr('href'); - const text = $link.text().trim(); - - if (href && text) { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`[${text}](${absoluteUrl})`); - } else if (text) { - $link.replaceWith(text); - } else { - $link.remove(); + switch (tagName) { + case 'h1': + markdown += `\n# ${$node.text().trim()}\n\n`; + break; + case 'h2': + markdown += `\n## ${$node.text().trim()}\n\n`; + break; + case 'h3': + markdown += `\n### ${$node.text().trim()}\n\n`; + break; + case 'h4': + markdown += `\n#### ${$node.text().trim()}\n\n`; + break; + case 'h5': + markdown += `\n##### ${$node.text().trim()}\n\n`; + break; + case 'h6': + markdown += `\n###### ${$node.text().trim()}\n\n`; + break; + case 'p': + const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (paragraphText.trim()) { + markdown += `\n${paragraphText.trim()}\n\n`; + } + break; + case 'br': + markdown += '\n'; + break; + case 'hr': + markdown += '\n---\n\n'; + break; + case 'strong': + case 'b': + const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (strongText.trim()) { + markdown += `**${strongText.trim()}**`; + } + break; + case 'em': + case 'i': + const emText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (emText.trim()) { + markdown += `*${emText.trim()}*`; + } + break; + case 'code': + if (!$node.closest('pre').length) { + const codeText = $node.text().trim(); + if (codeText) { + markdown += `\`${codeText}\``; + } + } + break; + case 'pre': + const preText = $node.text().trim(); + if (preText) { + const codeClass = $node.find('code').attr('class'); + const language = codeClass ? codeClass.replace('language-', '') : ''; + markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`; + } + break; + case 'blockquote': + const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (quoteText.trim()) { + const lines = quoteText.trim().split('\n'); + markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n'; + } + break; + case 'ul': + const listItems: string[] = []; + $node.find('> li').each((_, li) => { + const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); + if (itemText.trim()) { + listItems.push(`- ${itemText.trim()}`); + } + }); + if (listItems.length > 0) { + markdown += '\n' + listItems.join('\n') + '\n\n'; + } + break; + case 'ol': + const olItems: string[] = []; + $node.find('> li').each((i, li) => { + const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); + if (itemText.trim()) { + olItems.push(`${i + 1}. ${itemText.trim()}`); + } + }); + if (olItems.length > 0) { + markdown += '\n' + olItems.join('\n') + '\n\n'; + } + break; + case 'a': + if (keepWebpageLinks) { + const href = $node.attr('href'); + const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim(); + if (href && linkText) { + try { + const absoluteUrl = new URL(href, baseUrl).toString(); + markdown += `[${linkText}](${absoluteUrl})`; + } catch { + markdown += linkText; + } + } else if (linkText) { + markdown += linkText; + } + } else { + markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); + } + break; + case 'img': + if (keepImages) { + const src = $node.attr('src'); + const alt = $node.attr('alt') || $node.attr('title') || ''; + if (src && !shouldRemoveImage(src, options)) { + try { + const absoluteUrl = new URL(src, baseUrl).toString(); + markdown += `![${alt}](${absoluteUrl})`; + } catch { + // Ignore invalid URLs + } + } + } + break; + case 'table': + markdown += processTableToMarkdown($node); + break; + case 'div': + case 'section': + case 'article': + case 'header': + case 'footer': + case 'nav': + case 'aside': + // Process block-level elements with their content + const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (blockContent.trim()) { + markdown += `\n${blockContent.trim()}\n\n`; + } + break; + default: + // For other tags, just process their content + markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); + break; } - }); - } else { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - $link.replaceWith($link.text().trim()); - }); - } + } + }); + + return markdown; +} + +function processTableToMarkdown($table: cheerio.Cheerio): string { + const rows: string[][] = []; + let maxColumns = 0; - // Process tables (basic support) - $body.find('table').each((_, element) => { - const $table = $(element); - let markdownTable = '\n'; + $table.find('tr').each((_, row) => { + const $row = $(row); + const cells: string[] = []; - $table.find('tr').each((rowIndex, row) => { - const $row = $(row); - const cells: string[] = []; - - $row.find('th, td').each((_, cell) => { - const $cell = $(cell); - cells.push($cell.text().trim()); - }); + $row.find('th, td').each((_, cell) => { + const $cell = $(cell); + const text = $cell.text().trim(); + const colspan = parseInt($cell.attr('colspan') || '1'); - if (cells.length > 0) { - markdownTable += `| ${cells.join(' | ')} |\n`; - - // Add header separator after first row - if (rowIndex === 0) { - markdownTable += `|${cells.map(() => '---').join('|')}|\n`; - } + cells.push(text); + // Add empty cells for colspan + for (let i = 1; i < colspan; i++) { + cells.push(''); } }); - $table.replaceWith(markdownTable + '\n'); + if (cells.length > 0) { + rows.push(cells); + maxColumns = Math.max(maxColumns, cells.length); + } }); - // Get the final text and clean it up - let markdown = $body.text(); + if (rows.length === 0) return ''; + + let markdownTable = '\n'; - // Clean up excessive whitespace while preserving structure - markdown = cleanMarkdown(markdown); + // Header row + if (rows.length > 0) { + markdownTable += `| ${rows[0].join(' | ')} |\n`; + markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`; + + // Data rows + for (let i = 1; i < rows.length; i++) { + markdownTable += `| ${rows[i].join(' | ')} |\n`; + } + } - return markdown; + return markdownTable + '\n'; } -function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { +function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string { const { keepImages, keepWebpageLinks } = options; const $body = $('body').clone(); + // Remove unwanted elements + $body.find('script, style, noscript, meta, link').remove(); + $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); + // Process images if (keepImages) { $body.find('img').each((_, element) => { const $img = $(element); const src = $img.attr('src'); + const alt = $img.attr('alt') || ''; if (src && !shouldRemoveImage(src, options)) { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`\nImage: ${absoluteUrl}\n`); + try { + const absoluteUrl = new URL(src, baseUrl).toString(); + $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`); + } catch { + $img.remove(); + } } else { $img.remove(); } @@ -278,8 +450,12 @@ function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: Pro const text = $link.text().trim(); if (href && text) { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`${text}: ${absoluteUrl} `); + try { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith(`${text} (${absoluteUrl})`); + } catch { + $link.replaceWith(text); + } } }); } else { @@ -303,25 +479,52 @@ function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean { if (removeGifImage) imageTypesToRemove.push('.gif'); imageTypesToRemove.push(...removeImageTypes); - return imageTypesToRemove.some(type => src.includes(type)); + return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase())); } -function cleanMarkdown(markdown: string): string { +function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string { return markdown - // Replace 3+ newlines with 2 newlines + // Normalize line breaks + .replace(/\r\n/g, '\n') + // Remove excessive empty lines (keep max 2) .replace(/\n{3,}/g, '\n\n') - // Remove excessive spaces - .replace(/[ ]{2,}/g, ' ') - // Clean up space around headers + // Clean up spaces around headers .replace(/\n\s*(#+)\s*/g, '\n$1 ') + // Remove spaces at start of lines + .replace(/^\s+/gm, '') // Remove trailing whitespace .replace(/[ \t]+$/gm, '') + // Fix multiple spaces + .replace(/[ ]{2,}/g, ' ') + // Ensure proper spacing after paragraphs + .replace(/([^\n])\n([^\n])/g, '$1\n\n$2') .trim(); } function cleanText(text: string): string { return text + .replace(/\r\n/g, '\n') .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') + .replace(/[ ]{2,}/g, ' ') .trim(); +} + +function createEmptyResult(url: string): ProcessedResult { + return { + markdown: '', + plainText: '', + metadata: { + title: '', + description: '', + url: url, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0, + hasContent: false, + wordCount: 0, + linkCount: 0, + imageCount: 0 + } + }; } \ No newline at end of file From 1651763fc288c0c8b663294ab05b7d3b40326580 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 21:53:04 +0530 Subject: [PATCH 11/79] fix: better markdown output --- server/src/markdownify/get_llm_ready_text.ts | 46 ++++++++++---------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts index ed7849ec3..025fb52d6 100644 --- a/server/src/markdownify/get_llm_ready_text.ts +++ b/server/src/markdownify/get_llm_ready_text.ts @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: MIT + import { getPageSource, GetPageSourceOptions } from './get_html'; import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text'; @@ -11,17 +13,7 @@ export async function urlToLlmText( const pageSource = await getPageSource(url, options); if (!pageSource) { - return { - markdown: '', - plainText: '', - metadata: { - title: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0 - } - }; + return createEmptyResult(url); } const result = await getProcessedText(pageSource, url, options); @@ -29,18 +21,28 @@ export async function urlToLlmText( } catch (error) { console.error('Error while scraping url: ', error); - return { - markdown: '', - plainText: '', - metadata: { - title: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0 - } - }; + return createEmptyResult(url); } } +function createEmptyResult(url: string): ProcessedResult { + return { + markdown: '', + plainText: '', + metadata: { + title: '', + description: '', + url: url, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0, + hasContent: false, + language: 'en', + wordCount: 0, + linkCount: 0, + imageCount: 0 + } + }; +} + export { getPageSource, getProcessedText }; \ No newline at end of file From 6e6d6c68011533b42b886989726ad5830291b293 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 18 Nov 2025 23:27:22 +0530 Subject: [PATCH 12/79] chore(deps): install cheerio, turndown --- package.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/package.json b/package.json index c70b0fef5..0a8a8f6c7 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "bcrypt": "^5.1.1", "body-parser": "^1.20.3", "buffer": "^6.0.3", + "cheerio": "^1.1.2", "connect-pg-simple": "^10.0.0", "cookie-parser": "^1.4.6", "cors": "^2.8.5", @@ -80,6 +81,7 @@ "styled-components": "^5.3.3", "swagger-jsdoc": "^6.2.8", "swagger-ui-express": "^5.0.1", + "turndown": "^7.2.2", "typedoc": "^0.23.8", "typescript": "^5.0.0", "uuid": "^8.3.2", @@ -126,6 +128,7 @@ "@types/styled-components": "^5.1.23", "@types/swagger-jsdoc": "^6.0.4", "@types/swagger-ui-express": "^4.1.6", + "@types/turndown": "^5.0.6", "@vitejs/plugin-react": "^4.3.3", "ajv": "^8.8.2", "concurrently": "^7.0.0", From f22f6ef83daf13bd5c254c11527c786cff70dec5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 18 Nov 2025 23:41:27 +0530 Subject: [PATCH 13/79] debug(temporary): turndown x amzn --- server/src/markdownify/debug_turndown.ts | 132 +++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 server/src/markdownify/debug_turndown.ts diff --git a/server/src/markdownify/debug_turndown.ts b/server/src/markdownify/debug_turndown.ts new file mode 100644 index 000000000..1d62b109f --- /dev/null +++ b/server/src/markdownify/debug_turndown.ts @@ -0,0 +1,132 @@ +import { getPageSource } from './get_html'; +import { getProcessedText } from './get_llm_input_text'; +import * as cheerio from 'cheerio'; +import TurndownService from 'turndown'; + +async function debugTurndown() { + const testUrls = [ + "https://amazon.com/", + ]; + + for (const url of testUrls) { + console.log(`\n${'='.repeat(70)}`); + console.log(`πŸ” Testing URL: ${url}`); + console.log(`${'='.repeat(70)}`); + + try { + const pageSource = await getPageSource(url, { + wait: 3.0, // Longer wait time + timeout: 15000 // 15 second timeout + }); + + if (!pageSource || pageSource.length < 100) { + console.error("❌ No page source received or content too short"); + continue; + } + + // Save raw HTML for inspection + const fs = await import('fs/promises'); + const domain = new URL(url).hostname; + await fs.writeFile(`debug_${domain}_raw.html`, pageSource); + console.log(`πŸ’Ύ Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`); + + // Parse with cheerio + const $ = cheerio.load(pageSource); + + // Check what's in the body + const bodyText = $('body').text(); + console.log(`πŸ“„ Body text length: ${bodyText.length} chars`); + console.log(`πŸ“„ Body preview: ${bodyText.substring(0, 200)}...`); + + // Test content extraction + const contentSelectors = [ + 'main', 'article', '[role="main"]', '.content', '.main-content', + '#content', '#main', '.post', '.article' + ]; + + let mainContent: cheerio.Cheerio = $('body'); + let foundSelector = 'body (fallback)'; + + for (const selector of contentSelectors) { + const $content = $(selector).first(); + if ($content.length > 0 && $content.text().trim().length > 10) { + console.log(`βœ… Found content with selector: ${selector}`); + console.log(`πŸ“ Content text length: ${$content.text().length}`); + mainContent = $content; + foundSelector = selector; + break; + } + } + + console.log(`🎯 Using content from: ${foundSelector}`); + + // Test Turndown directly + console.log("\nπŸ§ͺ Testing Turndown directly..."); + const turndownService = new TurndownService(); + + if (mainContent.length > 0) { + const contentHtml = mainContent.html() || ''; + if (contentHtml && contentHtml.length > 10) { + console.log(`πŸ“¦ Content HTML length: ${contentHtml.length} chars`); + + try { + const contentMarkdown = turndownService.turndown(contentHtml); + console.log(`πŸ“ Turndown result length: ${contentMarkdown.length} chars`); + + if (contentMarkdown.length > 0) { + console.log(`πŸ“ Markdown preview: ${contentMarkdown.substring(0, 300)}...`); + await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown); + console.log(`πŸ’Ύ Turndown output saved to debug_${domain}_turndown.md`); + } else { + console.log("❌ Turndown produced empty markdown"); + } + } catch (turndownError) { + console.error("❌ Turndown conversion failed:", turndownError); + } + } else { + console.log("❌ No HTML content found for Turndown"); + } + } + + // Test our full function + console.log("\nπŸ§ͺ Testing full getProcessedText function..."); + const result = await getProcessedText(pageSource, url, { + keepImages: true, + keepWebpageLinks: true, + removeScriptTag: true, + removeStyleTag: true, + formatAsMarkdown: true + }); + + console.log("πŸ“Š Result metadata:"); + console.log(`- Markdown length: ${result.metadata.markdownLength} chars`); + console.log(`- Plain text length: ${result.metadata.textLength} chars`); + console.log(`- Has content: ${result.metadata.hasContent}`); + console.log(`- Content score: ${result.metadata.contentScore}/10`); + + if (result.markdown && result.markdown.length > 0) { + console.log(`πŸ“„ Markdown preview (300 chars):`); + console.log(result.markdown.substring(0, 300) + '...'); + await fs.writeFile(`debug_${domain}_full.md`, result.markdown); + console.log(`πŸ’Ύ Full output saved to debug_${domain}_full.md`); + } else { + console.log("❌ Empty markdown from full function"); + + // Debug why it's empty + if (result.plainText && result.plainText.length > 0) { + console.log("ℹ️ But plain text has content, so markdown conversion failed"); + await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText); + console.log(`πŸ’Ύ Plain text saved to debug_${domain}_plain.txt`); + } + } + + } catch (error) { + console.error(`πŸ’₯ Error processing ${url}:`, error); + } + + // Small delay between requests + await new Promise(resolve => setTimeout(resolve, 1000)); + } +} + +debugTurndown().catch(console.error); \ No newline at end of file From 0fa5397b45e42d2280734b7a436cef0d1d9a53c9 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 18 Nov 2025 23:42:20 +0530 Subject: [PATCH 14/79] debug(temporary): test url -> llm text --- server/src/markdownify/test.ts | 73 ++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 server/src/markdownify/test.ts diff --git a/server/src/markdownify/test.ts b/server/src/markdownify/test.ts new file mode 100644 index 000000000..16e2a2855 --- /dev/null +++ b/server/src/markdownify/test.ts @@ -0,0 +1,73 @@ +import { urlToLlmText } from './get_llm_ready_text'; + +async function demoDualOutput() { + const testUrls = [ + "https://quotes.toscrape.com/", + "https://httpbin.org/html", + "https://example.com", + "https://amazon.com" + ]; + + for (const url of testUrls) { + console.log(`\n${'='.repeat(70)}`); + console.log(`Processing: ${url}`); + console.log(`${'='.repeat(70)}`); + + try { + const result = await urlToLlmText(url, { + keepImages: true, + keepWebpageLinks: true, + removeScriptTag: true, + removeStyleTag: true, + formatAsMarkdown: true + }); + + console.log(`\n METADATA:`); + console.log(`Title: ${result.metadata.title}`); + console.log(`URL: ${result.metadata.url}`); + console.log(`Processed: ${result.metadata.processedAt}`); + console.log(`Plain text length: ${result.metadata.textLength} chars`); + console.log(`Markdown length: ${result.metadata.markdownLength} chars`); + console.log(`Content Score: ${result.metadata.contentScore}/10`); + + console.log(`\nPLAIN TEXT (first 600 chars):`); + console.log(`${result.plainText.substring(0, 600)}${result.plainText.length > 600 ? '...' : ''}`); + + console.log(`\nMARKDOWN (first 600 chars):`); + console.log(`${result.markdown.substring(0, 600)}${result.markdown.length > 600 ? '...' : ''}`); + + // Save both formats + const domain = new URL(url).hostname; + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + + await saveToFile(result.plainText, `output/${domain}_${timestamp}_plain.txt`); + await saveToFile(result.markdown, `output/${domain}_${timestamp}_markdown.md`); + + // Save metadata as JSON + await saveToFile(JSON.stringify(result.metadata, null, 2), `output/${domain}_${timestamp}_metadata.json`); + + console.log(`\nSaved to output/ directory`); + + } catch (error) { + console.error(`Error processing ${url}:`, error); + } + } +} + +async function saveToFile(content: string, filename: string) { + const fs = await import('fs/promises'); + const path = await import('path'); + + try { + // Create directory if it doesn't exist + const dir = path.dirname(filename); + await fs.mkdir(dir, { recursive: true }); + + await fs.writeFile(filename, content, 'utf-8'); + } catch (error) { + console.error(`Error saving to ${filename}:`, error); + } +} + +// Run the demo +demoDualOutput().catch(console.error); \ No newline at end of file From 4158896e3c20d1449f5d167c4529aad81615b510 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 22:34:18 +0530 Subject: [PATCH 15/79] chore: link replace --- server/src/markdownify/html-to-markdown/go.mod | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 server/src/markdownify/html-to-markdown/go.mod diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod new file mode 100644 index 000000000..ada1bf6dc --- /dev/null +++ b/server/src/markdownify/html-to-markdown/go.mod @@ -0,0 +1,18 @@ +module html-to-markdown + +go 1.23.0 + +toolchain go1.24.0 + +require ( + github.com/PuerkitoBio/goquery v1.10.3 + github.com/getmaxun/html-to-markdown/v2 v2.0.6 + golang.org/x/net v0.43.0 +) + +require ( + github.com/JohannesKaufmann/dom v0.2.0 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect +) + +replace github.com/JohannesKaufmann/html-to-markdown/v2 => github.com/getmaxun/html-to-markdown/v2 v2.0.0 From 6c8850a0a7f674e11f8250189753ddf1ac2bb2f3 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 22:35:25 +0530 Subject: [PATCH 16/79] chore: link replace --- .../src/markdownify/html-to-markdown/go.sum | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 server/src/markdownify/html-to-markdown/go.sum diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum new file mode 100644 index 000000000..f5cc5e695 --- /dev/null +++ b/server/src/markdownify/html-to-markdown/go.sum @@ -0,0 +1,83 @@ +github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ= +github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo= +github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= +github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/getmaxun/html-to-markdown/v2 v2.0.6 h1:SXoxwR0TCCggwdEAlarhsrvMBHIbD9YqVTSkCYnZukc= +github.com/getmaxun/html-to-markdown/v2 v2.0.6/go.mod h1:FjUN4bMyWtxmt2EpnEEOb5zu/GUSRk3PIr5ADTE4GBg= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sebdah/goldie/v2 v2.7.1 h1:PkBHymaYdtvEkZV7TmyqKxdmn5/Vcj+8TpATWZjnG5E= +github.com/sebdah/goldie/v2 v2.7.1/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA= +github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From 7da464755d0dd77dedacc827e9f1d389ef514f32 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 22:50:46 +0530 Subject: [PATCH 17/79] wip: to markdown --- .../html-to-markdown/html-to-markdown.go | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 server/src/markdownify/html-to-markdown/html-to-markdown.go diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go new file mode 100644 index 000000000..dce011c13 --- /dev/null +++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go @@ -0,0 +1,157 @@ +package main + +/* +#include +*/ +import "C" + +import ( + "strings" + "unsafe" + "unicode/utf8" + + "github.com/PuerkitoBio/goquery" + md "github.com/getmaxun/html-to-markdown/v2" + "github.com/getmaxun/html-to-markdown/v2/plugin" + converter "github.com/getmaxun/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. +func ConvertHTMLToMarkdown(input *C.char) *C.char { + engine := converter.NewConverter("", true, nil) + // engine.Use(plugin.GitHubFlavored()) + + registerPreHandler(engine) + + result, err := engine.ConvertString(C.GoString(input)) + if err != nil { + // swallow conversion error (same as original) + } + + return C.CString(result) +} + +//export FreeCString +// Frees C string memory. +func FreeCString(str *C.char) { + C.free(unsafe.Pointer(str)) +} + +func main() { + // Required empty main for CGO. +} + +// registerPreHandler configures a specialized PRE/code block rule +// to properly extract nested content and detect languages. +func registerPreHandler(conv *converter.Converter) { + isNoiseNode := func(class string) bool { + l := strings.ToLower(class) + return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers") + } + + findLanguage := func(sel *goquery.Selection) string { + cls := strings.ToLower(sel.AttrOr("class", "")) + for _, chunk := range strings.Fields(cls) { + if strings.HasPrefix(chunk, "language-") { + return strings.TrimPrefix(chunk, "language-") + } + if strings.HasPrefix(chunk, "lang-") { + return strings.TrimPrefix(chunk, "lang-") + } + } + return "" + } + + // Walk nodes and extract visible text, injecting newlines at block boundaries. + var scrape func(n *html.Node, out *strings.Builder) + scrape = func(n *html.Node, out *strings.Builder) { + if n == nil { + return + } + + switch n.Type { + case html.TextNode: + out.WriteString(n.Data) + + case html.ElementNode: + tag := strings.ToLower(n.Data) + + // skip gutter/line number elements + for _, attr := range n.Attr { + if attr.Key == "class" && isNoiseNode(attr.Val) { + return + } + } + + if tag == "br" { + out.WriteString("\n") + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + scrape(child, out) + } + + switch tag { + case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot", + "section", "article", "blockquote", "pre", + "h1", "h2", "h3", "h4", "h5", "h6": + out.WriteString("\n") + } + } + } + + // PRE blocks + conv.AddRules(md.Rule{ + Filter: []string{"pre"}, + Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { + codeTag := s.Find("code").First() + lang := findLanguage(codeTag) + if lang == "" { + lang = findLanguage(s) + } + + var buf strings.Builder + for _, node := range s.Nodes { + scrape(node, &buf) + } + + raw := strings.TrimRight(buf.String(), "\n") + + fRune, _ := utf8.DecodeRuneInString(opt.Fence) + fence := md.CalculateCodeFence(fRune, raw) + + block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n" + return md.String(block) + }, + }) + + // Inline code rule + conv.AddRules(md.Rule{ + Filter: []string{"code"}, + Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { + // do nothing when inside PRE + if s.ParentsFiltered("pre").Length() > 0 { + return nil + } + + var buf strings.Builder + for _, node := range s.Nodes { + scrape(node, &buf) + } + + text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n")) + + fence := "`" + if strings.Contains(text, "`") { + fence = "``" + if strings.Contains(text, "``") { + fence = "```" + } + } + + inline := fence + text + fence + return md.String(inline) + }, + }) +} From dd1a9a8a85ffac8df612b163bd7d0cb952c60c12 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 23:42:44 +0530 Subject: [PATCH 18/79] chore(deps): install koffi --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 0a8a8f6c7..330db1e3d 100644 --- a/package.json +++ b/package.json @@ -49,6 +49,7 @@ "joi": "^17.6.0", "jsonwebtoken": "^9.0.2", "jwt-decode": "^4.0.0", + "koffi": "^2.14.1", "lodash": "^4.17.21", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", From ec49565c44b81dfe6ae4af0757b47efd7027b8bc Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 23:59:14 +0530 Subject: [PATCH 19/79] chore: ignore build files --- server/src/markdownify/html-to-markdown/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 server/src/markdownify/html-to-markdown/.gitignore diff --git a/server/src/markdownify/html-to-markdown/.gitignore b/server/src/markdownify/html-to-markdown/.gitignore new file mode 100644 index 000000000..909db1866 --- /dev/null +++ b/server/src/markdownify/html-to-markdown/.gitignore @@ -0,0 +1,3 @@ +html-to-markdown.* +!html-to-markdown.go +libhtml-to-markdown.* \ No newline at end of file From f0d6712c3e45812206fadf3c35352e9b9422fca8 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 23:59:33 +0530 Subject: [PATCH 20/79] chore: build --- server/src/markdownify/html-to-markdown/go.mod | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod index ada1bf6dc..f1d4f8774 100644 --- a/server/src/markdownify/html-to-markdown/go.mod +++ b/server/src/markdownify/html-to-markdown/go.mod @@ -6,13 +6,13 @@ toolchain go1.24.0 require ( github.com/PuerkitoBio/goquery v1.10.3 - github.com/getmaxun/html-to-markdown/v2 v2.0.6 + github.com/getmaxun/html-to-markdown v1.0.1 golang.org/x/net v0.43.0 ) require ( - github.com/JohannesKaufmann/dom v0.2.0 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect ) -replace github.com/JohannesKaufmann/html-to-markdown/v2 => github.com/getmaxun/html-to-markdown/v2 v2.0.0 +replace github.com/JohannesKaufmann/html-to-markdown => github.com/getmaxun/html-to-markdown v1.0.1 From da48d46f2a6bb2bda5375b522637e328241df6f7 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 23:59:39 +0530 Subject: [PATCH 21/79] chore: build --- .../src/markdownify/html-to-markdown/go.sum | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum index f5cc5e695..d2fc77e55 100644 --- a/server/src/markdownify/html-to-markdown/go.sum +++ b/server/src/markdownify/html-to-markdown/go.sum @@ -1,21 +1,19 @@ -github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ= -github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo= github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= -github.com/getmaxun/html-to-markdown/v2 v2.0.6 h1:SXoxwR0TCCggwdEAlarhsrvMBHIbD9YqVTSkCYnZukc= -github.com/getmaxun/html-to-markdown/v2 v2.0.6/go.mod h1:FjUN4bMyWtxmt2EpnEEOb5zu/GUSRk3PIr5ADTE4GBg= +github.com/getmaxun/html-to-markdown v1.0.1 h1:ter2Nby2EeYx0ichgZ/Pc6uo3aBudfPPgPicLTh02VI= +github.com/getmaxun/html-to-markdown v1.0.1/go.mod h1:ggHEOofo3wcKaTOuD/z/pf3KJnQns0nQV+Gy/R8iE3U= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sebdah/goldie/v2 v2.7.1 h1:PkBHymaYdtvEkZV7TmyqKxdmn5/Vcj+8TpATWZjnG5E= -github.com/sebdah/goldie/v2 v2.7.1/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= -github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= -github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= +github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= +github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= +github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA= -github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= +github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= +github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= @@ -81,3 +79,7 @@ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= From 713d37465dec19420cc305abf74db5385251dd40 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 00:00:56 +0530 Subject: [PATCH 22/79] feat: to markdown --- .../markdownify/html-to-markdown/html-to-markdown.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go index dce011c13..8d4202a66 100644 --- a/server/src/markdownify/html-to-markdown/html-to-markdown.go +++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go @@ -11,16 +11,16 @@ import ( "unicode/utf8" "github.com/PuerkitoBio/goquery" - md "github.com/getmaxun/html-to-markdown/v2" - "github.com/getmaxun/html-to-markdown/v2/plugin" - converter "github.com/getmaxun/html-to-markdown/v2/converter" + md "github.com/getmaxun/html-to-markdown" + "github.com/getmaxun/html-to-markdown/plugin" "golang.org/x/net/html" ) // ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. +// Function name changed, comment rewritten. func ConvertHTMLToMarkdown(input *C.char) *C.char { - engine := converter.NewConverter("", true, nil) - // engine.Use(plugin.GitHubFlavored()) + engine := md.NewConverter("", true, nil) + engine.Use(plugin.GitHubFlavored()) registerPreHandler(engine) @@ -44,7 +44,7 @@ func main() { // registerPreHandler configures a specialized PRE/code block rule // to properly extract nested content and detect languages. -func registerPreHandler(conv *converter.Converter) { +func registerPreHandler(conv *md.Converter) { isNoiseNode := func(class string) bool { l := strings.ToLower(class) return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers") From 6c93cbc9a2d462bcc1315a8fa1d8699145d3dc70 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 02:42:44 +0530 Subject: [PATCH 23/79] feat: html -> markdown --- server/src/markdownify/markdown.ts | 141 +++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 server/src/markdownify/markdown.ts diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts new file mode 100644 index 000000000..8666f785a --- /dev/null +++ b/server/src/markdownify/markdown.ts @@ -0,0 +1,141 @@ +import koffi from "koffi"; +import dotenv from "dotenv"; +import { stat } from "fs/promises"; +import path from "node:path"; +import os from "node:os"; + +const exts = { + win32: ".dll", + darwin: ".dylib", + default: ".so", +}; + +const ext = + exts[os.platform() as keyof typeof exts] || exts.default; + +// Build path to the binary **inside the same folder** +export const GO_MARKDOWN_PARSER_PATH = path.join( + __dirname, + `html-to-markdown${ext}` +); + +dotenv.config(); + +// --------------------------------------------- +// Native Go binding wrapper +// --------------------------------------------- +class NativeMarkdownBridge { + private static singleton: NativeMarkdownBridge; + private fnConvert: any; + + private constructor() { + const lib = koffi.load(GO_MARKDOWN_PARSER_PATH); + + const freeFn = lib.func("FreeCString", "void", ["string"]); + const trackedType = "CString:" + crypto.randomUUID(); + const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn); + + this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [ + "string", + ]); + } + + static async load(): Promise { + if (!NativeMarkdownBridge.singleton) { + try { + await stat(GO_MARKDOWN_PARSER_PATH); + } catch { + throw new Error("Go shared library not found"); + } + NativeMarkdownBridge.singleton = new NativeMarkdownBridge(); + } + return NativeMarkdownBridge.singleton; + } + + async run(html: string): Promise { + return new Promise((resolve, reject) => { + this.fnConvert.async(html, (err: Error, output: string) => { + err ? reject(err) : resolve(output); + }); + }); + } +} + +// --------------------------------------------- +// Main exposed function +// --------------------------------------------- +export async function parseMarkdown( + html: string | null | undefined, +): Promise { + if (!html) return ""; + + // Try Go library first (if enabled) + try { + const engine = await NativeMarkdownBridge.load(); + let md = await engine.run(html); + + md = fixBrokenLinks(md); + md = stripSkipLinks(md); + + return md; + } catch (err: any) { + if (err?.message !== "Go shared library not found") { + console.log("Go markdown parser failed, falling back to JS parser:", err); + } else { + console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH }); + } + } + + // Fallback parser + const TurndownService = require("turndown"); + const { gfm } = require("joplin-turndown-plugin-gfm"); + + const t = new TurndownService(); + t.addRule("inlineLink", { + filter: (node: any, opts: any) => + opts.linkStyle === "inlined" && + node.nodeName === "A" && + node.getAttribute("href"), + replacement: (content: string, node: any) => { + const href = node.getAttribute("href").trim(); + const title = node.title ? ` "${node.title}"` : ""; + return `[${content.trim()}](${href}${title})\n`; + }, + }); + + t.use(gfm); + + try { + let out = await t.turndown(html); + out = fixBrokenLinks(out); + out = stripSkipLinks(out); + return out; + } catch (err) { + console.error("HTMLβ†’Markdown failed", { err }); + return ""; + } +} + +// --------------------------------------------- +// Helpers +// --------------------------------------------- +function fixBrokenLinks(md: string): string { + let depth = 0; + let result = ""; + + for (const ch of md) { + if (ch === "[") depth++; + if (ch === "]") depth = Math.max(0, depth - 1); + + if (depth > 0 && ch === "\n") { + result += "\\\n"; + } else { + result += ch; + } + } + return result; +} + +function stripSkipLinks(md: string): string { + return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, ""); +} From 0837ac50b90b6da79c1aca08055b6dc4351c5f4e Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 02:44:05 +0530 Subject: [PATCH 24/79] fix: go parser path --- server/src/markdownify/markdown.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 8666f785a..c4709e6e5 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -16,6 +16,7 @@ const ext = // Build path to the binary **inside the same folder** export const GO_MARKDOWN_PARSER_PATH = path.join( __dirname, + "html-to-markdown", `html-to-markdown${ext}` ); From 66d829128293c08224a35f5bf108ec7c0ac69a3c Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 02:47:20 +0530 Subject: [PATCH 25/79] fix: export convert fxn --- server/src/markdownify/html-to-markdown/html-to-markdown.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go index 8d4202a66..43c658a2f 100644 --- a/server/src/markdownify/html-to-markdown/html-to-markdown.go +++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go @@ -16,9 +16,9 @@ import ( "golang.org/x/net/html" ) -// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. -// Function name changed, comment rewritten. +//export ConvertHTMLToMarkdown func ConvertHTMLToMarkdown(input *C.char) *C.char { + // ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. engine := md.NewConverter("", true, nil) engine.Use(plugin.GitHubFlavored()) From 1d65f900339831d081a336bd44998442a6f24f97 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:01:18 +0530 Subject: [PATCH 26/79] feat: use parser to scrape --- server/src/markdownify/scrape.ts | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 server/src/markdownify/scrape.ts diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts new file mode 100644 index 000000000..39d1fa3a3 --- /dev/null +++ b/server/src/markdownify/scrape.ts @@ -0,0 +1,57 @@ +import { chromium } from "playwright"; +import { parseMarkdown } from "./markdown"; + +/** + * Fetches a webpage, strips scripts/styles/images/etc, + * returns clean Markdown using parser. + */ +export async function convertPageToMarkdown(url: string): Promise { + const browser = await chromium.launch(); + const page = await browser.newPage(); + + await page.goto(url, { waitUntil: "networkidle" }); + + await page.addInitScript(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); + }); + + // Remove inline event handlers (onclick, onload…) + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); + }); + }); + + // Re-extract HTML after cleanup + const cleanedHtml = await page.evaluate(() => { + return document.documentElement.outerHTML; + }); + + await browser.close(); + + // Convert cleaned HTML β†’ Markdown + const markdown = await parseMarkdown(cleanedHtml || ""); + return markdown; +} From 3fd9bb5e0ea521d64e01998887ddb291ef718340 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:01:42 +0530 Subject: [PATCH 27/79] chore(debug): test --- server/src/markdownify/test.ts | 77 +++------------------------------- 1 file changed, 5 insertions(+), 72 deletions(-) diff --git a/server/src/markdownify/test.ts b/server/src/markdownify/test.ts index 16e2a2855..48db37dc2 100644 --- a/server/src/markdownify/test.ts +++ b/server/src/markdownify/test.ts @@ -1,73 +1,6 @@ -import { urlToLlmText } from './get_llm_ready_text'; +import { convertPageToMarkdown } from "./scrape"; -async function demoDualOutput() { - const testUrls = [ - "https://quotes.toscrape.com/", - "https://httpbin.org/html", - "https://example.com", - "https://amazon.com" - ]; - - for (const url of testUrls) { - console.log(`\n${'='.repeat(70)}`); - console.log(`Processing: ${url}`); - console.log(`${'='.repeat(70)}`); - - try { - const result = await urlToLlmText(url, { - keepImages: true, - keepWebpageLinks: true, - removeScriptTag: true, - removeStyleTag: true, - formatAsMarkdown: true - }); - - console.log(`\n METADATA:`); - console.log(`Title: ${result.metadata.title}`); - console.log(`URL: ${result.metadata.url}`); - console.log(`Processed: ${result.metadata.processedAt}`); - console.log(`Plain text length: ${result.metadata.textLength} chars`); - console.log(`Markdown length: ${result.metadata.markdownLength} chars`); - console.log(`Content Score: ${result.metadata.contentScore}/10`); - - console.log(`\nPLAIN TEXT (first 600 chars):`); - console.log(`${result.plainText.substring(0, 600)}${result.plainText.length > 600 ? '...' : ''}`); - - console.log(`\nMARKDOWN (first 600 chars):`); - console.log(`${result.markdown.substring(0, 600)}${result.markdown.length > 600 ? '...' : ''}`); - - // Save both formats - const domain = new URL(url).hostname; - const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); - - await saveToFile(result.plainText, `output/${domain}_${timestamp}_plain.txt`); - await saveToFile(result.markdown, `output/${domain}_${timestamp}_markdown.md`); - - // Save metadata as JSON - await saveToFile(JSON.stringify(result.metadata, null, 2), `output/${domain}_${timestamp}_metadata.json`); - - console.log(`\nSaved to output/ directory`); - - } catch (error) { - console.error(`Error processing ${url}:`, error); - } - } -} - -async function saveToFile(content: string, filename: string) { - const fs = await import('fs/promises'); - const path = await import('path'); - - try { - // Create directory if it doesn't exist - const dir = path.dirname(filename); - await fs.mkdir(dir, { recursive: true }); - - await fs.writeFile(filename, content, 'utf-8'); - } catch (error) { - console.error(`Error saving to ${filename}:`, error); - } -} - -// Run the demo -demoDualOutput().catch(console.error); \ No newline at end of file +(async () => { + const md = await convertPageToMarkdown("https://quotes.toscrape.com/"); + console.log(md); +})(); From 1a291c22b6b70aa0c65ca88b884b7a6be7a04b13 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:38:19 +0530 Subject: [PATCH 28/79] chore: cleanup --- server/src/markdownify/get_html.ts | 55 -- server/src/markdownify/get_llm_input_text.ts | 530 ------------------- server/src/markdownify/get_llm_ready_text.ts | 48 -- 3 files changed, 633 deletions(-) delete mode 100644 server/src/markdownify/get_html.ts delete mode 100644 server/src/markdownify/get_llm_input_text.ts delete mode 100644 server/src/markdownify/get_llm_ready_text.ts diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts deleted file mode 100644 index dbf6a8a93..000000000 --- a/server/src/markdownify/get_html.ts +++ /dev/null @@ -1,55 +0,0 @@ -import { chromium, Browser, BrowserContext, Page } from 'playwright'; - -export interface GetPageSourceOptions { - wait?: number; - headless?: boolean; - userAgent?: string; -} - -export async function getPageSource( - url: string, - options: GetPageSourceOptions = {} -): Promise { - const { - wait = 1.5, - headless = true, - userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166" - } = options; - - let browser: Browser | null = null; - let context: BrowserContext | null = null; - let page: Page | null = null; - - try { - browser = await chromium.launch({ - headless, - args: ['--no-sandbox', '--disable-dev-shm-usage'] - }); - - context = await browser.newContext({ userAgent }); - page = await context.newPage(); - - // Convert wait time to milliseconds - const waitMs = wait * 1000; - - // Set default timeout and navigate to URL - await page.setDefaultTimeout(waitMs); - await page.goto(url, { waitUntil: 'domcontentloaded' }); - - // Wait for additional time if specified - if (waitMs > 0) { - await page.waitForTimeout(waitMs); - } - - const pageSource = await page.content(); - return pageSource; - - } catch (error) { - console.error('Error while getting page source: ', error); - return ''; // Explicitly return empty string on error - } finally { - if (page) await page.close(); - if (context) await context.close(); - if (browser) await browser.close(); - } -} \ No newline at end of file diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts deleted file mode 100644 index 3e600140f..000000000 --- a/server/src/markdownify/get_llm_input_text.ts +++ /dev/null @@ -1,530 +0,0 @@ -import * as cheerio from 'cheerio'; -import { URL } from 'url'; - -export interface ProcessTextOptions { - keepImages?: boolean; - removeSvgImage?: boolean; - removeGifImage?: boolean; - removeImageTypes?: string[]; - keepWebpageLinks?: boolean; - removeScriptTag?: boolean; - removeStyleTag?: boolean; - removeTags?: string[]; - formatAsMarkdown?: boolean; - maxContentLength?: number; - preserveLineBreaks?: boolean; - includeMetadata?: boolean; -} - -export interface ProcessedResult { - markdown: string; - plainText: string; - metadata: { - title: string; - description: string; - url: string; - processedAt: string; - textLength: number; - markdownLength: number; - hasContent: boolean; - language?: string; - wordCount: number; - linkCount: number; - imageCount: number; - }; -} - -// Global cheerio instance for helper functions -let $: cheerio.CheerioAPI; - -export async function getProcessedText( - pageSource: string, - baseUrl: string, - options: ProcessTextOptions = {} -): Promise { - const { - keepImages = true, - removeSvgImage = true, - removeGifImage = true, - removeImageTypes = [], - keepWebpageLinks = true, - removeScriptTag = true, - removeStyleTag = true, - removeTags = [], - formatAsMarkdown = true, - maxContentLength = 100000, - preserveLineBreaks = true, - includeMetadata = true - } = options; - - try { - // Initialize cheerio without problematic options - $ = cheerio.load(pageSource); - - // Remove unwanted tags completely - const tagsToRemove: string[] = []; - if (removeScriptTag) tagsToRemove.push('script'); - if (removeStyleTag) tagsToRemove.push('style'); - if (removeScriptTag) tagsToRemove.push('noscript'); - tagsToRemove.push(...removeTags); - - const uniqueTags = [...new Set(tagsToRemove)]; - uniqueTags.forEach(tag => { - $(tag).remove(); - }); - - // Remove common unwanted elements - $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Extract metadata - const title = extractTitle(); - const description = extractDescription(); - const language = extractLanguage(); - - // Generate both formats - const markdown = formatAsMarkdown ? - convertToMarkdown(baseUrl, options) : - ''; - - const plainText = convertToPlainText(baseUrl, options); - - // Truncate if necessary - const finalMarkdown = markdown.substring(0, maxContentLength); - const finalPlainText = plainText.substring(0, maxContentLength); - - // Count elements - const linkCount = $('a[href]').length; - const imageCount = $('img').length; - const wordCount = countWords(finalPlainText); - - const result: ProcessedResult = { - markdown: finalMarkdown, - plainText: finalPlainText, - metadata: { - title, - description, - url: baseUrl, - processedAt: new Date().toISOString(), - textLength: finalPlainText.length, - markdownLength: finalMarkdown.length, - hasContent: finalPlainText.length > 0, - language, - wordCount, - linkCount, - imageCount - } - }; - - return result; - - } catch (error) { - console.error('Error while getting processed text: ', error); - return createEmptyResult(baseUrl); - } -} - -function extractTitle(): string { - return $('title').text()?.trim() || - $('meta[property="og:title"]').attr('content')?.trim() || - $('h1').first().text()?.trim() || - 'Untitled'; -} - -function extractDescription(): string { - return $('meta[name="description"]').attr('content')?.trim() || - $('meta[property="og:description"]').attr('content')?.trim() || - ''; -} - -function extractLanguage(): string { - return $('html').attr('lang') || 'en'; -} - -function countWords(text: string): number { - return text.split(/\s+/).filter(word => word.length > 0).length; -} - -function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks, preserveLineBreaks } = options; - - // Start with metadata if available - let markdown = ''; - const title = extractTitle(); - if (title && title !== 'Untitled') { - markdown += `# ${title}\n\n`; - } - - const description = extractDescription(); - if (description) { - markdown += `> ${description}\n\n`; - } - - // Clone the body to avoid modifying the original - const $body = $('body').clone(); - - // Remove unwanted elements from the clone - $body.find('script, style, noscript, meta, link').remove(); - $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Process in order of importance - const sections: string[] = []; - - // Process main content areas first - const contentSelectors = [ - 'main', 'article', '[role="main"]', '.content', '.main', - '#content', '#main', '.post', '.article' - ]; - - let mainContent = ''; - for (const selector of contentSelectors) { - const $content = $body.find(selector).first(); - if ($content.length > 0) { - mainContent = processElementToMarkdown($content, baseUrl, options, 0); - if (mainContent.trim().length > 100) { // Only use if substantial content - sections.push(mainContent); - $content.remove(); // Remove from body to avoid duplication - break; - } - } - } - - // Process headers and structure - sections.push(processElementToMarkdown($body, baseUrl, options, 0)); - - // Combine sections - markdown += sections.filter(s => s.trim().length > 0).join('\n\n'); - - // Final cleanup - markdown = cleanMarkdown(markdown, preserveLineBreaks); - - return markdown; -} - -function processElementToMarkdown($element: cheerio.Cheerio, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string { - if (depth > 10) return ''; // Prevent infinite recursion - - const { keepImages, keepWebpageLinks } = options; - let markdown = ''; - - $element.contents().each((index, node) => { - if (node.type === 'text') { - const text = $(node).text().trim(); - if (text) { - markdown += text + ' '; - } - } else if (node.type === 'tag') { - const $node = $(node); - const tagName = node.name?.toLowerCase() || ''; - - switch (tagName) { - case 'h1': - markdown += `\n# ${$node.text().trim()}\n\n`; - break; - case 'h2': - markdown += `\n## ${$node.text().trim()}\n\n`; - break; - case 'h3': - markdown += `\n### ${$node.text().trim()}\n\n`; - break; - case 'h4': - markdown += `\n#### ${$node.text().trim()}\n\n`; - break; - case 'h5': - markdown += `\n##### ${$node.text().trim()}\n\n`; - break; - case 'h6': - markdown += `\n###### ${$node.text().trim()}\n\n`; - break; - case 'p': - const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (paragraphText.trim()) { - markdown += `\n${paragraphText.trim()}\n\n`; - } - break; - case 'br': - markdown += '\n'; - break; - case 'hr': - markdown += '\n---\n\n'; - break; - case 'strong': - case 'b': - const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (strongText.trim()) { - markdown += `**${strongText.trim()}**`; - } - break; - case 'em': - case 'i': - const emText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (emText.trim()) { - markdown += `*${emText.trim()}*`; - } - break; - case 'code': - if (!$node.closest('pre').length) { - const codeText = $node.text().trim(); - if (codeText) { - markdown += `\`${codeText}\``; - } - } - break; - case 'pre': - const preText = $node.text().trim(); - if (preText) { - const codeClass = $node.find('code').attr('class'); - const language = codeClass ? codeClass.replace('language-', '') : ''; - markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`; - } - break; - case 'blockquote': - const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (quoteText.trim()) { - const lines = quoteText.trim().split('\n'); - markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n'; - } - break; - case 'ul': - const listItems: string[] = []; - $node.find('> li').each((_, li) => { - const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); - if (itemText.trim()) { - listItems.push(`- ${itemText.trim()}`); - } - }); - if (listItems.length > 0) { - markdown += '\n' + listItems.join('\n') + '\n\n'; - } - break; - case 'ol': - const olItems: string[] = []; - $node.find('> li').each((i, li) => { - const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); - if (itemText.trim()) { - olItems.push(`${i + 1}. ${itemText.trim()}`); - } - }); - if (olItems.length > 0) { - markdown += '\n' + olItems.join('\n') + '\n\n'; - } - break; - case 'a': - if (keepWebpageLinks) { - const href = $node.attr('href'); - const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim(); - if (href && linkText) { - try { - const absoluteUrl = new URL(href, baseUrl).toString(); - markdown += `[${linkText}](${absoluteUrl})`; - } catch { - markdown += linkText; - } - } else if (linkText) { - markdown += linkText; - } - } else { - markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); - } - break; - case 'img': - if (keepImages) { - const src = $node.attr('src'); - const alt = $node.attr('alt') || $node.attr('title') || ''; - if (src && !shouldRemoveImage(src, options)) { - try { - const absoluteUrl = new URL(src, baseUrl).toString(); - markdown += `![${alt}](${absoluteUrl})`; - } catch { - // Ignore invalid URLs - } - } - } - break; - case 'table': - markdown += processTableToMarkdown($node); - break; - case 'div': - case 'section': - case 'article': - case 'header': - case 'footer': - case 'nav': - case 'aside': - // Process block-level elements with their content - const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (blockContent.trim()) { - markdown += `\n${blockContent.trim()}\n\n`; - } - break; - default: - // For other tags, just process their content - markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); - break; - } - } - }); - - return markdown; -} - -function processTableToMarkdown($table: cheerio.Cheerio): string { - const rows: string[][] = []; - let maxColumns = 0; - - $table.find('tr').each((_, row) => { - const $row = $(row); - const cells: string[] = []; - - $row.find('th, td').each((_, cell) => { - const $cell = $(cell); - const text = $cell.text().trim(); - const colspan = parseInt($cell.attr('colspan') || '1'); - - cells.push(text); - // Add empty cells for colspan - for (let i = 1; i < colspan; i++) { - cells.push(''); - } - }); - - if (cells.length > 0) { - rows.push(cells); - maxColumns = Math.max(maxColumns, cells.length); - } - }); - - if (rows.length === 0) return ''; - - let markdownTable = '\n'; - - // Header row - if (rows.length > 0) { - markdownTable += `| ${rows[0].join(' | ')} |\n`; - markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`; - - // Data rows - for (let i = 1; i < rows.length; i++) { - markdownTable += `| ${rows[i].join(' | ')} |\n`; - } - } - - return markdownTable + '\n'; -} - -function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks } = options; - - const $body = $('body').clone(); - - // Remove unwanted elements - $body.find('script, style, noscript, meta, link').remove(); - $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Process images - if (keepImages) { - $body.find('img').each((_, element) => { - const $img = $(element); - const src = $img.attr('src'); - const alt = $img.attr('alt') || ''; - - if (src && !shouldRemoveImage(src, options)) { - try { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`); - } catch { - $img.remove(); - } - } else { - $img.remove(); - } - }); - } else { - $body.find('img').remove(); - } - - // Process links - if (keepWebpageLinks) { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - const href = $link.attr('href'); - const text = $link.text().trim(); - - if (href && text) { - try { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`${text} (${absoluteUrl})`); - } catch { - $link.replaceWith(text); - } - } - }); - } else { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - $link.replaceWith($link.text().trim()); - }); - } - - let text = $body.text(); - text = cleanText(text); - - return text; -} - -function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean { - const { removeSvgImage, removeGifImage, removeImageTypes = [] } = options; - - const imageTypesToRemove: string[] = []; - if (removeSvgImage) imageTypesToRemove.push('.svg'); - if (removeGifImage) imageTypesToRemove.push('.gif'); - imageTypesToRemove.push(...removeImageTypes); - - return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase())); -} - -function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string { - return markdown - // Normalize line breaks - .replace(/\r\n/g, '\n') - // Remove excessive empty lines (keep max 2) - .replace(/\n{3,}/g, '\n\n') - // Clean up spaces around headers - .replace(/\n\s*(#+)\s*/g, '\n$1 ') - // Remove spaces at start of lines - .replace(/^\s+/gm, '') - // Remove trailing whitespace - .replace(/[ \t]+$/gm, '') - // Fix multiple spaces - .replace(/[ ]{2,}/g, ' ') - // Ensure proper spacing after paragraphs - .replace(/([^\n])\n([^\n])/g, '$1\n\n$2') - .trim(); -} - -function cleanText(text: string): string { - return text - .replace(/\r\n/g, '\n') - .replace(/\s+/g, ' ') - .replace(/\n\s*\n/g, '\n\n') - .replace(/[ ]{2,}/g, ' ') - .trim(); -} - -function createEmptyResult(url: string): ProcessedResult { - return { - markdown: '', - plainText: '', - metadata: { - title: '', - description: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0, - hasContent: false, - wordCount: 0, - linkCount: 0, - imageCount: 0 - } - }; -} \ No newline at end of file diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts deleted file mode 100644 index 025fb52d6..000000000 --- a/server/src/markdownify/get_llm_ready_text.ts +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: MIT - -import { getPageSource, GetPageSourceOptions } from './get_html'; -import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text'; - -export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {} - -export async function urlToLlmText( - url: string, - options: UrlToLlmTextOptions = {} -): Promise { - try { - const pageSource = await getPageSource(url, options); - - if (!pageSource) { - return createEmptyResult(url); - } - - const result = await getProcessedText(pageSource, url, options); - return result; - - } catch (error) { - console.error('Error while scraping url: ', error); - return createEmptyResult(url); - } -} - -function createEmptyResult(url: string): ProcessedResult { - return { - markdown: '', - plainText: '', - metadata: { - title: '', - description: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0, - hasContent: false, - language: 'en', - wordCount: 0, - linkCount: 0, - imageCount: 0 - } - }; -} - -export { getPageSource, getProcessedText }; \ No newline at end of file From ecaa23f4b620542d5d560d9ae244c080921645f8 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:39:45 +0530 Subject: [PATCH 29/79] chore: install scrape plugins --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 330db1e3d..1303410d7 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "idcac-playwright": "^0.1.3", "ioredis": "^5.4.1", "joi": "^17.6.0", + "joplin-turndown-plugin-gfm": "^1.0.12", "jsonwebtoken": "^9.0.2", "jwt-decode": "^4.0.0", "koffi": "^2.14.1", From 767fa5fe4fd80500ee0baedba71a94cda0316d65 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:48:30 +0530 Subject: [PATCH 30/79] chore: del go --- .../markdownify/html-to-markdown/.gitignore | 3 - .../src/markdownify/html-to-markdown/go.mod | 18 -- .../src/markdownify/html-to-markdown/go.sum | 85 ---------- .../html-to-markdown/html-to-markdown.go | 157 ------------------ 4 files changed, 263 deletions(-) delete mode 100644 server/src/markdownify/html-to-markdown/.gitignore delete mode 100644 server/src/markdownify/html-to-markdown/go.mod delete mode 100644 server/src/markdownify/html-to-markdown/go.sum delete mode 100644 server/src/markdownify/html-to-markdown/html-to-markdown.go diff --git a/server/src/markdownify/html-to-markdown/.gitignore b/server/src/markdownify/html-to-markdown/.gitignore deleted file mode 100644 index 909db1866..000000000 --- a/server/src/markdownify/html-to-markdown/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -html-to-markdown.* -!html-to-markdown.go -libhtml-to-markdown.* \ No newline at end of file diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod deleted file mode 100644 index f1d4f8774..000000000 --- a/server/src/markdownify/html-to-markdown/go.mod +++ /dev/null @@ -1,18 +0,0 @@ -module html-to-markdown - -go 1.23.0 - -toolchain go1.24.0 - -require ( - github.com/PuerkitoBio/goquery v1.10.3 - github.com/getmaxun/html-to-markdown v1.0.1 - golang.org/x/net v0.43.0 -) - -require ( - github.com/andybalholm/cascadia v1.3.3 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect -) - -replace github.com/JohannesKaufmann/html-to-markdown => github.com/getmaxun/html-to-markdown v1.0.1 diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum deleted file mode 100644 index d2fc77e55..000000000 --- a/server/src/markdownify/html-to-markdown/go.sum +++ /dev/null @@ -1,85 +0,0 @@ -github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= -github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= -github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= -github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= -github.com/getmaxun/html-to-markdown v1.0.1 h1:ter2Nby2EeYx0ichgZ/Pc6uo3aBudfPPgPicLTh02VI= -github.com/getmaxun/html-to-markdown v1.0.1/go.mod h1:ggHEOofo3wcKaTOuD/z/pf3KJnQns0nQV+Gy/R8iE3U= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= -github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= -github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= -github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U= -github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= -golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= -golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= -golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= -golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= -golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= -golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= -golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go deleted file mode 100644 index 43c658a2f..000000000 --- a/server/src/markdownify/html-to-markdown/html-to-markdown.go +++ /dev/null @@ -1,157 +0,0 @@ -package main - -/* -#include -*/ -import "C" - -import ( - "strings" - "unsafe" - "unicode/utf8" - - "github.com/PuerkitoBio/goquery" - md "github.com/getmaxun/html-to-markdown" - "github.com/getmaxun/html-to-markdown/plugin" - "golang.org/x/net/html" -) - -//export ConvertHTMLToMarkdown -func ConvertHTMLToMarkdown(input *C.char) *C.char { - // ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. - engine := md.NewConverter("", true, nil) - engine.Use(plugin.GitHubFlavored()) - - registerPreHandler(engine) - - result, err := engine.ConvertString(C.GoString(input)) - if err != nil { - // swallow conversion error (same as original) - } - - return C.CString(result) -} - -//export FreeCString -// Frees C string memory. -func FreeCString(str *C.char) { - C.free(unsafe.Pointer(str)) -} - -func main() { - // Required empty main for CGO. -} - -// registerPreHandler configures a specialized PRE/code block rule -// to properly extract nested content and detect languages. -func registerPreHandler(conv *md.Converter) { - isNoiseNode := func(class string) bool { - l := strings.ToLower(class) - return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers") - } - - findLanguage := func(sel *goquery.Selection) string { - cls := strings.ToLower(sel.AttrOr("class", "")) - for _, chunk := range strings.Fields(cls) { - if strings.HasPrefix(chunk, "language-") { - return strings.TrimPrefix(chunk, "language-") - } - if strings.HasPrefix(chunk, "lang-") { - return strings.TrimPrefix(chunk, "lang-") - } - } - return "" - } - - // Walk nodes and extract visible text, injecting newlines at block boundaries. - var scrape func(n *html.Node, out *strings.Builder) - scrape = func(n *html.Node, out *strings.Builder) { - if n == nil { - return - } - - switch n.Type { - case html.TextNode: - out.WriteString(n.Data) - - case html.ElementNode: - tag := strings.ToLower(n.Data) - - // skip gutter/line number elements - for _, attr := range n.Attr { - if attr.Key == "class" && isNoiseNode(attr.Val) { - return - } - } - - if tag == "br" { - out.WriteString("\n") - } - - for child := n.FirstChild; child != nil; child = child.NextSibling { - scrape(child, out) - } - - switch tag { - case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot", - "section", "article", "blockquote", "pre", - "h1", "h2", "h3", "h4", "h5", "h6": - out.WriteString("\n") - } - } - } - - // PRE blocks - conv.AddRules(md.Rule{ - Filter: []string{"pre"}, - Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { - codeTag := s.Find("code").First() - lang := findLanguage(codeTag) - if lang == "" { - lang = findLanguage(s) - } - - var buf strings.Builder - for _, node := range s.Nodes { - scrape(node, &buf) - } - - raw := strings.TrimRight(buf.String(), "\n") - - fRune, _ := utf8.DecodeRuneInString(opt.Fence) - fence := md.CalculateCodeFence(fRune, raw) - - block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n" - return md.String(block) - }, - }) - - // Inline code rule - conv.AddRules(md.Rule{ - Filter: []string{"code"}, - Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { - // do nothing when inside PRE - if s.ParentsFiltered("pre").Length() > 0 { - return nil - } - - var buf strings.Builder - for _, node := range s.Nodes { - scrape(node, &buf) - } - - text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n")) - - fence := "`" - if strings.Contains(text, "`") { - fence = "``" - if strings.Contains(text, "``") { - fence = "```" - } - } - - inline := fence + text + fence - return md.String(inline) - }, - }) -} From b4644ba1065dce096e9f07f8d91eebb75d90e8aa Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:51:27 +0530 Subject: [PATCH 31/79] feat: use turndown --- server/src/markdownify/markdown.ts | 96 +++--------------------------- 1 file changed, 8 insertions(+), 88 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index c4709e6e5..d13992974 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -1,109 +1,27 @@ -import koffi from "koffi"; -import dotenv from "dotenv"; -import { stat } from "fs/promises"; -import path from "node:path"; -import os from "node:os"; +import TurndownService from "turndown"; +import { gfm } from "joplin-turndown-plugin-gfm"; -const exts = { - win32: ".dll", - darwin: ".dylib", - default: ".so", -}; - -const ext = - exts[os.platform() as keyof typeof exts] || exts.default; - -// Build path to the binary **inside the same folder** -export const GO_MARKDOWN_PARSER_PATH = path.join( - __dirname, - "html-to-markdown", - `html-to-markdown${ext}` -); - -dotenv.config(); - -// --------------------------------------------- -// Native Go binding wrapper -// --------------------------------------------- -class NativeMarkdownBridge { - private static singleton: NativeMarkdownBridge; - private fnConvert: any; - - private constructor() { - const lib = koffi.load(GO_MARKDOWN_PARSER_PATH); - - const freeFn = lib.func("FreeCString", "void", ["string"]); - const trackedType = "CString:" + crypto.randomUUID(); - const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn); - - this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [ - "string", - ]); - } - - static async load(): Promise { - if (!NativeMarkdownBridge.singleton) { - try { - await stat(GO_MARKDOWN_PARSER_PATH); - } catch { - throw new Error("Go shared library not found"); - } - NativeMarkdownBridge.singleton = new NativeMarkdownBridge(); - } - return NativeMarkdownBridge.singleton; - } - - async run(html: string): Promise { - return new Promise((resolve, reject) => { - this.fnConvert.async(html, (err: Error, output: string) => { - err ? reject(err) : resolve(output); - }); - }); - } -} - -// --------------------------------------------- -// Main exposed function -// --------------------------------------------- export async function parseMarkdown( html: string | null | undefined, ): Promise { if (!html) return ""; - // Try Go library first (if enabled) - try { - const engine = await NativeMarkdownBridge.load(); - let md = await engine.run(html); - - md = fixBrokenLinks(md); - md = stripSkipLinks(md); - - return md; - } catch (err: any) { - if (err?.message !== "Go shared library not found") { - console.log("Go markdown parser failed, falling back to JS parser:", err); - } else { - console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH }); - } - } - - // Fallback parser - const TurndownService = require("turndown"); - const { gfm } = require("joplin-turndown-plugin-gfm"); - const t = new TurndownService(); + + // Custom rule for inline links t.addRule("inlineLink", { filter: (node: any, opts: any) => opts.linkStyle === "inlined" && node.nodeName === "A" && node.getAttribute("href"), replacement: (content: string, node: any) => { - const href = node.getAttribute("href").trim(); + const href = node.getAttribute("href")?.trim() || ""; const title = node.title ? ` "${node.title}"` : ""; return `[${content.trim()}](${href}${title})\n`; }, }); + // GitHub-flavored markdown features t.use(gfm); try { @@ -134,9 +52,11 @@ function fixBrokenLinks(md: string): string { result += ch; } } + return result; } function stripSkipLinks(md: string): string { return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, ""); } + From b14d84d83ab24f9c35cbca7da1d05165af8e9b53 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:51:53 +0530 Subject: [PATCH 32/79] fix: -rm debug turndown --- server/src/markdownify/debug_turndown.ts | 132 ----------------------- 1 file changed, 132 deletions(-) delete mode 100644 server/src/markdownify/debug_turndown.ts diff --git a/server/src/markdownify/debug_turndown.ts b/server/src/markdownify/debug_turndown.ts deleted file mode 100644 index 1d62b109f..000000000 --- a/server/src/markdownify/debug_turndown.ts +++ /dev/null @@ -1,132 +0,0 @@ -import { getPageSource } from './get_html'; -import { getProcessedText } from './get_llm_input_text'; -import * as cheerio from 'cheerio'; -import TurndownService from 'turndown'; - -async function debugTurndown() { - const testUrls = [ - "https://amazon.com/", - ]; - - for (const url of testUrls) { - console.log(`\n${'='.repeat(70)}`); - console.log(`πŸ” Testing URL: ${url}`); - console.log(`${'='.repeat(70)}`); - - try { - const pageSource = await getPageSource(url, { - wait: 3.0, // Longer wait time - timeout: 15000 // 15 second timeout - }); - - if (!pageSource || pageSource.length < 100) { - console.error("❌ No page source received or content too short"); - continue; - } - - // Save raw HTML for inspection - const fs = await import('fs/promises'); - const domain = new URL(url).hostname; - await fs.writeFile(`debug_${domain}_raw.html`, pageSource); - console.log(`πŸ’Ύ Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`); - - // Parse with cheerio - const $ = cheerio.load(pageSource); - - // Check what's in the body - const bodyText = $('body').text(); - console.log(`πŸ“„ Body text length: ${bodyText.length} chars`); - console.log(`πŸ“„ Body preview: ${bodyText.substring(0, 200)}...`); - - // Test content extraction - const contentSelectors = [ - 'main', 'article', '[role="main"]', '.content', '.main-content', - '#content', '#main', '.post', '.article' - ]; - - let mainContent: cheerio.Cheerio = $('body'); - let foundSelector = 'body (fallback)'; - - for (const selector of contentSelectors) { - const $content = $(selector).first(); - if ($content.length > 0 && $content.text().trim().length > 10) { - console.log(`βœ… Found content with selector: ${selector}`); - console.log(`πŸ“ Content text length: ${$content.text().length}`); - mainContent = $content; - foundSelector = selector; - break; - } - } - - console.log(`🎯 Using content from: ${foundSelector}`); - - // Test Turndown directly - console.log("\nπŸ§ͺ Testing Turndown directly..."); - const turndownService = new TurndownService(); - - if (mainContent.length > 0) { - const contentHtml = mainContent.html() || ''; - if (contentHtml && contentHtml.length > 10) { - console.log(`πŸ“¦ Content HTML length: ${contentHtml.length} chars`); - - try { - const contentMarkdown = turndownService.turndown(contentHtml); - console.log(`πŸ“ Turndown result length: ${contentMarkdown.length} chars`); - - if (contentMarkdown.length > 0) { - console.log(`πŸ“ Markdown preview: ${contentMarkdown.substring(0, 300)}...`); - await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown); - console.log(`πŸ’Ύ Turndown output saved to debug_${domain}_turndown.md`); - } else { - console.log("❌ Turndown produced empty markdown"); - } - } catch (turndownError) { - console.error("❌ Turndown conversion failed:", turndownError); - } - } else { - console.log("❌ No HTML content found for Turndown"); - } - } - - // Test our full function - console.log("\nπŸ§ͺ Testing full getProcessedText function..."); - const result = await getProcessedText(pageSource, url, { - keepImages: true, - keepWebpageLinks: true, - removeScriptTag: true, - removeStyleTag: true, - formatAsMarkdown: true - }); - - console.log("πŸ“Š Result metadata:"); - console.log(`- Markdown length: ${result.metadata.markdownLength} chars`); - console.log(`- Plain text length: ${result.metadata.textLength} chars`); - console.log(`- Has content: ${result.metadata.hasContent}`); - console.log(`- Content score: ${result.metadata.contentScore}/10`); - - if (result.markdown && result.markdown.length > 0) { - console.log(`πŸ“„ Markdown preview (300 chars):`); - console.log(result.markdown.substring(0, 300) + '...'); - await fs.writeFile(`debug_${domain}_full.md`, result.markdown); - console.log(`πŸ’Ύ Full output saved to debug_${domain}_full.md`); - } else { - console.log("❌ Empty markdown from full function"); - - // Debug why it's empty - if (result.plainText && result.plainText.length > 0) { - console.log("ℹ️ But plain text has content, so markdown conversion failed"); - await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText); - console.log(`πŸ’Ύ Plain text saved to debug_${domain}_plain.txt`); - } - } - - } catch (error) { - console.error(`πŸ’₯ Error processing ${url}:`, error); - } - - // Small delay between requests - await new Promise(resolve => setTimeout(resolve, 1000)); - } -} - -debugTurndown().catch(console.error); \ No newline at end of file From 839f9fa5ce1a9e3e4f9f0a9e79a816089d8e0fb5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 04:10:12 +0530 Subject: [PATCH 33/79] fix: plugin imports --- server/src/markdownify/markdown.ts | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index d13992974..0aeca9e71 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -1,27 +1,22 @@ -import TurndownService from "turndown"; -import { gfm } from "joplin-turndown-plugin-gfm"; - export async function parseMarkdown( html: string | null | undefined, ): Promise { - if (!html) return ""; + const TurndownService = require("turndown"); + const { gfm } = require("joplin-turndown-plugin-gfm"); const t = new TurndownService(); - - // Custom rule for inline links t.addRule("inlineLink", { filter: (node: any, opts: any) => opts.linkStyle === "inlined" && node.nodeName === "A" && node.getAttribute("href"), replacement: (content: string, node: any) => { - const href = node.getAttribute("href")?.trim() || ""; + const href = node.getAttribute("href").trim(); const title = node.title ? ` "${node.title}"` : ""; return `[${content.trim()}](${href}${title})\n`; }, }); - // GitHub-flavored markdown features t.use(gfm); try { @@ -52,7 +47,6 @@ function fixBrokenLinks(md: string): string { result += ch; } } - return result; } @@ -60,3 +54,4 @@ function stripSkipLinks(md: string): string { return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, ""); } + From 0a7a1eb9b839d764eb3757486e7b7e827cceabde Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 04:21:41 +0530 Subject: [PATCH 34/79] fix: make baseUrl optional param --- server/src/markdownify/markdown.ts | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 0aeca9e71..9ee7c7205 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -1,8 +1,10 @@ export async function parseMarkdown( html: string | null | undefined, + baseUrl?: string | null ): Promise { const TurndownService = require("turndown"); const { gfm } = require("joplin-turndown-plugin-gfm"); + const { URL } = require('url'); const t = new TurndownService(); t.addRule("inlineLink", { @@ -11,7 +13,18 @@ export async function parseMarkdown( node.nodeName === "A" && node.getAttribute("href"), replacement: (content: string, node: any) => { - const href = node.getAttribute("href").trim(); + let href = node.getAttribute("href").trim(); + + // Convert relative URLs to absolute if baseUrl is provided + if (baseUrl && isRelativeUrl(href)) { + try { + const url = new URL(href, baseUrl); + href = url.toString(); + } catch (err) { + // If URL construction fails, keep the original href + } + } + const title = node.title ? ` "${node.title}"` : ""; return `[${content.trim()}](${href}${title})\n`; }, @@ -30,6 +43,10 @@ export async function parseMarkdown( } } +function isRelativeUrl(url: string): boolean { + return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:'); +} + // --------------------------------------------- // Helpers // --------------------------------------------- From 9257b1564e24d3d81a8877e3da0fe3cdcb18e815 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 04:22:06 +0530 Subject: [PATCH 35/79] feat: pass url param --- server/src/markdownify/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 39d1fa3a3..b58265a24 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -52,6 +52,6 @@ export async function convertPageToMarkdown(url: string): Promise { await browser.close(); // Convert cleaned HTML β†’ Markdown - const markdown = await parseMarkdown(cleanedHtml || ""); + const markdown = await parseMarkdown(cleanedHtml, url); return markdown; } From d1f13cf10ef8d12e206ccb1e9cff85f7fbe96d77 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:17:09 +0530 Subject: [PATCH 36/79] feat: add robot markdown creation section ui --- src/components/robot/pages/RobotCreate.tsx | 144 ++++++++++++++++++- src/components/robot/pages/RobotEditPage.tsx | 7 +- 2 files changed, 140 insertions(+), 11 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 70058642c..0e76fac44 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -13,21 +13,47 @@ import { Card, CircularProgress, Container, - CardContent + CardContent, + Tabs, + Tab } from '@mui/material'; -import { ArrowBack, PlayCircleOutline, Article } from '@mui/icons-material'; +import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; +interface TabPanelProps { + children?: React.ReactNode; + index: number; + value: number; +} + +function TabPanel(props: TabPanelProps) { + const { children, value, index, ...other } = props; + + return ( + + ); +} + const RobotCreate: React.FC = () => { const { t } = useTranslation(); const navigate = useNavigate(); - const { setBrowserId, setRecordingUrl, notify, setRecordingId } = useGlobalInfoStore(); + const { setBrowserId, setRecordingUrl, notify, setRecordingId, setRerenderRobots } = useGlobalInfoStore(); + const [tabValue, setTabValue] = useState(0); const [url, setUrl] = useState(''); + const [markdownRobotName, setMarkdownRobotName] = useState(''); const [needsLogin, setNeedsLogin] = useState(false); const [isLoading, setIsLoading] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false); @@ -36,6 +62,10 @@ const RobotCreate: React.FC = () => { const { state } = React.useContext(AuthContext); const { user } = state; + const handleTabChange = (event: React.SyntheticEvent, newValue: number) => { + setTabValue(newValue); + }; + const handleStartRecording = async () => { if (!url.trim()) { @@ -146,11 +176,31 @@ const RobotCreate: React.FC = () => { - New Data Extraction Robot + Create New Robot - + + + } + iconPosition="start" + label="Data Extraction Robot" + id="robot-tab-0" + aria-controls="robot-tabpanel-0" + /> + } + iconPosition="start" + label="Markdown Robot" + id="robot-tab-1" + aria-controls="robot-tabpanel-1" + /> + + + + + {/* Logo (kept as original) */} { + + + + + + Maxun Logo + + + Create Markdown Robot + + + Convert any webpage to clean markdown format + + + + setMarkdownRobotName(e.target.value)} + label="Robot Name" + sx={{ mb: 2 }} + /> + setUrl(e.target.value)} + label="URL to convert" + /> + + + + + + diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 80671c1fb..d5e7cb2d5 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -795,11 +795,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { navigate(basePath); }; - const lastPair = - robot?.recording.workflow[robot?.recording.workflow.length - 1]; - const targetUrl = lastPair?.what.find((action) => action.action === "goto") - ?.args?.[0]; - return ( { handleTargetUrlChange(e.target.value)} style={{ marginBottom: "20px" }} /> From c1373d8ca138f74a9506c24b19fc08ea7636865b Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:18:01 +0530 Subject: [PATCH 37/79] feat: display separate field md content --- src/components/run/RunContent.tsx | 87 +++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/src/components/run/RunContent.tsx b/src/components/run/RunContent.tsx index 2cc1bb861..3a676a003 100644 --- a/src/components/run/RunContent.tsx +++ b/src/components/run/RunContent.tsx @@ -37,6 +37,7 @@ interface RunContentProps { export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => { const { t } = useTranslation(); const [tab, setTab] = React.useState('output'); + const [markdownContent, setMarkdownContent] = useState(''); const [schemaData, setSchemaData] = useState([]); const [schemaColumns, setSchemaColumns] = useState([]); @@ -63,6 +64,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setTab(tab); }, [interpretationInProgress]); + useEffect(() => { + if (row.serializableOutput?.markdown && Array.isArray(row.serializableOutput.markdown)) { + const markdownData = row.serializableOutput.markdown[0]; + if (markdownData && markdownData.content) { + setMarkdownContent(markdownData.content); + } + } + }, [row.serializableOutput]); + useEffect(() => { if (row.status === 'running' || row.status === 'queued' || row.status === 'scheduled') { setSchemaData([]); @@ -374,6 +384,22 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe }, 100); }; + const downloadMarkdown = (content: string, filename: string) => { + const blob = new Blob([content], { type: 'text/markdown;charset=utf-8;' }); + const url = URL.createObjectURL(blob); + + const link = document.createElement("a"); + link.href = url; + link.setAttribute("download", filename); + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + + setTimeout(() => { + URL.revokeObjectURL(url); + }, 100); + }; + const renderDataTable = ( data: any[], @@ -636,11 +662,70 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0; const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0; + const hasMarkdown = markdownContent.length > 0; return ( + {hasMarkdown ? ( + + + }> + + + Markdown Output + + + + + theme.palette.mode === 'dark' ? '#1e1e1e' : '#f5f5f5' + }} + > + + {markdownContent} + + + + + + + + + + + ) : ( + // Traditional robot output + <> {row.status === 'running' || row.status === 'queued' ? ( <> @@ -939,6 +1024,8 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} + + )} From 0d45d1d7f1f4bbc4ee0e2eedd832797a050a86cf Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:19:12 +0530 Subject: [PATCH 38/79] feat: markdownify manual, scheduled, api runs --- server/src/api/record.ts | 110 ++++++++++++++++- server/src/pgboss-worker.ts | 95 +++++++++++++- .../workflow-management/scheduler/index.ts | 116 +++++++++++++++++- 3 files changed, 316 insertions(+), 5 deletions(-) diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 29d1f2615..f55e2b3fb 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -344,7 +344,8 @@ function formatRunResponse(run: any) { runByAPI: run.runByAPI, data: { textData: {}, - listData: {} + listData: {}, + markdown: '' }, screenshots: [] as any[], }; @@ -359,6 +360,10 @@ function formatRunResponse(run: any) { formattedRun.data.listData = output.scrapeList; } + if (output.markdown && Array.isArray(output.markdown)) { + formattedRun.data.markdown = output.markdown[0]?.content || ''; + } + if (run.binaryOutput) { Object.keys(run.binaryOutput).forEach(key => { if (run.binaryOutput[key]) { @@ -651,6 +656,106 @@ async function executeRun(id: string, userId: string) { }; } + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for API run ${id}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const { convertPageToMarkdown } = await import('../markdownify/scrape'); + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for API run ${id}`); + + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + } + + const webhookPayload = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + markdown: markdown, + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + } + }; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return { + success: true, + interpretationInfo: run.toJSON() + }; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; browser = browserPool.getRemoteBrowser(plainRun.browserId); @@ -889,12 +994,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { if (!run) throw new Error('Run not found'); if (run.status === 'success') { - return run.toJSON(); + return run; } else if (run.status === 'failed') { throw new Error('Run failed'); } - // Wait for the next polling interval await new Promise(resolve => setTimeout(resolve, interval)); } } diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index b9f411008..0fcd7f650 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -187,7 +187,100 @@ async function processRunExecution(job: Job) { if (!recording) { throw new Error(`Recording for run ${data.runId} not found`); } - + + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for run ${data.runId}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const { convertPageToMarkdown } = await import('./markdownify/scrape'); + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for run ${data.runId}`); + + try { + const completionData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + try { + const webhookPayload = { + runId: data.runId, + robotId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString(), + markdown: markdown + }; + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(browserId, data.userId); + + return { success: true }; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + await destroyRemoteBrowser(browserId, data.userId); + + throw error; + } + } + const isRunAborted = async (): Promise => { try { const currentRun = await Run.findOne({ where: { runId: data.runId } }); diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 899cb7f61..ba47b3e0e 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -207,6 +207,120 @@ async function executeRun(id: string, userId: string) { } } + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for scheduled run ${id}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const runStartedData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'running', + startedAt: plainRun.startedAt + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); + logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`); + } + + try { + const { convertPageToMarkdown } = await import('../../markdownify/scrape'); + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for scheduled run ${id}`); + + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + } + + const webhookPayload = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + markdown: markdown, + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + } + }; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return true; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; try { @@ -217,7 +331,7 @@ async function executeRun(id: string, userId: string) { status: 'running', startedAt: plainRun.startedAt }; - + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`); } catch (socketError: any) { From b19e02f13775b4ceb71f6850f334d1ec95489915 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:22:54 +0530 Subject: [PATCH 39/79] feat: add markdown route --- server/src/routes/storage.ts | 81 +++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 89872d6ae..ee23ee442 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } if (targetUrl) { + robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl }); + const updatedWorkflow = [...robot.recording.workflow]; + let foundGoto = false; for (let i = updatedWorkflow.length - 1; i >= 0; i--) { const step = updatedWorkflow[i]; @@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r robot.set('recording', { ...robot.recording, workflow: updatedWorkflow }); robot.changed('recording', true); + foundGoto = true; i = -1; break; } @@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } }; - if (name) { + if (name || targetUrl) { updates.recording_meta = { ...robot.recording_meta, - name + ...(name && { name }), + ...(targetUrl && { url: targetUrl }) }; } @@ -432,6 +437,78 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate } }); +/** + * POST endpoint for creating a markdown robot + */ +router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => { + try { + const { url, name } = req.body; + + if (!url) { + return res.status(400).json({ error: 'The "url" field is required.' }); + } + + if (!req.user) { + return res.status(401).send({ error: 'Unauthorized' }); + } + + // Validate URL format + try { + new URL(url); + } catch (err) { + return res.status(400).json({ error: 'Invalid URL format' }); + } + + const robotName = name || `Markdown Robot - ${new URL(url).hostname}`; + const currentTimestamp = new Date().toLocaleString(); + const robotId = uuid(); + + const newRobot = await Robot.create({ + id: uuid(), + userId: req.user.id, + recording_meta: { + name: robotName, + id: robotId, + createdAt: currentTimestamp, + updatedAt: currentTimestamp, + pairs: 0, + params: [], + type: 'markdown', + url: url, + }, + recording: { workflow: [] }, + google_sheet_email: null, + google_sheet_name: null, + google_sheet_id: null, + google_access_token: null, + google_refresh_token: null, + schedule: null, + }); + + logger.log('info', `Markdown robot created with id: ${newRobot.id}`); + capture( + 'maxun-oss-markdown-robot-created', + { + robot_meta: newRobot.recording_meta, + url: url, + } + ); + + return res.status(201).json({ + message: 'Markdown robot created successfully.', + robot: newRobot, + }); + } catch (error) { + if (error instanceof Error) { + logger.log('error', `Error creating markdown robot: ${error.message}`); + return res.status(500).json({ error: error.message }); + } else { + logger.log('error', 'Unknown error creating markdown robot'); + return res.status(500).json({ error: 'An unknown error occurred.' }); + } + } +}); + /** * DELETE endpoint for deleting a recording from the storage. */ From 05d2d1b7fef11db4bb7691976dc63cba1adb54ad Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:25:43 +0530 Subject: [PATCH 40/79] feat: add optional type and url fields --- server/src/models/Robot.ts | 2 ++ src/components/robot/pages/RobotDuplicatePage.tsx | 7 +------ src/components/robot/pages/RobotEditPage.tsx | 7 +------ src/components/robot/pages/RobotSettingsPage.tsx | 1 + src/context/globalInfo.tsx | 2 ++ 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts index eae9438ec..5acbdf133 100644 --- a/server/src/models/Robot.ts +++ b/server/src/models/Robot.ts @@ -9,6 +9,8 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; + url?: string; } interface RobotWorkflow { diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index b02cecdeb..7c45c8e83 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,12 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'traditional' | 'markdown'; url?: string; } diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index d5e7cb2d5..19b9e43b2 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -24,12 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'traditional' | 'markdown'; url?: string; } diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index 118329358..96b7d3ecf 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -16,6 +16,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; url?: string; } diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index 69969a09c..a0c79622a 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,6 +27,8 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; + url?: string; } interface RobotWorkflow { From d444756f673173f8e87e796963e320a6c82f41a8 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 13:33:10 +0530 Subject: [PATCH 41/79] chore: add static markdown import --- server/src/api/record.ts | 2 +- server/src/pgboss-worker.ts | 4 ++-- server/src/workflow-management/scheduler/index.ts | 2 +- src/components/robot/pages/RobotCreate.tsx | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/api/record.ts b/server/src/api/record.ts index f55e2b3fb..fd7376abc 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; +import { convertPageToMarkdown } from '../markdownify/scrape'; chromium.use(stealthPlugin()); @@ -665,7 +666,6 @@ async function executeRun(id: string, userId: string) { }); try { - const { convertPageToMarkdown } = await import('../markdownify/scrape'); const url = recording.recording_meta.url; if (!url) { diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index 0fcd7f650..b2d5bdb30 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme import { io as serverIo } from "./server"; import { sendWebhook } from './routes/webhook'; import { BinaryOutputService } from './storage/mino'; +import { convertPageToMarkdown } from './markdownify/scrape'; if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) { throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); @@ -183,7 +184,7 @@ async function processRunExecution(job: Job) { try { // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); - + if (!recording) { throw new Error(`Recording for run ${data.runId} not found`); } @@ -197,7 +198,6 @@ async function processRunExecution(job: Job) { }); try { - const { convertPageToMarkdown } = await import('./markdownify/scrape'); const url = recording.recording_meta.url; if (!url) { diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index ba47b3e0e..7c2cb4085 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core"; import { Page } from "playwright"; import { sendWebhook } from "../../routes/webhook"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; +import { convertPageToMarkdown } from "../../markdownify/scrape"; chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { @@ -231,7 +232,6 @@ async function executeRun(id: string, userId: string) { } try { - const { convertPageToMarkdown } = await import('../../markdownify/scrape'); const url = recording.recording_meta.url; if (!url) { diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 0e76fac44..4bec52d88 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -20,6 +20,7 @@ import { import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; +import { createMarkdownRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; @@ -401,7 +402,6 @@ const RobotCreate: React.FC = () => { return; } setIsLoading(true); - const { createMarkdownRobot } = await import('../../../api/storage'); const result = await createMarkdownRobot(url, markdownRobotName); setIsLoading(false); From ddcb3dfe4b9c99144b9b14cd0b197f52c3d81d6b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 15:35:31 +0530 Subject: [PATCH 42/79] feat: extend turndown + clean --- server/src/markdownify/markdown.ts | 164 +++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 23 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 9ee7c7205..e660679d9 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -4,36 +4,76 @@ export async function parseMarkdown( ): Promise { const TurndownService = require("turndown"); const { gfm } = require("joplin-turndown-plugin-gfm"); - const { URL } = require('url'); + const cheerio = require("cheerio"); + const { URL } = require("url"); + + if (!html) return ""; + + const tidiedHtml = tidyHtml(html); const t = new TurndownService(); + + // Remove irrelevant tags + const elementsToRemove = [ + "meta", + "style", + "script", + "noscript", + "link", + "textarea", + ]; + + t.addRule("remove-irrelevant", { + filter: elementsToRemove, + replacement: () => "", + }); + + t.addRule("truncate-svg", { + filter: "svg", + replacement: () => "", + }); + + t.addRule("improved-paragraph", { + filter: "p", + replacement: (innerText: string) => { + const trimmed = innerText.trim(); + if (!trimmed) return ""; + return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`; + }, + }); + t.addRule("inlineLink", { filter: (node: any, opts: any) => opts.linkStyle === "inlined" && node.nodeName === "A" && node.getAttribute("href"), + replacement: (content: string, node: any) => { let href = node.getAttribute("href").trim(); - - // Convert relative URLs to absolute if baseUrl is provided + + // Relative β†’ absolute if (baseUrl && isRelativeUrl(href)) { try { - const url = new URL(href, baseUrl); - href = url.toString(); - } catch (err) { - // If URL construction fails, keep the original href - } + const u = new URL(href, baseUrl); + href = u.toString(); + } catch {} } - - const title = node.title ? ` "${node.title}"` : ""; + + // Clean URL + href = cleanUrl(href); + + const title = node.title ? ` "${cleanAttribute(node.title)}"` : ""; return `[${content.trim()}](${href}${title})\n`; }, }); t.use(gfm); + // --------------------------------------------------- + // Convert + // --------------------------------------------------- try { - let out = await t.turndown(html); + let out = await t.turndown(tidiedHtml); out = fixBrokenLinks(out); out = stripSkipLinks(out); return out; @@ -43,13 +83,98 @@ export async function parseMarkdown( } } -function isRelativeUrl(url: string): boolean { - return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:'); -} - // --------------------------------------------- // Helpers // --------------------------------------------- +function isRelativeUrl(url: string): boolean { + return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:"); +} + +function cleanUrl(u: string): string { + try { + return u; + } catch { + return u; + } +} + +// CODE 1: attribute cleaner +function cleanAttribute(attr: string) { + return attr ? attr.replace(/(\n+\s*)+/g, "\n") : ""; +} + +// --------------------------------------------------------- +// CODE 1: Full tidyHtml cleaning logic (ported verbatim) +// --------------------------------------------------------- +function tidyHtml(html: string): string { + const cheerio = require("cheerio"); + const $ = cheerio.load(html); + + // Fix broken attributes + $("*").each(function (this: any) { + const element = $(this); + const attributes = Object.keys(this.attribs); + + for (let i = 0; i < attributes.length; i++) { + let attr = attributes[i]; + if (attr.includes('"')) { + element.remove(); + } + } + }); + + const manuallyCleanedElements = [ + "aside", + "embed", + "head", + "iframe", + "menu", + "object", + "script", + "applet", + "audio", + "canvas", + "map", + "svg", + "video", + "area", + "blink", + "datalist", + "dialog", + "frame", + "frameset", + "link", + "input", + "ins", + "legend", + "marquee", + "math", + "menuitem", + "nav", + "noscript", + "optgroup", + "output", + "param", + "progress", + "rp", + "rt", + "rtc", + "source", + "style", + "track", + "textarea", + "time", + "use", + "img", + "picture", + "figure", + ]; + + manuallyCleanedElements.forEach((tag) => $(tag).remove()); + return $("body").html(); +} + + function fixBrokenLinks(md: string): string { let depth = 0; let result = ""; @@ -57,12 +182,7 @@ function fixBrokenLinks(md: string): string { for (const ch of md) { if (ch === "[") depth++; if (ch === "]") depth = Math.max(0, depth - 1); - - if (depth > 0 && ch === "\n") { - result += "\\\n"; - } else { - result += ch; - } + result += depth > 0 && ch === "\n" ? "\\\n" : ch; } return result; } @@ -70,5 +190,3 @@ function fixBrokenLinks(md: string): string { function stripSkipLinks(md: string): string { return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, ""); } - - From 8346c9637a694b97067ee6a7bf28d26869905956 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 15:37:26 +0530 Subject: [PATCH 43/79] chore: cleanup --- server/src/markdownify/markdown.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index e660679d9..92551b51b 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -98,14 +98,10 @@ function cleanUrl(u: string): string { } } -// CODE 1: attribute cleaner function cleanAttribute(attr: string) { return attr ? attr.replace(/(\n+\s*)+/g, "\n") : ""; } -// --------------------------------------------------------- -// CODE 1: Full tidyHtml cleaning logic (ported verbatim) -// --------------------------------------------------------- function tidyHtml(html: string): string { const cheerio = require("cheerio"); const $ = cheerio.load(html); From 924d687e20fb20cd321f242bfa6c8a0666fc52b0 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 15:54:39 +0530 Subject: [PATCH 44/79] feat: add create markdown api --- src/api/storage.ts | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/api/storage.ts b/src/api/storage.ts index b5dc32ded..ca4d975cf 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -28,6 +28,27 @@ export const getStoredRecordings = async (): Promise => { } }; +export const createMarkdownRobot = async (url: string, name?: string): Promise => { + try { + const response = await axios.post(`${apiUrl}/storage/recordings/markdown`, { + url, + name, + }, { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true + }); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create markdown robot'); + } + } catch (error: any) { + console.error('Error creating markdown robot:', error); + return null; + } +}; + export const updateRecording = async (id: string, data: { name?: string; limits?: Array<{pairIndex: number, actionIndex: number, argIndex: number, limit: number}>; From e711326c0e17a809c6a195420cd14e1fd1508815 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:00:02 +0530 Subject: [PATCH 45/79] feat: extract --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 4bec52d88..de1ae6c06 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -186,7 +186,7 @@ const RobotCreate: React.FC = () => { } iconPosition="start" - label="Data Extraction Robot" + label="Extract" id="robot-tab-0" aria-controls="robot-tabpanel-0" /> From 51a0c3a769fcb0a8300087d2b77c2287e5c68203 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:03:36 +0530 Subject: [PATCH 46/79] chore: remove icon --- src/components/robot/pages/RobotCreate.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index de1ae6c06..40ef4cc17 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -184,8 +184,6 @@ const RobotCreate: React.FC = () => { } - iconPosition="start" label="Extract" id="robot-tab-0" aria-controls="robot-tabpanel-0" From 672a1822cb110e777ecadc55ba068fd78adb7150 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:03:55 +0530 Subject: [PATCH 47/79] feat: extract --- src/components/robot/pages/RobotCreate.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 40ef4cc17..359fc7006 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -185,8 +185,8 @@ const RobotCreate: React.FC = () => { } From d0b8d0c6d77db1a61ac108e5e0a9a132a6fa37d1 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:05:02 +0530 Subject: [PATCH 48/79] chore: remove icon --- src/components/robot/pages/RobotCreate.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 359fc7006..9fb927126 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -189,8 +189,6 @@ const RobotCreate: React.FC = () => { aria-controls="extract-robot" /> } - iconPosition="start" label="Markdown Robot" id="robot-tab-1" aria-controls="robot-tabpanel-1" From 53bf9eb09234787538e97bb2ad345e85a4c72b6e Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:07:30 +0530 Subject: [PATCH 49/79] feat: scrape --- src/components/robot/pages/RobotCreate.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 9fb927126..bc38ca0b5 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -190,8 +190,8 @@ const RobotCreate: React.FC = () => { /> From 8428314a2cf6342a9630833d09bccfb9f3ff012d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:08:05 +0530 Subject: [PATCH 50/79] feat: scrape --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index bc38ca0b5..3f3c88f9b 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -189,7 +189,7 @@ const RobotCreate: React.FC = () => { aria-controls="extract-robot" /> From 6de6c3b04294dfdfad67311a00197390f046e507 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:08:51 +0530 Subject: [PATCH 51/79] feat: remove header --- src/components/robot/pages/RobotCreate.tsx | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 3f3c88f9b..c75760021 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -358,9 +358,6 @@ const RobotCreate: React.FC = () => { alt="Maxun Logo" /> - - Create Markdown Robot - Convert any webpage to clean markdown format From ef4311606673a2590cc1b302630c0ce9860381c5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:10:05 +0530 Subject: [PATCH 52/79] feat: markdown --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index c75760021..98feb5fe6 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -359,7 +359,7 @@ const RobotCreate: React.FC = () => { /> - Convert any webpage to clean markdown format + Turn websites into LLM-ready Markdown content for AI apps. From f745089d9ac84228d70ad035c02c811e2eea4f38 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:17:27 +0530 Subject: [PATCH 53/79] feat: markdown --- src/components/robot/pages/RobotCreate.tsx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 98feb5fe6..b014586cc 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -211,6 +211,10 @@ const RobotCreate: React.FC = () => { alt="Maxun Logo" /> + + Extract structured data from websites in a few clicks. + + {/* Origin URL Input */} { setMarkdownRobotName(e.target.value)} - label="Robot Name" sx={{ mb: 2 }} /> setUrl(e.target.value)} - label="URL to convert" + label="Website URL" /> From 81d69a44c1a861643f4779e28a71635730b3645f Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:17:45 +0530 Subject: [PATCH 54/79] chore: lint --- src/components/robot/pages/RobotCreate.tsx | 260 ++++++++++----------- 1 file changed, 129 insertions(+), 131 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index b014586cc..5cd4e74a6 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -198,154 +198,152 @@ const RobotCreate: React.FC = () => { - - {/* Logo (kept as original) */} - Maxun Logo + + {/* Logo (kept as original) */} + Maxun Logo Extract structured data from websites in a few clicks. - {/* Origin URL Input */} - - setUrl(e.target.value)} - /> - - - {/* Checkbox */} - - setNeedsLogin(e.target.checked)} - color="primary" - /> - } - label="This website needs logging in." - /> - - - {/* Button */} - - - - - - - - - First time creating a robot? - - - Get help and learn how to use Maxun effectively. - + {/* Origin URL Input */} + + setUrl(e.target.value)} + /> + - + {/* Checkbox */} + + setNeedsLogin(e.target.checked)} + color="primary" + /> + } + label="This website needs logging in." + /> + - {/* YouTube Tutorials */} - - window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")} + startIcon={isLoading ? : null} > - + + + + + + First time creating a robot? + + + Get help and learn how to use Maxun effectively. + + + + + {/* YouTube Tutorials */} + + - theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '', + height: 140, + cursor: "pointer", }} + onClick={() => window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")} > - - - - - Video Tutorials - - - Watch step-by-step guides - - - - - - - {/* Documentation */} - - window.open("https://docs.maxun.dev", "_blank")} - > - + theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '', + }} + > + + + + + Video Tutorials + + + Watch step-by-step guides + + + + + + + {/* Documentation */} + + - theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '', + height: 140, + cursor: "pointer", }} + onClick={() => window.open("https://docs.maxun.dev", "_blank")} > -
- - - - Documentation - - - Explore detailed guides - - - - + + theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '', + }} + > +
+ + + + Documentation + + + Explore detailed guides + + + + + - - + From eb86b6eb175269553c4fd0166675a7793c51e968 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:18:17 +0530 Subject: [PATCH 55/79] feat: markdown --- src/components/robot/pages/RobotCreate.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 5cd4e74a6..62b7c9e61 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -372,6 +372,7 @@ const RobotCreate: React.FC = () => { value={markdownRobotName} onChange={(e) => setMarkdownRobotName(e.target.value)} sx={{ mb: 2 }} + label="Robot Name" /> Date: Thu, 20 Nov 2025 16:21:09 +0530 Subject: [PATCH 56/79] feat: markdown --- src/components/robot/pages/RobotCreate.tsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 62b7c9e61..50f1d9ed1 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -189,9 +189,9 @@ const RobotCreate: React.FC = () => { aria-controls="extract-robot" /> From dbb6c8728978f45ae6adf553b8d616de3f50fc80 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:24:19 +0530 Subject: [PATCH 57/79] feat: change mui default tabs --- src/components/robot/pages/RobotCreate.tsx | 37 ++++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 50f1d9ed1..0cc1491c7 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -181,20 +181,29 @@ const RobotCreate: React.FC = () => { - - - - - - + + + + + + + From 606790e483a1a06f903ba6bb9147d345058ad00d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:24:29 +0530 Subject: [PATCH 58/79] chore: lint --- src/components/robot/pages/RobotCreate.tsx | 42 +++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 0cc1491c7..014312e64 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -182,27 +182,27 @@ const RobotCreate: React.FC = () => { - - - - - + + + + + From 3dac1a09fbb57b6d563ba1bda18fa2a88ba1517b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:26:24 +0530 Subject: [PATCH 59/79] feat: change mui default tabs --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 014312e64..8863838ff 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -191,7 +191,7 @@ const RobotCreate: React.FC = () => { '& .MuiTab-root': { minHeight: 36, paddingX: 2, - paddingY: 0.5, + paddingY: 1.5, minWidth: 0, }, '& .MuiTabs-indicator': { From 96019058e96196e80ce7516cc9f81ef5a152ff41 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:36:43 +0530 Subject: [PATCH 60/79] feat: turn to markdown --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 8863838ff..88daa49b4 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -428,7 +428,7 @@ const RobotCreate: React.FC = () => { }} startIcon={isLoading ? : null} > - {isLoading ? 'Creating...' : 'Create Markdown Robot'} + {isLoading ? 'Turning...' : 'Turn to Markdown'} From 930c7b6c7490bbdaafab60f07cb56fa73417d2ba Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 16:56:43 +0530 Subject: [PATCH 61/79] fix: lesser restrictions --- server/src/markdownify/markdown.ts | 83 ++++-------------------------- 1 file changed, 11 insertions(+), 72 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 92551b51b..439a3a622 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -13,21 +13,6 @@ export async function parseMarkdown( const t = new TurndownService(); - // Remove irrelevant tags - const elementsToRemove = [ - "meta", - "style", - "script", - "noscript", - "link", - "textarea", - ]; - - t.addRule("remove-irrelevant", { - filter: elementsToRemove, - replacement: () => "", - }); - t.addRule("truncate-svg", { filter: "svg", replacement: () => "", @@ -106,64 +91,18 @@ function tidyHtml(html: string): string { const cheerio = require("cheerio"); const $ = cheerio.load(html); - // Fix broken attributes - $("*").each(function (this: any) { - const element = $(this); - const attributes = Object.keys(this.attribs); - - for (let i = 0; i < attributes.length; i++) { - let attr = attributes[i]; - if (attr.includes('"')) { - element.remove(); - } - } - }); - const manuallyCleanedElements = [ - "aside", - "embed", - "head", - "iframe", - "menu", - "object", - "script", - "applet", - "audio", - "canvas", - "map", - "svg", - "video", - "area", - "blink", - "datalist", - "dialog", - "frame", - "frameset", - "link", - "input", - "ins", - "legend", - "marquee", - "math", - "menuitem", - "nav", - "noscript", - "optgroup", - "output", - "param", - "progress", - "rp", - "rt", - "rtc", - "source", - "style", - "track", - "textarea", - "time", - "use", - "img", - "picture", - "figure", + "script", + "style", + "iframe", + "noscript", + "meta", + "link", + "object", + "embed", + "canvas", + "audio", + "video" ]; manuallyCleanedElements.forEach((tag) => $(tag).remove()); From 691dedc351d88f6daece4f19960ee1744cd172de Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:22:33 +0530 Subject: [PATCH 62/79] fix: lesser restrictions --- server/src/markdownify/markdown.ts | 91 ++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 439a3a622..f50136f68 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -11,13 +11,34 @@ export async function parseMarkdown( const tidiedHtml = tidyHtml(html); - const t = new TurndownService(); + const t = new TurndownService({ + headingStyle: "atx", // ensures #### instead of ------ + codeBlockStyle: "fenced", + }); + + // --------------------------------------------- + // Fix 1: Proper ATX headings #### instead of underline-style + // --------------------------------------------- + t.addRule("forceAtxHeadings", { + filter: ["h1", "h2", "h3", "h4", "h5", "h6"], + replacement: (content: string, node: any) => { + const level = Number(node.nodeName.charAt(1)); + const clean = content.trim(); + return `\n${"#".repeat(level)} ${clean}\n`; + }, + }); + // --------------------------------------------- + // Remove SVGs + // --------------------------------------------- t.addRule("truncate-svg", { filter: "svg", replacement: () => "", }); + // --------------------------------------------- + // Improved paragraph cleanup + // --------------------------------------------- t.addRule("improved-paragraph", { filter: "p", replacement: (innerText: string) => { @@ -27,16 +48,28 @@ export async function parseMarkdown( }, }); + // --------------------------------------------- + // Fix 2: Inline link with fallback text + // --------------------------------------------- t.addRule("inlineLink", { filter: (node: any, opts: any) => - opts.linkStyle === "inlined" && - node.nodeName === "A" && - node.getAttribute("href"), + node.nodeName === "A" && node.getAttribute("href"), replacement: (content: string, node: any) => { + let text = content.trim(); + + // Fallback: aria-label β†’ title β†’ domain + if (!text) { + text = + node.getAttribute("aria-label")?.trim() || + node.getAttribute("title")?.trim() || + getDomainFromUrl(node.getAttribute("href")) || + "link"; + } + let href = node.getAttribute("href").trim(); - // Relative β†’ absolute + // relative β†’ absolute if (baseUrl && isRelativeUrl(href)) { try { const u = new URL(href, baseUrl); @@ -44,45 +77,46 @@ export async function parseMarkdown( } catch {} } - // Clean URL href = cleanUrl(href); - const title = node.title ? ` "${cleanAttribute(node.title)}"` : ""; - return `[${content.trim()}](${href}${title})\n`; + return `[${text}](${href})`; }, }); t.use(gfm); - // --------------------------------------------------- - // Convert - // --------------------------------------------------- + // Convert HTML β†’ Markdown try { let out = await t.turndown(tidiedHtml); out = fixBrokenLinks(out); out = stripSkipLinks(out); - return out; + return out.trim(); } catch (err) { console.error("HTMLβ†’Markdown failed", { err }); return ""; } } -// --------------------------------------------- +// ----------------------------------------------------- // Helpers -// --------------------------------------------- +// ----------------------------------------------------- function isRelativeUrl(url: string): boolean { return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:"); } -function cleanUrl(u: string): string { +function getDomainFromUrl(url: string): string | null { try { - return u; + const u = new URL(url); + return u.hostname.replace("www.", ""); } catch { - return u; + return null; } } +function cleanUrl(u: string): string { + return u; +} + function cleanAttribute(attr: string) { return attr ? attr.replace(/(\n+\s*)+/g, "\n") : ""; } @@ -92,24 +126,23 @@ function tidyHtml(html: string): string { const $ = cheerio.load(html); const manuallyCleanedElements = [ - "script", - "style", - "iframe", - "noscript", - "meta", - "link", - "object", - "embed", - "canvas", - "audio", - "video" + "script", + "style", + "iframe", + "noscript", + "meta", + "link", + "object", + "embed", + "canvas", + "audio", + "video", ]; manuallyCleanedElements.forEach((tag) => $(tag).remove()); return $("body").html(); } - function fixBrokenLinks(md: string): string { let depth = 0; let result = ""; From 7f48e276f1370fc28cba8d9f98582516d84243e0 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:23:04 +0530 Subject: [PATCH 63/79] chore: lint --- server/src/markdownify/markdown.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index f50136f68..eb4567f3b 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -17,7 +17,7 @@ export async function parseMarkdown( }); // --------------------------------------------- - // Fix 1: Proper ATX headings #### instead of underline-style + // Proper ATX headings #### instead of underline-style // --------------------------------------------- t.addRule("forceAtxHeadings", { filter: ["h1", "h2", "h3", "h4", "h5", "h6"], @@ -49,7 +49,7 @@ export async function parseMarkdown( }); // --------------------------------------------- - // Fix 2: Inline link with fallback text + // Inline link with fallback text // --------------------------------------------- t.addRule("inlineLink", { filter: (node: any, opts: any) => @@ -74,7 +74,7 @@ export async function parseMarkdown( try { const u = new URL(href, baseUrl); href = u.toString(); - } catch {} + } catch { } } href = cleanUrl(href); From fef038b8cfb4dd8db1cd84c0303bab991e8ea64d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:24:15 +0530 Subject: [PATCH 64/79] chore: cleanup wanted deps --- package.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/package.json b/package.json index 1303410d7..405571436 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,6 @@ "bcrypt": "^5.1.1", "body-parser": "^1.20.3", "buffer": "^6.0.3", - "cheerio": "^1.1.2", "connect-pg-simple": "^10.0.0", "cookie-parser": "^1.4.6", "cors": "^2.8.5", @@ -50,7 +49,6 @@ "joplin-turndown-plugin-gfm": "^1.0.12", "jsonwebtoken": "^9.0.2", "jwt-decode": "^4.0.0", - "koffi": "^2.14.1", "lodash": "^4.17.21", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", From 5aafe6eaaf3feb0a40402542742be2037f0f7793 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:55:47 +0530 Subject: [PATCH 65/79] feat: add html --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 88daa49b4..9d88279bb 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -370,7 +370,7 @@ const RobotCreate: React.FC = () => { /> - Turn websites into LLM-ready Markdown content for AI apps. + Turn websites into LLM-ready Markdown & clean HTML for AI apps. From 418100c1698446dd7e6ec7f9d6b99ffed1067919 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:56:37 +0530 Subject: [PATCH 66/79] feat: scrape robot --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 9d88279bb..ec01b60b9 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -200,7 +200,7 @@ const RobotCreate: React.FC = () => { }} > - + From f3c79bd30322285e6d183e2bbf2e144cff0112a5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 17:59:30 +0530 Subject: [PATCH 67/79] feat: scrape robot --- src/components/robot/pages/RobotCreate.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index ec01b60b9..119e9be91 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -428,7 +428,7 @@ const RobotCreate: React.FC = () => { }} startIcon={isLoading ? : null} > - {isLoading ? 'Turning...' : 'Turn to Markdown'} + {isLoading ? 'Creating...' : 'Create Robot'} From e90cd9961e1b78cd475a50bf5ff15857142d7fc5 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 18:49:39 +0530 Subject: [PATCH 68/79] feat: add html scrape support --- server/src/api/record.ts | 122 ++++++++++++++---- server/src/markdownify/scrape.ts | 54 ++++++++ server/src/models/Robot.ts | 3 +- server/src/pgboss-worker.ts | 47 +++++-- server/src/routes/storage.ts | 19 ++- .../workflow-management/scheduler/index.ts | 75 ++++++++--- src/components/robot/RecordingsTable.tsx | 65 ++++++---- src/components/robot/pages/RobotCreate.tsx | 74 +++++++++-- .../robot/pages/RobotDuplicatePage.tsx | 3 +- src/components/robot/pages/RobotEditPage.tsx | 3 +- .../robot/pages/RobotSettingsPage.tsx | 3 +- src/context/globalInfo.tsx | 3 +- 12 files changed, 366 insertions(+), 105 deletions(-) diff --git a/server/src/api/record.ts b/server/src/api/record.ts index fd7376abc..cbf4f67e4 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -18,7 +18,7 @@ import { WorkflowFile } from "maxun-core"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; -import { convertPageToMarkdown } from '../markdownify/scrape'; +import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape'; chromium.use(stealthPlugin()); @@ -346,7 +346,8 @@ function formatRunResponse(run: any) { data: { textData: {}, listData: {}, - markdown: '' + markdown: '', + html: '' }, screenshots: [] as any[], }; @@ -365,6 +366,10 @@ function formatRunResponse(run: any) { formattedRun.data.markdown = output.markdown[0]?.content || ''; } + if (output.html && Array.isArray(output.html)) { + formattedRun.data.html = output.html[0]?.content || ''; + } + if (run.binaryOutput) { Object.keys(run.binaryOutput).forEach(key => { if (run.binaryOutput[key]) { @@ -575,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr } } -async function readyForRunHandler(browserId: string, id: string, userId: string){ +async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){ try { - const result = await executeRun(id, userId); + const result = await executeRun(id, userId, requestedFormats); if (result && result.success) { logger.log('info', `Interpretation of ${id} succeeded`); @@ -614,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) { return copy; }; -async function executeRun(id: string, userId: string) { +async function executeRun(id: string, userId: string, requestedFormats?: string[]) { let browser: any = null; try { @@ -657,12 +662,19 @@ async function executeRun(id: string, userId: string) { }; } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for API run ${id}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for API run ${id}`); + + let formats = recording.recording_meta.formats || ['markdown']; + + // Override if API request defines formats + if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) { + formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f)); + } await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to: ${formats.join(', ')}` }); try { @@ -672,20 +684,33 @@ async function executeRun(id: string, userId: string) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for API run ${id}`); + // Push success socket event try { const completionData = { runId: plainRun.runId, @@ -695,30 +720,45 @@ async function executeRun(id: string, userId: string) { finishedAt: new Date().toLocaleString() }; - serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + serverIo + .of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', completionData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); } - const webhookPayload = { + // Build webhook payload + const webhookPayload: any = { robot_id: plainRun.robotMetaId, run_id: plainRun.runId, robot_name: recording.recording_meta.name, status: 'success', started_at: plainRun.startedAt, finished_at: new Date().toLocaleString(), - markdown: markdown, metadata: { browser_id: plainRun.browserId, user_id: userId, - } + }, }; + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + try { await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); - logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot API run ${plainRun.runId}` + ); } catch (webhookError: any) { - logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); @@ -728,14 +768,18 @@ async function executeRun(id: string, userId: string) { interpretationInfo: run.toJSON() }; } catch (error: any) { - logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`); + logger.log( + 'error', + `${formats.join(', ')} conversion failed for API run ${id}: ${error.message}` + ); await run.update({ status: 'failed', finishedAt: new Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ')} conversion failed: ${error.message}`, }); + // Send failure socket event try { const failureData = { runId: plainRun.runId, @@ -745,9 +789,15 @@ async function executeRun(id: string, userId: string) { finishedAt: new Date().toLocaleString() }; - serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + serverIo + .of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', failureData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); @@ -953,7 +1003,7 @@ async function executeRun(id: string, userId: string) { } } -export async function handleRunRecording(id: string, userId: string) { +export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) { try { const result = await createWorkflowAndStoreMetadata(id, userId); const { browserId, runId: newRunId } = result; @@ -967,7 +1017,7 @@ export async function handleRunRecording(id: string, userId: string) { rejectUnauthorized: false }); - socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId)); + socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats)); logger.log('info', `Running Robot: ${id}`); @@ -1018,6 +1068,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { * type: string * required: true * description: The ID of the robot to run. + * requestBody: + * required: false + * content: + * application/json: + * schema: + * type: object + * properties: + * formats: + * type: array + * items: + * type: string + * enum: [markdown, html] + * description: Optional override formats for this run. + * example: + * formats: ["html"] * responses: * 200: * description: Robot run started successfully. @@ -1076,7 +1141,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, if (!req.user) { return res.status(401).json({ ok: false, error: 'Unauthorized' }); } - const runId = await handleRunRecording(req.params.id, req.user.id); + + const requestedFormats = req.body.formats; + + const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats); if (!runId) { throw new Error('Run ID is undefined'); diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index b58265a24..935fa0cb6 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise { const markdown = await parseMarkdown(cleanedHtml, url); return markdown; } + +/** + * Fetches a webpage, strips scripts/styles/images/etc, + * returns clean HTML. + */ +export async function convertPageToHTML(url: string): Promise { + const browser = await chromium.launch(); + const page = await browser.newPage(); + + await page.goto(url, { waitUntil: "networkidle" }); + + await page.addInitScript(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); + }); + + // Remove inline event handlers (onclick, onload…) + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); + }); + }); + + // Re-extract HTML after cleanup + const cleanedHtml = await page.evaluate(() => { + return document.documentElement.outerHTML; + }); + + await browser.close(); + + // Return cleaned HTML directly + return cleanedHtml; +} diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts index 5acbdf133..39218de24 100644 --- a/server/src/models/Robot.ts +++ b/server/src/models/Robot.ts @@ -9,8 +9,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index b2d5bdb30..66e852b85 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -20,7 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme import { io as serverIo } from "./server"; import { sendWebhook } from './routes/webhook'; import { BinaryOutputService } from './storage/mino'; -import { convertPageToMarkdown } from './markdownify/scrape'; +import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape'; if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) { throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); @@ -189,12 +189,14 @@ async function processRunExecution(job: Job) { throw new Error(`Recording for run ${data.runId} not found`); } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for run ${data.runId}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for run ${data.runId}`); + + const formats = recording.recording_meta.formats || ['markdown']; await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to ${formats.join(', ')}` }); try { @@ -204,20 +206,34 @@ async function processRunExecution(job: Job) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } + // Success update await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ').toUpperCase()} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for run ${data.runId}`); + // Notify sockets try { const completionData = { runId: data.runId, @@ -233,15 +249,19 @@ async function processRunExecution(job: Job) { logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`); } + // Webhooks try { - const webhookPayload = { + const webhookPayload: any = { runId: data.runId, robotId: plainRun.robotMetaId, robotName: recording.recording_meta.name, status: 'success', finishedAt: new Date().toLocaleString(), - markdown: markdown }; + + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`); } catch (webhookError: any) { @@ -251,13 +271,14 @@ async function processRunExecution(job: Job) { await destroyRemoteBrowser(browserId, data.userId); return { success: true }; + } catch (error: any) { - logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`); + logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`); await run.update({ status: 'failed', finishedAt: new Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`, }); try { diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index ee23ee442..44279e9cd 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -440,9 +440,9 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate /** * POST endpoint for creating a markdown robot */ -router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => { +router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => { try { - const { url, name } = req.body; + const { url, name, formats } = req.body; if (!url) { return res.status(400).json({ error: 'The "url" field is required.' }); @@ -459,6 +459,18 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ return res.status(400).json({ error: 'Invalid URL format' }); } + // Validate format + const validFormats = ['markdown', 'html']; + + if (!Array.isArray(formats) || formats.length === 0) { + return res.status(400).json({ error: 'At least one output format must be selected.' }); + } + + const invalid = formats.filter(f => !validFormats.includes(f)); + if (invalid.length > 0) { + return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` }); + } + const robotName = name || `Markdown Robot - ${new URL(url).hostname}`; const currentTimestamp = new Date().toLocaleString(); const robotId = uuid(); @@ -473,8 +485,9 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ updatedAt: currentTimestamp, pairs: 0, params: [], - type: 'markdown', + type: 'scrape', url: url, + formats: formats, }, recording: { workflow: [] }, google_sheet_email: null, diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 7c2cb4085..d5ba76f40 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core"; import { Page } from "playwright"; import { sendWebhook } from "../../routes/webhook"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; -import { convertPageToMarkdown } from "../../markdownify/scrape"; +import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape"; chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { @@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) { } } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for scheduled run ${id}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for scheduled run ${id}`); + + const formats = recording.recording_meta.formats || ['markdown']; await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to: ${formats.join(', ')}` }); try { @@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) { }; serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); - logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`); + logger.log( + 'info', + `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}` + ); } catch (socketError: any) { - logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}` + ); } try { @@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for scheduled run ${id}`); + // Run-completed socket notifications try { const completionData = { runId: plainRun.runId, @@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) { serverIo.of(plainRun.browserId).emit('run-completed', completionData); serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); } - const webhookPayload = { + // Webhook payload + const webhookPayload: any = { robot_id: plainRun.robotMetaId, run_id: plainRun.runId, robot_name: recording.recording_meta.name, status: 'success', started_at: plainRun.startedAt, finished_at: new Date().toLocaleString(), - markdown: markdown, metadata: { browser_id: plainRun.browserId, user_id: userId, } }; + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + try { await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); - logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}` + ); } catch (webhookError: any) { - logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); return true; + } catch (error: any) { - logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`); + logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`); await run.update({ status: 'failed', finishedAt: new Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ')} conversion failed: ${error.message}`, }); try { @@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) { serverIo.of(plainRun.browserId).emit('run-completed', failureData); serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx index f06270ed3..79319b923 100644 --- a/src/components/robot/RecordingsTable.tsx +++ b/src/components/robot/RecordingsTable.tsx @@ -110,7 +110,10 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { case 'integrate': return ( - handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} /> + handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} + robotType={row.type} + /> ); case 'options': @@ -121,6 +124,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])} handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])} handleDelete={() => handlers.handleDelete(row.id)} + robotType={row.type} /> ); @@ -709,13 +713,22 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => { interface IntegrateButtonProps { handleIntegrate: () => void; + robotType: string; } -const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => { +const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => { + const isDisabled = robotType === 'scrape'; + return ( - { - handleIntegrate(); - }} + @@ -742,9 +755,10 @@ interface OptionsButtonProps { handleEdit: () => void; handleDelete: () => void; handleDuplicate: () => void; + robotType: string; } -const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => { +const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => { const [anchorEl, setAnchorEl] = React.useState(null); const handleClick = (event: React.MouseEvent) => { @@ -771,34 +785,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat open={Boolean(anchorEl)} onClose={handleClose} > - { handleRetrain(); handleClose(); }}> - - - - {t('recordingtable.retrain')} - + {robotType !== 'scrape' && ( + { handleRetrain(); handleClose(); }}> + + + + Retrain + + )} { handleEdit(); handleClose(); }}> - - - - {t('recordingtable.edit')} + + Edit { handleDelete(); handleClose(); }}> - - - - {t('recordingtable.delete')} + + Delete - { handleDuplicate(); handleClose(); }}> - - - - {t('recordingtable.duplicate')} - + {robotType !== 'scrape' && ( + { handleDuplicate(); handleClose(); }}> + + Duplicate + + )} + ); }; diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 88daa49b4..312d7bae7 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -15,12 +15,16 @@ import { Container, CardContent, Tabs, - Tab + Tab, + RadioGroup, + Radio, + FormControl, + FormLabel } from '@mui/material'; import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; -import { createMarkdownRobot } from "../../../api/storage"; +import { createScrapeRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; @@ -54,11 +58,12 @@ const RobotCreate: React.FC = () => { const [tabValue, setTabValue] = useState(0); const [url, setUrl] = useState(''); - const [markdownRobotName, setMarkdownRobotName] = useState(''); + const [scrapeRobotName, setScrapeRobotName] = useState(''); const [needsLogin, setNeedsLogin] = useState(false); const [isLoading, setIsLoading] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false); const [activeBrowserId, setActiveBrowserId] = useState(''); + const [outputFormats, setOutputFormats] = useState([]); const { state } = React.useContext(AuthContext); const { user } = state; @@ -200,7 +205,7 @@ const RobotCreate: React.FC = () => { }} > - + @@ -370,7 +375,7 @@ const RobotCreate: React.FC = () => { /> - Turn websites into LLM-ready Markdown content for AI apps. + Turn websites into LLM-ready Markdown or clean HTML content for AI apps. @@ -378,8 +383,8 @@ const RobotCreate: React.FC = () => { placeholder="Example: YC Companies Scraper" variant="outlined" fullWidth - value={markdownRobotName} - onChange={(e) => setMarkdownRobotName(e.target.value)} + value={scrapeRobotName} + onChange={(e) => setScrapeRobotName(e.target.value)} sx={{ mb: 2 }} label="Robot Name" /> @@ -390,7 +395,44 @@ const RobotCreate: React.FC = () => { value={url} onChange={(e) => setUrl(e.target.value)} label="Website URL" + sx={{ mb: 2 }} /> + + + Output Format (Select at least one) + + { + if (e.target.checked) { + setOutputFormats([...outputFormats, 'markdown']); + } else { + setOutputFormats(outputFormats.filter(f => f !== 'markdown')); + } + }} + /> + } + label="Markdown" + /> + + { + if (e.target.checked) { + setOutputFormats([...outputFormats, 'html']); + } else { + setOutputFormats(outputFormats.filter(f => f !== 'html')); + } + }} + /> + } + label="HTML" + /> + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index 7c45c8e83..ac602f8e1 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,8 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 19b9e43b2..53424bb2f 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -24,8 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index 96b7d3ecf..f0f2f6ae0 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -16,8 +16,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index a0c79622a..973714b79 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,8 +27,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { From c89b2afed624df52c1f76cd0129f4265528cef16 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 18:52:28 +0530 Subject: [PATCH 69/79] feat: modify scrape api to support html --- src/api/storage.ts | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/api/storage.ts b/src/api/storage.ts index ca4d975cf..d2b28d5e2 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -28,15 +28,24 @@ export const getStoredRecordings = async (): Promise => { } }; -export const createMarkdownRobot = async (url: string, name?: string): Promise => { +export const createScrapeRobot = async ( + url: string, + name?: string, + formats: string[] = ['markdown'] +): Promise => { try { - const response = await axios.post(`${apiUrl}/storage/recordings/markdown`, { - url, - name, - }, { - headers: { 'Content-Type': 'application/json' }, - withCredentials: true - }); + const response = await axios.post( + `${apiUrl}/storage/recordings/scrape`, + { + url, + name, + formats, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); if (response.status === 201) { return response.data; From 0987183bac73f5875c98798e9639428474cc3379 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 18:59:32 +0530 Subject: [PATCH 70/79] chore: increase goto timeout scrape 100s --- server/src/markdownify/scrape.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 935fa0cb6..6821bfdb7 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -9,7 +9,7 @@ export async function convertPageToMarkdown(url: string): Promise { const browser = await chromium.launch(); const page = await browser.newPage(); - await page.goto(url, { waitUntil: "networkidle" }); + await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await page.addInitScript(() => { const selectors = [ @@ -64,7 +64,7 @@ export async function convertPageToHTML(url: string): Promise { const browser = await chromium.launch(); const page = await browser.newPage(); - await page.goto(url, { waitUntil: "networkidle" }); + await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await page.addInitScript(() => { const selectors = [ From ac0c70ebfe6f4f24b49677f8e32965ed1f2d9f90 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 19:18:11 +0530 Subject: [PATCH 71/79] feat: disable sheets and airtable scrape robot --- src/components/robot/RecordingsTable.tsx | 22 ++----- .../robot/pages/RobotIntegrationPage.tsx | 59 ++++++++++++++----- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx index 79319b923..aed9ea749 100644 --- a/src/components/robot/RecordingsTable.tsx +++ b/src/components/robot/RecordingsTable.tsx @@ -110,10 +110,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { case 'integrate': return ( - handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} - robotType={row.type} - /> + handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} /> ); case 'options': @@ -713,22 +710,13 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => { interface IntegrateButtonProps { handleIntegrate: () => void; - robotType: string; } -const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => { - const isDisabled = robotType === 'scrape'; - +const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => { return ( - { + handleIntegrate(); + }} > diff --git a/src/components/robot/pages/RobotIntegrationPage.tsx b/src/components/robot/pages/RobotIntegrationPage.tsx index 3c8425901..9bedf3a54 100644 --- a/src/components/robot/pages/RobotIntegrationPage.tsx +++ b/src/components/robot/pages/RobotIntegrationPage.tsx @@ -128,6 +128,8 @@ export const RobotIntegrationPage = ({ "googleSheets" | "airtable" | "webhook" | null >(integrationType); + const isScrapeRobot = recording?.recording_meta?.type === "scrape"; + const authenticateWithGoogle = () => { if (!recordingId) { console.error("Cannot authenticate: recordingId is null"); @@ -729,26 +731,55 @@ export const RobotIntegrationPage = ({ width: "100%", }} > - - + From b2b5a914e7826077ad619fa295e197c943547766 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 19:40:48 +0530 Subject: [PATCH 76/79] chore: add telemetry for scrape robots and runs --- server/src/api/record.ts | 16 ++++++++++++++++ server/src/pgboss-worker.ts | 16 ++++++++++++++++ server/src/routes/storage.ts | 6 +++--- .../src/workflow-management/scheduler/index.ts | 16 ++++++++++++++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/server/src/api/record.ts b/server/src/api/record.ts index cbf4f67e4..7c665001e 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -761,6 +761,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ ); } + capture("maxun-oss-run-created-api", { + runId: plainRun.runId, + user_id: userId, + status: "success", + robot_type: "scrape", + formats + }); + await destroyRemoteBrowser(plainRun.browserId, userId); return { @@ -800,6 +808,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ ); } + capture("maxun-oss-run-created-api", { + runId: plainRun.runId, + user_id: userId, + status: "failed", + robot_type: "scrape", + formats + }); + await destroyRemoteBrowser(plainRun.browserId, userId); throw error; diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index 66e852b85..f5d719b46 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -268,6 +268,14 @@ async function processRunExecution(job: Job) { logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`); } + capture("maxun-oss-run-created-manual", { + runId: data.runId, + user_id: data.userId, + status: "success", + robot_type: "scrape", + formats, + }); + await destroyRemoteBrowser(browserId, data.userId); return { success: true }; @@ -296,6 +304,14 @@ async function processRunExecution(job: Job) { logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`); } + capture("maxun-oss-run-created-manual", { + runId: data.runId, + user_id: data.userId, + status: "failed", + robot_type: "scrape", + formats, + }); + await destroyRemoteBrowser(browserId, data.userId); throw error; diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 44279e9cd..8451c7205 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -500,12 +500,12 @@ router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedReques logger.log('info', `Markdown robot created with id: ${newRobot.id}`); capture( - 'maxun-oss-markdown-robot-created', + 'maxun-oss-robot-created', { robot_meta: newRobot.recording_meta, - url: url, + recording: newRobot.recording, } - ); + ) return res.status(201).json({ message: 'Markdown robot created successfully.', diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index d5ba76f40..470cdacb3 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -321,6 +321,14 @@ async function executeRun(id: string, userId: string) { ); } + capture("maxun-oss-run-created-scheduled", { + runId: plainRun.runId, + user_id: userId, + status: "success", + robot_type: "scrape", + formats + }); + await destroyRemoteBrowser(plainRun.browserId, userId); return true; @@ -352,6 +360,14 @@ async function executeRun(id: string, userId: string) { ); } + capture("maxun-oss-run-created-scheduled", { + runId: plainRun.runId, + user_id: userId, + status: "failed", + robot_type: "scrape", + formats + }); + await destroyRemoteBrowser(plainRun.browserId, userId); throw error; From 467ffe39fa8575ee2821f5149a7065ed316da95a Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 20 Nov 2025 20:38:03 +0530 Subject: [PATCH 77/79] feat: rm display integrations scrape robot --- .../robot/pages/RobotIntegrationPage.tsx | 102 +++++++++--------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/src/components/robot/pages/RobotIntegrationPage.tsx b/src/components/robot/pages/RobotIntegrationPage.tsx index 9bedf3a54..8905abe21 100644 --- a/src/components/robot/pages/RobotIntegrationPage.tsx +++ b/src/components/robot/pages/RobotIntegrationPage.tsx @@ -731,54 +731,60 @@ export const RobotIntegrationPage = ({ width: "100%", }} > - - + {!isScrapeRobot && ( + + )} + + {!isScrapeRobot && ( + + )} +