From 01a26783580d10b6f8045a4084ec6388ac52b577 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:45:11 +0530
Subject: [PATCH 01/79] wip: get html

---
 server/src/markdownify/get_html.ts | 53 ++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 server/src/markdownify/get_html.ts
diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts
new file mode 100644
index 000000000..ea4c6788a
--- /dev/null
+++ b/server/src/markdownify/get_html.ts
@@ -0,0 +1,53 @@
+import { chromium, Browser, Page } from 'playwright';
+
+export interface GetPageSourceOptions {
+  wait?: number;
+  headless?: boolean;
+  userAgent?: string;
+}
+
+export async function getPageSource(
+  url: string,
+  options: GetPageSourceOptions = {}
+): Promise<string> {
+  const {
+    wait = 1.5,
+    headless = true,
+    userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
+  } = options;
+
+  let browser: Browser | null = null;
+  let page: Page | null = null;
+
+  try {
+    browser = await chromium.launch({ 
+      headless,
+      args: ['--no-sandbox', '--disable-dev-shm-usage']
+    });
+    
+    page = await browser.newPage();
+    await page.setUserAgent(userAgent);
+    
+    // Convert wait time to milliseconds
+    const waitMs = wait * 1000;
+    
+    // Set default timeout and navigate to URL
+    await page.setDefaultTimeout(waitMs);
+    await page.goto(url, { waitUntil: 'domcontentloaded' });
+    
+    // Wait for additional time if specified
+    if (waitMs > 0) {
+      await page.waitForTimeout(waitMs);
+    }
+    
+    const pageSource = await page.content();
+    return pageSource;
+    
+  } catch (error) {
+    console.error('Error while getting page source: ', error);
+    return '';
+  } finally {
+    if (page) await page.close();
+    if (browser) await browser.close();
+  }
+}
\ No newline at end of file

From 994142ae403d61e94adabbecdc04b3112863ec00 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:49:38 +0530
Subject: [PATCH 02/79] fix: define browser context

---
 server/src/markdownify/get_html.ts | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts
index ea4c6788a..3e459383d 100644
--- a/server/src/markdownify/get_html.ts
+++ b/server/src/markdownify/get_html.ts
@@ -1,4 +1,4 @@
-import { chromium, Browser, Page } from 'playwright';
+import { chromium, Browser, Page, BrowserContext } from 'playwright';
 
 export interface GetPageSourceOptions {
   wait?: number;
@@ -17,6 +17,7 @@ export async function getPageSource(
   } = options;
 
   let browser: Browser | null = null;
+  let context: BrowserContext | null = null;
   let page: Page | null = null;
 
   try {
@@ -25,8 +26,8 @@ export async function getPageSource(
       args: ['--no-sandbox', '--disable-dev-shm-usage']
     });
     
-    page = await browser.newPage();
-    await page.setUserAgent(userAgent);
+    context = await browser.newContext({ userAgent });
+    page = await context.newPage();
     
     // Convert wait time to milliseconds
     const waitMs = wait * 1000;
@@ -45,9 +46,9 @@ export async function getPageSource(
     
   } catch (error) {
     console.error('Error while getting page source: ', error);
-    return '';
   } finally {
     if (page) await page.close();
+    if (context) await context.close();
     if (browser) await browser.close();
   }
-}
\ No newline at end of file
+  }
\ No newline at end of file

From 0c9dc899c379c9c26603136d337875ca4893ce6f Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:50:28 +0530
Subject: [PATCH 03/79] feat: get input text for llm

---
 server/src/markdownify/get_llm_input_text.ts | 151 +++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 server/src/markdownify/get_llm_input_text.ts

diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
new file mode 100644
index 000000000..d3846373d
--- /dev/null
+++ b/server/src/markdownify/get_llm_input_text.ts
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: MIT
+
+import * as cheerio from 'cheerio';
+import { AnyAuthClient } from 'node_modules/google-auth-library/build/src';
+import { URL } from 'url';
+
+export interface ProcessTextOptions {
+  htmlParser?: boolean;
+  keepImages?: boolean;
+  removeSvgImage?: boolean;
+  removeGifImage?: boolean;
+  removeImageTypes?: string[];
+  keepWebpageLinks?: boolean;
+  removeScriptTag?: boolean;
+  removeStyleTag?: boolean;
+  removeTags?: string[];
+}
+
+export async function getProcessedText(
+  pageSource: string,
+  baseUrl: string,
+  options: ProcessTextOptions = {}
+): Promise<string> {
+  const {
+    keepImages = true,
+    removeSvgImage = true,
+    removeGifImage = true,
+    removeImageTypes = [],
+    keepWebpageLinks = true,
+    removeScriptTag = true,
+    removeStyleTag = true,
+    removeTags = []
+  } = options;
+
+  try {
+    const $ = cheerio.load(pageSource);
+    
+    // Remove tags
+    const tagsToRemove: string[] = [];
+    if (removeScriptTag) tagsToRemove.push('script');
+    if (removeStyleTag) tagsToRemove.push('style');
+    tagsToRemove.push(...removeTags);
+    
+    const uniqueTags = [...new Set(tagsToRemove)];
+    uniqueTags.forEach(tag => {
+      $(tag).remove();
+    });
+
+    // Process image links
+    const imageTypesToRemove: string[] = [];
+    if (removeSvgImage) imageTypesToRemove.push('.svg');
+    if (removeGifImage) imageTypesToRemove.push('.gif');
+    imageTypesToRemove.push(...removeImageTypes);
+    
+    const uniqueImageTypes = [...new Set(imageTypesToRemove)];
+    
+    $('img').each((_: any, element: any) => {
+      try {
+        const $img = $(element);
+        if (!keepImages) {
+          $img.remove();
+        } else {
+          const imageLink = $img.attr('src');
+          let typeReplaced = false;
+          
+          if (imageLink) {
+            if (uniqueImageTypes.length > 0) {
+              for (const imageType of uniqueImageTypes) {
+                if (!typeReplaced && imageLink.includes(imageType)) {
+                  $img.remove();
+                  typeReplaced = true;
+                  break;
+                }
+              }
+            }
+            if (!typeReplaced) {
+              const absoluteUrl = new URL(imageLink, baseUrl).toString();
+              $img.replaceWith('\n' + absoluteUrl + ' ');
+            }
+          }
+        }
+      } catch (error) {
+        console.error('Error while processing image link: ', error);
+      }
+    });
+
+    // Process website links
+    $('a[href]').each((_: any, element: any) => {
+      try {
+        const $link = $(element);
+        if (!keepWebpageLinks) {
+          $link.remove();
+        } else {
+          const href = $link.attr('href');
+          if (href) {
+            const absoluteUrl = new URL(href, baseUrl).toString();
+            $link.replaceWith($link.text() + ': ' + absoluteUrl + ' ');
+          }
+        }
+      } catch (error) {
+        console.error('Error while processing webpage link: ', error);
+      }
+    });
+
+    // Get text content
+    let text: string;
+    const bodyContent = $('body');
+    
+    if (bodyContent.length > 0) {
+      // For minification, we'll use a simple approach to clean up the HTML
+      const bodyHtml = bodyContent.html() || '';
+      const minimizedBody = minifyHtml(bodyHtml);
+      text = htmlToText(minimizedBody);
+    } else {
+      text = $.text();
+    }
+    
+    return text;
+
+  } catch (error) {
+    console.error('Error while getting processed text: ', error);
+    return '';
+  }
+}
+
+// Simple HTML minification function
+function minifyHtml(html: string): string {
+  return html
+    .replace(/\s+/g, ' ')
+    .replace(/>\s+</g, '><')
+    .trim();
+}
+
+// Convert HTML to text (simplified version of inscriptis functionality)
+function htmlToText(html: string): string {
+  const $ = cheerio.load(html);
+  
+  // Remove elements that shouldn't contribute to text
+  $('script, style, noscript').remove();
+  
+  // Get text content with basic formatting
+  let text = $('body').text() || $.text();
+  
+  // Clean up the text
+  text = text
+    .replace(/\s+/g, ' ')
+    .replace(/\n\s*\n/g, '\n')
+    .trim();
+    
+  return text;
+}
\ No newline at end of file

From 560f5a33003bef854a3170578f562e452e4c7b32 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:51:34 +0530
Subject: [PATCH 04/79] feat: get llm ready text

---
 server/src/markdownify/get_llm_ready_text.ts | 29 ++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 server/src/markdownify/get_llm_ready_text.ts

diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts
new file mode 100644
index 000000000..4d0515c6c
--- /dev/null
+++ b/server/src/markdownify/get_llm_ready_text.ts
@@ -0,0 +1,29 @@
+import { getPageSource, GetPageSourceOptions } from './get_html';
+import { getProcessedText, ProcessTextOptions } from './get_llm_input_text';
+
+export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {
+  // Combined options from both interfaces
+}
+
+export async function urlToLlmText(
+  url: string,
+  options: UrlToLlmTextOptions = {}
+): Promise<string> {
+  try {
+    const pageSource = await getPageSource(url, options);
+    
+    if (!pageSource) {
+      return '';
+    }
+
+    const llmText = await getProcessedText(pageSource, url, options);
+    return llmText;
+    
+  } catch (error) {
+    console.error('Error while scraping url: ', error);
+    return '';
+  }
+}
+
+// Export individual functions as well
+export { getPageSource, getProcessedText };
\ No newline at end of file

From 9b71cfc40cc30866afd6e4a6af74a45cf457ae1a Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:54:28 +0530
Subject: [PATCH 05/79] fix: return empty str on error

---
 server/src/markdownify/get_html.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts
index 3e459383d..dbf6a8a93 100644
--- a/server/src/markdownify/get_html.ts
+++ b/server/src/markdownify/get_html.ts
@@ -1,4 +1,4 @@
-import { chromium, Browser, Page, BrowserContext } from 'playwright';
+import { chromium, Browser, BrowserContext, Page } from 'playwright';
 
 export interface GetPageSourceOptions {
   wait?: number;
@@ -46,9 +46,10 @@ export async function getPageSource(
     
   } catch (error) {
     console.error('Error while getting page source: ', error);
+    return ''; // Explicitly return empty string on error
   } finally {
     if (page) await page.close();
     if (context) await context.close();
     if (browser) await browser.close();
   }
-  }
\ No newline at end of file
+}
\ No newline at end of file

From 191ac52ee3516120455ebe0945eaba8b9f4251ad Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 19:55:17 +0530
Subject: [PATCH 06/79] fix: return empty str on error

---
 server/src/markdownify/get_llm_input_text.ts | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
index d3846373d..d33c582be 100644
--- a/server/src/markdownify/get_llm_input_text.ts
+++ b/server/src/markdownify/get_llm_input_text.ts
@@ -1,7 +1,4 @@
-// SPDX-License-Identifier: MIT
-
 import * as cheerio from 'cheerio';
-import { AnyAuthClient } from 'node_modules/google-auth-library/build/src';
 import { URL } from 'url';
 
 export interface ProcessTextOptions {
@@ -54,7 +51,7 @@ export async function getProcessedText(
     
     const uniqueImageTypes = [...new Set(imageTypesToRemove)];
     
-    $('img').each((_: any, element: any) => {
+    $('img').each((_, element) => {
       try {
         const $img = $(element);
         if (!keepImages) {
@@ -85,7 +82,7 @@ export async function getProcessedText(
     });
 
     // Process website links
-    $('a[href]').each((_: any, element: any) => {
+    $('a[href]').each((_, element) => {
       try {
         const $link = $(element);
         if (!keepWebpageLinks) {
@@ -107,7 +104,6 @@ export async function getProcessedText(
     const bodyContent = $('body');
     
     if (bodyContent.length > 0) {
-      // For minification, we'll use a simple approach to clean up the HTML
       const bodyHtml = bodyContent.html() || '';
       const minimizedBody = minifyHtml(bodyHtml);
       text = htmlToText(minimizedBody);
@@ -119,11 +115,10 @@ export async function getProcessedText(
 
   } catch (error) {
     console.error('Error while getting processed text: ', error);
-    return '';
+    return ''; // Explicitly return empty string on error
   }
 }
 
-// Simple HTML minification function
 function minifyHtml(html: string): string {
   return html
     .replace(/\s+/g, ' ')
@@ -131,17 +126,13 @@ function minifyHtml(html: string): string {
     .trim();
 }
 
-// Convert HTML to text (simplified version of inscriptis functionality)
 function htmlToText(html: string): string {
   const $ = cheerio.load(html);
   
-  // Remove elements that shouldn't contribute to text
   $('script, style, noscript').remove();
   
-  // Get text content with basic formatting
   let text = $('body').text() || $.text();
   
-  // Clean up the text
   text = text
     .replace(/\s+/g, ' ')
     .replace(/\n\s*\n/g, '\n')

From af9570659f304422e5cc2c0aab30017a5b50335a Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 20:50:25 +0530
Subject: [PATCH 07/79] fix: get important content

---
 server/src/markdownify/get_llm_input_text.ts | 76 +++++++++++++-------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
index d33c582be..c7962392c 100644
--- a/server/src/markdownify/get_llm_input_text.ts
+++ b/server/src/markdownify/get_llm_input_text.ts
@@ -1,3 +1,4 @@
+
 import * as cheerio from 'cheerio';
 import { URL } from 'url';
 
@@ -81,17 +82,20 @@ export async function getProcessedText(
       }
     });
 
-    // Process website links
+    // Process website links - Preserve the link text AND the URL
     $('a[href]').each((_, element) => {
       try {
         const $link = $(element);
         if (!keepWebpageLinks) {
-          $link.remove();
+          // Just remove the link but keep the text
+          $link.replaceWith($link.text());
         } else {
           const href = $link.attr('href');
           if (href) {
             const absoluteUrl = new URL(href, baseUrl).toString();
-            $link.replaceWith($link.text() + ': ' + absoluteUrl + ' ');
+            const linkText = $link.text().trim();
+            // Keep both the link text and the URL
+            $link.replaceWith(linkText + ' [' + absoluteUrl + '] ');
           }
         }
       } catch (error) {
@@ -99,44 +103,64 @@ export async function getProcessedText(
       }
     });
 
-    // Get text content
+    // Get text content 
     let text: string;
+    
+    // Use a simpler approach to extract text
     const bodyContent = $('body');
     
     if (bodyContent.length > 0) {
-      const bodyHtml = bodyContent.html() || '';
-      const minimizedBody = minifyHtml(bodyHtml);
-      text = htmlToText(minimizedBody);
+      // Remove script and style tags that might have been missed
+      bodyContent.find('script, style, noscript').remove();
+      
+      // Get text with proper spacing
+      text = bodyContent
+        .contents()
+        .map((_, el) => {
+          if (el.type === 'text') {
+            return $(el).text();
+          }
+          if (el.type === 'tag') {
+            const $el = $(el);
+            const tagName = el.name?.toLowerCase();
+            
+            // Add appropriate spacing for block elements
+            if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) {
+              return $el.text() + '\n';
+            }
+            return $el.text() + ' ';
+          }
+          return '';
+        })
+        .get()
+        .join('');
     } else {
       text = $.text();
     }
     
+    // Clean up the text while preserving quotes
+    text = cleanText(text);
+    
     return text;
 
   } catch (error) {
     console.error('Error while getting processed text: ', error);
-    return ''; // Explicitly return empty string on error
+    return '';
   }
 }
 
-function minifyHtml(html: string): string {
-  return html
-    .replace(/\s+/g, ' ')
-    .replace(/>\s+</g, '><')
-    .trim();
-}
-
-function htmlToText(html: string): string {
-  const $ = cheerio.load(html);
+// Clean up text while preserving quotes and important content
+function cleanText(text: string): string {
+  if (!text) return '';
   
-  $('script, style, noscript').remove();
-  
-  let text = $('body').text() || $.text();
-  
-  text = text
-    .replace(/\s+/g, ' ')
-    .replace(/\n\s*\n/g, '\n')
+  return text
+    // Replace multiple spaces with single space, but be careful with quotes
+    .replace(/[^\S\n]+/g, ' ')
+    // Replace multiple newlines with max 2 newlines
+    .replace(/\n\s*\n/g, '\n\n')
+    // Clean up spaces around quotes but don't remove the quotes
+    .replace(/\s+"/g, ' "')
+    .replace(/"\s+/g, '" ')
+    // Remove leading/trailing whitespace
     .trim();
-    
-  return text;
 }
\ No newline at end of file

From a3891f6813fca50a989633beb5ab4e9787aea668 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 21:14:23 +0530
Subject: [PATCH 08/79] wip: markdown + plain text

---
 server/src/markdownify/get_llm_input_text.ts | 371 +++++++++++++------
 1 file changed, 266 insertions(+), 105 deletions(-)

diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
index c7962392c..fa0aec6cd 100644
--- a/server/src/markdownify/get_llm_input_text.ts
+++ b/server/src/markdownify/get_llm_input_text.ts
@@ -1,9 +1,7 @@
-
 import * as cheerio from 'cheerio';
 import { URL } from 'url';
 
 export interface ProcessTextOptions {
-  htmlParser?: boolean;
   keepImages?: boolean;
   removeSvgImage?: boolean;
   removeGifImage?: boolean;
@@ -12,13 +10,26 @@ export interface ProcessTextOptions {
   removeScriptTag?: boolean;
   removeStyleTag?: boolean;
   removeTags?: string[];
+  formatAsMarkdown?: boolean;
+}
+
+export interface ProcessedResult {
+  markdown: string;
+  plainText: string;
+  metadata: {
+    title: string;
+    url: string;
+    processedAt: string;
+    textLength: number;
+    markdownLength: number;
+  };
 }
 
 export async function getProcessedText(
   pageSource: string,
   baseUrl: string,
   options: ProcessTextOptions = {}
-): Promise<string> {
+): Promise<ProcessedResult> {
   const {
     keepImages = true,
     removeSvgImage = true,
@@ -27,13 +38,14 @@ export async function getProcessedText(
     keepWebpageLinks = true,
     removeScriptTag = true,
     removeStyleTag = true,
-    removeTags = []
+    removeTags = [],
+    formatAsMarkdown = true
   } = options;
 
   try {
     const $ = cheerio.load(pageSource);
     
-    // Remove tags
+    // Remove unwanted tags
     const tagsToRemove: string[] = [];
     if (removeScriptTag) tagsToRemove.push('script');
     if (removeStyleTag) tagsToRemove.push('style');
@@ -44,123 +56,272 @@ export async function getProcessedText(
       $(tag).remove();
     });
 
-    // Process image links
-    const imageTypesToRemove: string[] = [];
-    if (removeSvgImage) imageTypesToRemove.push('.svg');
-    if (removeGifImage) imageTypesToRemove.push('.gif');
-    imageTypesToRemove.push(...removeImageTypes);
+    // Extract page title
+    const title = $('title').text() || $('h1').first().text() || 'Untitled';
     
-    const uniqueImageTypes = [...new Set(imageTypesToRemove)];
+    // Generate both formats
+    const markdown = formatAsMarkdown ? 
+      convertToMarkdown($, baseUrl, options) : 
+      convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled
     
-    $('img').each((_, element) => {
-      try {
-        const $img = $(element);
-        if (!keepImages) {
-          $img.remove();
-        } else {
-          const imageLink = $img.attr('src');
-          let typeReplaced = false;
-          
-          if (imageLink) {
-            if (uniqueImageTypes.length > 0) {
-              for (const imageType of uniqueImageTypes) {
-                if (!typeReplaced && imageLink.includes(imageType)) {
-                  $img.remove();
-                  typeReplaced = true;
-                  break;
-                }
-              }
-            }
-            if (!typeReplaced) {
-              const absoluteUrl = new URL(imageLink, baseUrl).toString();
-              $img.replaceWith('\n' + absoluteUrl + ' ');
-            }
-          }
-        }
-      } catch (error) {
-        console.error('Error while processing image link: ', error);
+    const plainText = convertToPlainText($, baseUrl, options);
+
+    const result: ProcessedResult = {
+      markdown,
+      plainText,
+      metadata: {
+        title: title.trim(),
+        url: baseUrl,
+        processedAt: new Date().toISOString(),
+        textLength: plainText.length,
+        markdownLength: markdown.length
+      }
+    };
+
+    return result;
+
+  } catch (error) {
+    console.error('Error while getting processed text: ', error);
+    // Return empty result on error
+    return {
+      markdown: '',
+      plainText: '',
+      metadata: {
+        title: '',
+        url: baseUrl,
+        processedAt: new Date().toISOString(),
+        textLength: 0,
+        markdownLength: 0
+      }
+    };
+  }
+}
+
+function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
+  const { keepImages, keepWebpageLinks } = options;
+  
+  // Clone the body to avoid modifying the original
+  const $body = $('body').clone();
+  
+  // Process headers
+  $body.find('h1').each((_, element) => {
+    const $el = $(element);
+    $el.replaceWith(`# ${$el.text().trim()}\n\n`);
+  });
+  
+  $body.find('h2').each((_, element) => {
+    const $el = $(element);
+    $el.replaceWith(`## ${$el.text().trim()}\n\n`);
+  });
+  
+  $body.find('h3').each((_, element) => {
+    const $el = $(element);
+    $el.replaceWith(`### ${$el.text().trim()}\n\n`);
+  });
+  
+  $body.find('h4, h5, h6').each((_, element) => {
+    const $el = $(element);
+    const level = element.name?.substring(1) || '4';
+    const hashes = '#'.repeat(parseInt(level));
+    $el.replaceWith(`${hashes} ${$el.text().trim()}\n\n`);
+  });
+
+  // Process paragraphs
+  $body.find('p').each((_, element) => {
+    const $el = $(element);
+    $el.replaceWith(`${$el.text().trim()}\n\n`);
+  });
+
+  // Process lists
+  $body.find('li').each((_, element) => {
+    const $el = $(element);
+    const text = $el.text().trim();
+    if ($el.parent().is('ol')) {
+      $el.replaceWith(`1. ${text}\n`);
+    } else {
+      $el.replaceWith(`- ${text}\n`);
+    }
+  });
+
+  $body.find('ul, ol').each((_, element) => {
+    const $el = $(element);
+    $el.replaceWith(`\n${$el.html()}\n\n`);
+  });
+
+  // Process blockquotes
+  $body.find('blockquote').each((_, element) => {
+    const $el = $(element);
+    const text = $el.text().trim();
+    $el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`);
+  });
+
+  // Process code blocks
+  $body.find('pre').each((_, element) => {
+    const $el = $(element);
+    const text = $el.text().trim();
+    $el.replaceWith(`\`\`\`\n${text}\n\`\`\`\n\n`);
+  });
+
+  $body.find('code').each((_, element) => {
+    const $el = $(element);
+    // Only format inline code that's not inside pre blocks
+    if (!$el.closest('pre').length) {
+      const text = $el.text().trim();
+      $el.replaceWith(`\`${text}\``);
+    }
+  });
+
+  // Process images
+  if (keepImages) {
+    $body.find('img').each((_, element) => {
+      const $img = $(element);
+      const src = $img.attr('src');
+      const alt = $img.attr('alt') || '';
+      
+      if (src && !shouldRemoveImage(src, options)) {
+        const absoluteUrl = new URL(src, baseUrl).toString();
+        $img.replaceWith(`![${alt}](${absoluteUrl})\n\n`);
+      } else {
+        $img.remove();
       }
     });
+  } else {
+    $body.find('img').remove();
+  }
 
-    // Process website links - Preserve the link text AND the URL
-    $('a[href]').each((_, element) => {
-      try {
-        const $link = $(element);
-        if (!keepWebpageLinks) {
-          // Just remove the link but keep the text
-          $link.replaceWith($link.text());
-        } else {
-          const href = $link.attr('href');
-          if (href) {
-            const absoluteUrl = new URL(href, baseUrl).toString();
-            const linkText = $link.text().trim();
-            // Keep both the link text and the URL
-            $link.replaceWith(linkText + ' [' + absoluteUrl + '] ');
-          }
-        }
-      } catch (error) {
-        console.error('Error while processing webpage link: ', error);
+  // Process links
+  if (keepWebpageLinks) {
+    $body.find('a[href]').each((_, element) => {
+      const $link = $(element);
+      const href = $link.attr('href');
+      const text = $link.text().trim();
+      
+      if (href && text) {
+        const absoluteUrl = new URL(href, baseUrl).toString();
+        $link.replaceWith(`[${text}](${absoluteUrl})`);
+      } else if (text) {
+        $link.replaceWith(text);
+      } else {
+        $link.remove();
       }
     });
+  } else {
+    $body.find('a[href]').each((_, element) => {
+      const $link = $(element);
+      $link.replaceWith($link.text().trim());
+    });
+  }
 
-    // Get text content 
-    let text: string;
+  // Process tables (basic support)
+  $body.find('table').each((_, element) => {
+    const $table = $(element);
+    let markdownTable = '\n';
     
-    // Use a simpler approach to extract text
-    const bodyContent = $('body');
-    
-    if (bodyContent.length > 0) {
-      // Remove script and style tags that might have been missed
-      bodyContent.find('script, style, noscript').remove();
+    $table.find('tr').each((rowIndex, row) => {
+      const $row = $(row);
+      const cells: string[] = [];
       
-      // Get text with proper spacing
-      text = bodyContent
-        .contents()
-        .map((_, el) => {
-          if (el.type === 'text') {
-            return $(el).text();
-          }
-          if (el.type === 'tag') {
-            const $el = $(el);
-            const tagName = el.name?.toLowerCase();
-            
-            // Add appropriate spacing for block elements
-            if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) {
-              return $el.text() + '\n';
-            }
-            return $el.text() + ' ';
-          }
-          return '';
-        })
-        .get()
-        .join('');
-    } else {
-      text = $.text();
-    }
-    
-    // Clean up the text while preserving quotes
-    text = cleanText(text);
+      $row.find('th, td').each((_, cell) => {
+        const $cell = $(cell);
+        cells.push($cell.text().trim());
+      });
+      
+      if (cells.length > 0) {
+        markdownTable += `| ${cells.join(' | ')} |\n`;
+        
+        // Add header separator after first row
+        if (rowIndex === 0) {
+          markdownTable += `|${cells.map(() => '---').join('|')}|\n`;
+        }
+      }
+    });
     
-    return text;
+    $table.replaceWith(markdownTable + '\n');
+  });
 
-  } catch (error) {
-    console.error('Error while getting processed text: ', error);
-    return '';
+  // Get the final text and clean it up
+  let markdown = $body.text();
+  
+  // Clean up excessive whitespace while preserving structure
+  markdown = cleanMarkdown(markdown);
+  
+  return markdown;
+}
+
+function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
+  const { keepImages, keepWebpageLinks } = options;
+  
+  const $body = $('body').clone();
+  
+  // Process images
+  if (keepImages) {
+    $body.find('img').each((_, element) => {
+      const $img = $(element);
+      const src = $img.attr('src');
+      
+      if (src && !shouldRemoveImage(src, options)) {
+        const absoluteUrl = new URL(src, baseUrl).toString();
+        $img.replaceWith(`\nImage: ${absoluteUrl}\n`);
+      } else {
+        $img.remove();
+      }
+    });
+  } else {
+    $body.find('img').remove();
   }
+
+  // Process links
+  if (keepWebpageLinks) {
+    $body.find('a[href]').each((_, element) => {
+      const $link = $(element);
+      const href = $link.attr('href');
+      const text = $link.text().trim();
+      
+      if (href && text) {
+        const absoluteUrl = new URL(href, baseUrl).toString();
+        $link.replaceWith(`${text}: ${absoluteUrl} `);
+      }
+    });
+  } else {
+    $body.find('a[href]').each((_, element) => {
+      const $link = $(element);
+      $link.replaceWith($link.text().trim());
+    });
+  }
+
+  let text = $body.text();
+  text = cleanText(text);
+  
+  return text;
 }
 
-// Clean up text while preserving quotes and important content
-function cleanText(text: string): string {
-  if (!text) return '';
+function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean {
+  const { removeSvgImage, removeGifImage, removeImageTypes = [] } = options;
+  
+  const imageTypesToRemove: string[] = [];
+  if (removeSvgImage) imageTypesToRemove.push('.svg');
+  if (removeGifImage) imageTypesToRemove.push('.gif');
+  imageTypesToRemove.push(...removeImageTypes);
   
+  return imageTypesToRemove.some(type => src.includes(type));
+}
+
+function cleanMarkdown(markdown: string): string {
+  return markdown
+    // Replace 3+ newlines with 2 newlines
+    .replace(/\n{3,}/g, '\n\n')
+    // Remove excessive spaces
+    .replace(/[ ]{2,}/g, ' ')
+    // Clean up space around headers
+    .replace(/\n\s*(#+)\s*/g, '\n$1 ')
+    // Remove trailing whitespace
+    .replace(/[ \t]+$/gm, '')
+    .trim();
+}
+
+function cleanText(text: string): string {
   return text
-    // Replace multiple spaces with single space, but be careful with quotes
-    .replace(/[^\S\n]+/g, ' ')
-    // Replace multiple newlines with max 2 newlines
+    .replace(/\s+/g, ' ')
     .replace(/\n\s*\n/g, '\n\n')
-    // Clean up spaces around quotes but don't remove the quotes
-    .replace(/\s+"/g, ' "')
-    .replace(/"\s+/g, '" ')
-    // Remove leading/trailing whitespace
     .trim();
 }
\ No newline at end of file

From dae4e83412d8867e446841aef90022351ef6e574 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 21:18:11 +0530
Subject: [PATCH 09/79] wip: markdown + plain text

---
 server/src/markdownify/get_llm_ready_text.ts | 37 ++++++++++++++------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts
index 4d0515c6c..ed7849ec3 100644
--- a/server/src/markdownify/get_llm_ready_text.ts
+++ b/server/src/markdownify/get_llm_ready_text.ts
@@ -1,29 +1,46 @@
 import { getPageSource, GetPageSourceOptions } from './get_html';
-import { getProcessedText, ProcessTextOptions } from './get_llm_input_text';
+import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text';
 
-export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {
-  // Combined options from both interfaces
-}
+export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {}
 
 export async function urlToLlmText(
   url: string,
   options: UrlToLlmTextOptions = {}
-): Promise<string> {
+): Promise<ProcessedResult> {
   try {
     const pageSource = await getPageSource(url, options);
     
     if (!pageSource) {
-      return '';
+      return {
+        markdown: '',
+        plainText: '',
+        metadata: {
+          title: '',
+          url: url,
+          processedAt: new Date().toISOString(),
+          textLength: 0,
+          markdownLength: 0
+        }
+      };
     }
 
-    const llmText = await getProcessedText(pageSource, url, options);
-    return llmText;
+    const result = await getProcessedText(pageSource, url, options);
+    return result;
     
   } catch (error) {
     console.error('Error while scraping url: ', error);
-    return '';
+    return {
+      markdown: '',
+      plainText: '',
+      metadata: {
+        title: '',
+        url: url,
+        processedAt: new Date().toISOString(),
+        textLength: 0,
+        markdownLength: 0
+      }
+    };
   }
 }
 
-// Export individual functions as well
 export { getPageSource, getProcessedText };
\ No newline at end of file

From 28f1bf85102acca3daa65738bc3dcc2af1306b89 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 21:52:39 +0530
Subject: [PATCH 10/79] fix: better markdown output

---
 server/src/markdownify/get_llm_input_text.ts | 523 +++++++++++++------
 1 file changed, 363 insertions(+), 160 deletions(-)

diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
index fa0aec6cd..3e600140f 100644
--- a/server/src/markdownify/get_llm_input_text.ts
+++ b/server/src/markdownify/get_llm_input_text.ts
@@ -11,6 +11,9 @@ export interface ProcessTextOptions {
   removeStyleTag?: boolean;
   removeTags?: string[];
   formatAsMarkdown?: boolean;
+  maxContentLength?: number;
+  preserveLineBreaks?: boolean;
+  includeMetadata?: boolean;
 }
 
 export interface ProcessedResult {
@@ -18,13 +21,22 @@ export interface ProcessedResult {
   plainText: string;
   metadata: {
     title: string;
+    description: string;
     url: string;
     processedAt: string;
     textLength: number;
     markdownLength: number;
+    hasContent: boolean;
+    language?: string;
+    wordCount: number;
+    linkCount: number;
+    imageCount: number;
   };
 }
 
+// Global cheerio instance for helper functions
+let $: cheerio.CheerioAPI;
+
 export async function getProcessedText(
   pageSource: string,
   baseUrl: string,
@@ -39,16 +51,21 @@ export async function getProcessedText(
     removeScriptTag = true,
     removeStyleTag = true,
     removeTags = [],
-    formatAsMarkdown = true
+    formatAsMarkdown = true,
+    maxContentLength = 100000,
+    preserveLineBreaks = true,
+    includeMetadata = true
   } = options;
 
   try {
-    const $ = cheerio.load(pageSource);
+    // Initialize cheerio without problematic options
+    $ = cheerio.load(pageSource);
     
-    // Remove unwanted tags
+    // Remove unwanted tags completely
     const tagsToRemove: string[] = [];
     if (removeScriptTag) tagsToRemove.push('script');
     if (removeStyleTag) tagsToRemove.push('style');
+    if (removeScriptTag) tagsToRemove.push('noscript');
     tagsToRemove.push(...removeTags);
     
     const uniqueTags = [...new Set(tagsToRemove)];
@@ -56,25 +73,45 @@ export async function getProcessedText(
       $(tag).remove();
     });
 
-    // Extract page title
-    const title = $('title').text() || $('h1').first().text() || 'Untitled';
+    // Remove common unwanted elements
+    $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
     
+    // Extract metadata
+    const title = extractTitle();
+    const description = extractDescription();
+    const language = extractLanguage();
+
     // Generate both formats
     const markdown = formatAsMarkdown ? 
-      convertToMarkdown($, baseUrl, options) : 
-      convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled
+      convertToMarkdown(baseUrl, options) : 
+      '';
     
-    const plainText = convertToPlainText($, baseUrl, options);
+    const plainText = convertToPlainText(baseUrl, options);
+
+    // Truncate if necessary
+    const finalMarkdown = markdown.substring(0, maxContentLength);
+    const finalPlainText = plainText.substring(0, maxContentLength);
+
+    // Count elements
+    const linkCount = $('a[href]').length;
+    const imageCount = $('img').length;
+    const wordCount = countWords(finalPlainText);
 
     const result: ProcessedResult = {
-      markdown,
-      plainText,
+      markdown: finalMarkdown,
+      plainText: finalPlainText,
       metadata: {
-        title: title.trim(),
+        title,
+        description,
         url: baseUrl,
         processedAt: new Date().toISOString(),
-        textLength: plainText.length,
-        markdownLength: markdown.length
+        textLength: finalPlainText.length,
+        markdownLength: finalMarkdown.length,
+        hasContent: finalPlainText.length > 0,
+        language,
+        wordCount,
+        linkCount,
+        imageCount
       }
     };
 
@@ -82,186 +119,321 @@ export async function getProcessedText(
 
   } catch (error) {
     console.error('Error while getting processed text: ', error);
-    // Return empty result on error
-    return {
-      markdown: '',
-      plainText: '',
-      metadata: {
-        title: '',
-        url: baseUrl,
-        processedAt: new Date().toISOString(),
-        textLength: 0,
-        markdownLength: 0
-      }
-    };
+    return createEmptyResult(baseUrl);
   }
 }
 
-function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
-  const { keepImages, keepWebpageLinks } = options;
+function extractTitle(): string {
+  return $('title').text()?.trim() || 
+         $('meta[property="og:title"]').attr('content')?.trim() ||
+         $('h1').first().text()?.trim() || 
+         'Untitled';
+}
+
+function extractDescription(): string {
+  return $('meta[name="description"]').attr('content')?.trim() ||
+         $('meta[property="og:description"]').attr('content')?.trim() ||
+         '';
+}
+
+function extractLanguage(): string {
+  return $('html').attr('lang') || 'en';
+}
+
+function countWords(text: string): number {
+  return text.split(/\s+/).filter(word => word.length > 0).length;
+}
+
+function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string {
+  const { keepImages, keepWebpageLinks, preserveLineBreaks } = options;
   
+  // Start with metadata if available
+  let markdown = '';
+  const title = extractTitle();
+  if (title && title !== 'Untitled') {
+    markdown += `# ${title}\n\n`;
+  }
+
+  const description = extractDescription();
+  if (description) {
+    markdown += `> ${description}\n\n`;
+  }
+
   // Clone the body to avoid modifying the original
   const $body = $('body').clone();
   
-  // Process headers
-  $body.find('h1').each((_, element) => {
-    const $el = $(element);
-    $el.replaceWith(`# ${$el.text().trim()}\n\n`);
-  });
-  
-  $body.find('h2').each((_, element) => {
-    const $el = $(element);
-    $el.replaceWith(`## ${$el.text().trim()}\n\n`);
-  });
-  
-  $body.find('h3').each((_, element) => {
-    const $el = $(element);
-    $el.replaceWith(`### ${$el.text().trim()}\n\n`);
-  });
-  
-  $body.find('h4, h5, h6').each((_, element) => {
-    const $el = $(element);
-    const level = element.name?.substring(1) || '4';
-    const hashes = '#'.repeat(parseInt(level));
-    $el.replaceWith(`${hashes} ${$el.text().trim()}\n\n`);
-  });
+  // Remove unwanted elements from the clone
+  $body.find('script, style, noscript, meta, link').remove();
+  $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
 
-  // Process paragraphs
-  $body.find('p').each((_, element) => {
-    const $el = $(element);
-    $el.replaceWith(`${$el.text().trim()}\n\n`);
-  });
+  // Process in order of importance
+  const sections: string[] = [];
+
+  // Process main content areas first
+  const contentSelectors = [
+    'main', 'article', '[role="main"]', '.content', '.main', 
+    '#content', '#main', '.post', '.article'
+  ];
 
-  // Process lists
-  $body.find('li').each((_, element) => {
-    const $el = $(element);
-    const text = $el.text().trim();
-    if ($el.parent().is('ol')) {
-      $el.replaceWith(`1. ${text}\n`);
-    } else {
-      $el.replaceWith(`- ${text}\n`);
+  let mainContent = '';
+  for (const selector of contentSelectors) {
+    const $content = $body.find(selector).first();
+    if ($content.length > 0) {
+      mainContent = processElementToMarkdown($content, baseUrl, options, 0);
+      if (mainContent.trim().length > 100) { // Only use if substantial content
+        sections.push(mainContent);
+        $content.remove(); // Remove from body to avoid duplication
+        break;
+      }
     }
-  });
+  }
 
-  $body.find('ul, ol').each((_, element) => {
-    const $el = $(element);
-    $el.replaceWith(`\n${$el.html()}\n\n`);
-  });
+  // Process headers and structure
+  sections.push(processElementToMarkdown($body, baseUrl, options, 0));
 
-  // Process blockquotes
-  $body.find('blockquote').each((_, element) => {
-    const $el = $(element);
-    const text = $el.text().trim();
-    $el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`);
-  });
+  // Combine sections
+  markdown += sections.filter(s => s.trim().length > 0).join('\n\n');
 
-  // Process code blocks
-  $body.find('pre').each((_, element) => {
-    const $el = $(element);
-    const text = $el.text().trim();
-    $el.replaceWith(`\`\`\`\n${text}\n\`\`\`\n\n`);
-  });
+  // Final cleanup
+  markdown = cleanMarkdown(markdown, preserveLineBreaks);
+  
+  return markdown;
+}
 
-  $body.find('code').each((_, element) => {
-    const $el = $(element);
-    // Only format inline code that's not inside pre blocks
-    if (!$el.closest('pre').length) {
-      const text = $el.text().trim();
-      $el.replaceWith(`\`${text}\``);
-    }
-  });
+function processElementToMarkdown($element: cheerio.Cheerio<any>, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string {
+  if (depth > 10) return ''; // Prevent infinite recursion
+  
+  const { keepImages, keepWebpageLinks } = options;
+  let markdown = '';
 
-  // Process images
-  if (keepImages) {
-    $body.find('img').each((_, element) => {
-      const $img = $(element);
-      const src = $img.attr('src');
-      const alt = $img.attr('alt') || '';
-      
-      if (src && !shouldRemoveImage(src, options)) {
-        const absoluteUrl = new URL(src, baseUrl).toString();
-        $img.replaceWith(`![${alt}](${absoluteUrl})\n\n`);
-      } else {
-        $img.remove();
+  $element.contents().each((index, node) => {
+    if (node.type === 'text') {
+      const text = $(node).text().trim();
+      if (text) {
+        markdown += text + ' ';
       }
-    });
-  } else {
-    $body.find('img').remove();
-  }
+    } else if (node.type === 'tag') {
+      const $node = $(node);
+      const tagName = node.name?.toLowerCase() || '';
 
-  // Process links
-  if (keepWebpageLinks) {
-    $body.find('a[href]').each((_, element) => {
-      const $link = $(element);
-      const href = $link.attr('href');
-      const text = $link.text().trim();
-      
-      if (href && text) {
-        const absoluteUrl = new URL(href, baseUrl).toString();
-        $link.replaceWith(`[${text}](${absoluteUrl})`);
-      } else if (text) {
-        $link.replaceWith(text);
-      } else {
-        $link.remove();
+      switch (tagName) {
+        case 'h1':
+          markdown += `\n# ${$node.text().trim()}\n\n`;
+          break;
+        case 'h2':
+          markdown += `\n## ${$node.text().trim()}\n\n`;
+          break;
+        case 'h3':
+          markdown += `\n### ${$node.text().trim()}\n\n`;
+          break;
+        case 'h4':
+          markdown += `\n#### ${$node.text().trim()}\n\n`;
+          break;
+        case 'h5':
+          markdown += `\n##### ${$node.text().trim()}\n\n`;
+          break;
+        case 'h6':
+          markdown += `\n###### ${$node.text().trim()}\n\n`;
+          break;
+        case 'p':
+          const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1);
+          if (paragraphText.trim()) {
+            markdown += `\n${paragraphText.trim()}\n\n`;
+          }
+          break;
+        case 'br':
+          markdown += '\n';
+          break;
+        case 'hr':
+          markdown += '\n---\n\n';
+          break;
+        case 'strong':
+        case 'b':
+          const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1);
+          if (strongText.trim()) {
+            markdown += `**${strongText.trim()}**`;
+          }
+          break;
+        case 'em':
+        case 'i':
+          const emText = processElementToMarkdown($node, baseUrl, options, depth + 1);
+          if (emText.trim()) {
+            markdown += `*${emText.trim()}*`;
+          }
+          break;
+        case 'code':
+          if (!$node.closest('pre').length) {
+            const codeText = $node.text().trim();
+            if (codeText) {
+              markdown += `\`${codeText}\``;
+            }
+          }
+          break;
+        case 'pre':
+          const preText = $node.text().trim();
+          if (preText) {
+            const codeClass = $node.find('code').attr('class');
+            const language = codeClass ? codeClass.replace('language-', '') : '';
+            markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`;
+          }
+          break;
+        case 'blockquote':
+          const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1);
+          if (quoteText.trim()) {
+            const lines = quoteText.trim().split('\n');
+            markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n';
+          }
+          break;
+        case 'ul':
+          const listItems: string[] = [];
+          $node.find('> li').each((_, li) => {
+            const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1);
+            if (itemText.trim()) {
+              listItems.push(`- ${itemText.trim()}`);
+            }
+          });
+          if (listItems.length > 0) {
+            markdown += '\n' + listItems.join('\n') + '\n\n';
+          }
+          break;
+        case 'ol':
+          const olItems: string[] = [];
+          $node.find('> li').each((i, li) => {
+            const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1);
+            if (itemText.trim()) {
+              olItems.push(`${i + 1}. ${itemText.trim()}`);
+            }
+          });
+          if (olItems.length > 0) {
+            markdown += '\n' + olItems.join('\n') + '\n\n';
+          }
+          break;
+        case 'a':
+          if (keepWebpageLinks) {
+            const href = $node.attr('href');
+            const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim();
+            if (href && linkText) {
+              try {
+                const absoluteUrl = new URL(href, baseUrl).toString();
+                markdown += `[${linkText}](${absoluteUrl})`;
+              } catch {
+                markdown += linkText;
+              }
+            } else if (linkText) {
+              markdown += linkText;
+            }
+          } else {
+            markdown += processElementToMarkdown($node, baseUrl, options, depth + 1);
+          }
+          break;
+        case 'img':
+          if (keepImages) {
+            const src = $node.attr('src');
+            const alt = $node.attr('alt') || $node.attr('title') || '';
+            if (src && !shouldRemoveImage(src, options)) {
+              try {
+                const absoluteUrl = new URL(src, baseUrl).toString();
+                markdown += `![${alt}](${absoluteUrl})`;
+              } catch {
+                // Ignore invalid URLs
+              }
+            }
+          }
+          break;
+        case 'table':
+          markdown += processTableToMarkdown($node);
+          break;
+        case 'div':
+        case 'section':
+        case 'article':
+        case 'header':
+        case 'footer':
+        case 'nav':
+        case 'aside':
+          // Process block-level elements with their content
+          const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1);
+          if (blockContent.trim()) {
+            markdown += `\n${blockContent.trim()}\n\n`;
+          }
+          break;
+        default:
+          // For other tags, just process their content
+          markdown += processElementToMarkdown($node, baseUrl, options, depth + 1);
+          break;
       }
-    });
-  } else {
-    $body.find('a[href]').each((_, element) => {
-      const $link = $(element);
-      $link.replaceWith($link.text().trim());
-    });
-  }
+    }
+  });
+
+  return markdown;
+}
+
+function processTableToMarkdown($table: cheerio.Cheerio<any>): string {
+  const rows: string[][] = [];
+  let maxColumns = 0;
 
-  // Process tables (basic support)
-  $body.find('table').each((_, element) => {
-    const $table = $(element);
-    let markdownTable = '\n';
+  $table.find('tr').each((_, row) => {
+    const $row = $(row);
+    const cells: string[] = [];
     
-    $table.find('tr').each((rowIndex, row) => {
-      const $row = $(row);
-      const cells: string[] = [];
-      
-      $row.find('th, td').each((_, cell) => {
-        const $cell = $(cell);
-        cells.push($cell.text().trim());
-      });
+    $row.find('th, td').each((_, cell) => {
+      const $cell = $(cell);
+      const text = $cell.text().trim();
+      const colspan = parseInt($cell.attr('colspan') || '1');
       
-      if (cells.length > 0) {
-        markdownTable += `| ${cells.join(' | ')} |\n`;
-        
-        // Add header separator after first row
-        if (rowIndex === 0) {
-          markdownTable += `|${cells.map(() => '---').join('|')}|\n`;
-        }
+      cells.push(text);
+      // Add empty cells for colspan
+      for (let i = 1; i < colspan; i++) {
+        cells.push('');
       }
     });
     
-    $table.replaceWith(markdownTable + '\n');
+    if (cells.length > 0) {
+      rows.push(cells);
+      maxColumns = Math.max(maxColumns, cells.length);
+    }
   });
 
-  // Get the final text and clean it up
-  let markdown = $body.text();
+  if (rows.length === 0) return '';
+
+  let markdownTable = '\n';
   
-  // Clean up excessive whitespace while preserving structure
-  markdown = cleanMarkdown(markdown);
+  // Header row
+  if (rows.length > 0) {
+    markdownTable += `| ${rows[0].join(' | ')} |\n`;
+    markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`;
+    
+    // Data rows
+    for (let i = 1; i < rows.length; i++) {
+      markdownTable += `| ${rows[i].join(' | ')} |\n`;
+    }
+  }
   
-  return markdown;
+  return markdownTable + '\n';
 }
 
-function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
+function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string {
   const { keepImages, keepWebpageLinks } = options;
   
   const $body = $('body').clone();
   
+  // Remove unwanted elements
+  $body.find('script, style, noscript, meta, link').remove();
+  $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
+
   // Process images
   if (keepImages) {
     $body.find('img').each((_, element) => {
       const $img = $(element);
       const src = $img.attr('src');
+      const alt = $img.attr('alt') || '';
       
       if (src && !shouldRemoveImage(src, options)) {
-        const absoluteUrl = new URL(src, baseUrl).toString();
-        $img.replaceWith(`\nImage: ${absoluteUrl}\n`);
+        try {
+          const absoluteUrl = new URL(src, baseUrl).toString();
+          $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`);
+        } catch {
+          $img.remove();
+        }
       } else {
         $img.remove();
       }
@@ -278,8 +450,12 @@ function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: Pro
       const text = $link.text().trim();
       
       if (href && text) {
-        const absoluteUrl = new URL(href, baseUrl).toString();
-        $link.replaceWith(`${text}: ${absoluteUrl} `);
+        try {
+          const absoluteUrl = new URL(href, baseUrl).toString();
+          $link.replaceWith(`${text} (${absoluteUrl})`);
+        } catch {
+          $link.replaceWith(text);
+        }
       }
     });
   } else {
@@ -303,25 +479,52 @@ function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean {
   if (removeGifImage) imageTypesToRemove.push('.gif');
   imageTypesToRemove.push(...removeImageTypes);
   
-  return imageTypesToRemove.some(type => src.includes(type));
+  return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase()));
 }
 
-function cleanMarkdown(markdown: string): string {
+function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string {
   return markdown
-    // Replace 3+ newlines with 2 newlines
+    // Normalize line breaks
+    .replace(/\r\n/g, '\n')
+    // Remove excessive empty lines (keep max 2)
     .replace(/\n{3,}/g, '\n\n')
-    // Remove excessive spaces
-    .replace(/[ ]{2,}/g, ' ')
-    // Clean up space around headers
+    // Clean up spaces around headers
     .replace(/\n\s*(#+)\s*/g, '\n$1 ')
+    // Remove spaces at start of lines
+    .replace(/^\s+/gm, '')
     // Remove trailing whitespace
     .replace(/[ \t]+$/gm, '')
+    // Fix multiple spaces
+    .replace(/[ ]{2,}/g, ' ')
+    // Ensure proper spacing after paragraphs
+    .replace(/([^\n])\n([^\n])/g, '$1\n\n$2')
     .trim();
 }
 
 function cleanText(text: string): string {
   return text
+    .replace(/\r\n/g, '\n')
     .replace(/\s+/g, ' ')
     .replace(/\n\s*\n/g, '\n\n')
+    .replace(/[ ]{2,}/g, ' ')
     .trim();
+}
+
+function createEmptyResult(url: string): ProcessedResult {
+  return {
+    markdown: '',
+    plainText: '',
+    metadata: {
+      title: '',
+      description: '',
+      url: url,
+      processedAt: new Date().toISOString(),
+      textLength: 0,
+      markdownLength: 0,
+      hasContent: false,
+      wordCount: 0,
+      linkCount: 0,
+      imageCount: 0
+    }
+  };
 }
\ No newline at end of file

From 1651763fc288c0c8b663294ab05b7d3b40326580 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Mon, 17 Nov 2025 21:53:04 +0530
Subject: [PATCH 11/79] fix: better markdown output

---
 server/src/markdownify/get_llm_ready_text.ts | 46 ++++++++++----------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts
index ed7849ec3..025fb52d6 100644
--- a/server/src/markdownify/get_llm_ready_text.ts
+++ b/server/src/markdownify/get_llm_ready_text.ts
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: MIT
+
 import { getPageSource, GetPageSourceOptions } from './get_html';
 import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text';
 
@@ -11,17 +13,7 @@ export async function urlToLlmText(
     const pageSource = await getPageSource(url, options);
     
     if (!pageSource) {
-      return {
-        markdown: '',
-        plainText: '',
-        metadata: {
-          title: '',
-          url: url,
-          processedAt: new Date().toISOString(),
-          textLength: 0,
-          markdownLength: 0
-        }
-      };
+      return createEmptyResult(url);
     }
 
     const result = await getProcessedText(pageSource, url, options);
@@ -29,18 +21,28 @@ export async function urlToLlmText(
     
   } catch (error) {
     console.error('Error while scraping url: ', error);
-    return {
-      markdown: '',
-      plainText: '',
-      metadata: {
-        title: '',
-        url: url,
-        processedAt: new Date().toISOString(),
-        textLength: 0,
-        markdownLength: 0
-      }
-    };
+    return createEmptyResult(url);
   }
 }
 
+function createEmptyResult(url: string): ProcessedResult {
+  return {
+    markdown: '',
+    plainText: '',
+    metadata: {
+      title: '',
+      description: '',
+      url: url,
+      processedAt: new Date().toISOString(),
+      textLength: 0,
+      markdownLength: 0,
+      hasContent: false,
+      language: 'en',
+      wordCount: 0,
+      linkCount: 0,
+      imageCount: 0
+    }
+  };
+}
+
 export { getPageSource, getProcessedText };
\ No newline at end of file

From 6e6d6c68011533b42b886989726ad5830291b293 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Tue, 18 Nov 2025 23:27:22 +0530
Subject: [PATCH 12/79] chore(deps): install cheerio, turndown

---
 package.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/package.json b/package.json
index c70b0fef5..0a8a8f6c7 100644
--- a/package.json
+++ b/package.json
@@ -28,6 +28,7 @@
     "bcrypt": "^5.1.1",
     "body-parser": "^1.20.3",
     "buffer": "^6.0.3",
+    "cheerio": "^1.1.2",
     "connect-pg-simple": "^10.0.0",
     "cookie-parser": "^1.4.6",
     "cors": "^2.8.5",
@@ -80,6 +81,7 @@
     "styled-components": "^5.3.3",
     "swagger-jsdoc": "^6.2.8",
     "swagger-ui-express": "^5.0.1",
+    "turndown": "^7.2.2",
     "typedoc": "^0.23.8",
     "typescript": "^5.0.0",
     "uuid": "^8.3.2",
@@ -126,6 +128,7 @@
     "@types/styled-components": "^5.1.23",
     "@types/swagger-jsdoc": "^6.0.4",
     "@types/swagger-ui-express": "^4.1.6",
+    "@types/turndown": "^5.0.6",
     "@vitejs/plugin-react": "^4.3.3",
     "ajv": "^8.8.2",
     "concurrently": "^7.0.0",

From f22f6ef83daf13bd5c254c11527c786cff70dec5 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Tue, 18 Nov 2025 23:41:27 +0530
Subject: [PATCH 13/79] debug(temporary): turndown x amzn

---
 server/src/markdownify/debug_turndown.ts | 132 +++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 server/src/markdownify/debug_turndown.ts

diff --git a/server/src/markdownify/debug_turndown.ts b/server/src/markdownify/debug_turndown.ts
new file mode 100644
index 000000000..1d62b109f
--- /dev/null
+++ b/server/src/markdownify/debug_turndown.ts
@@ -0,0 +1,132 @@
+import { getPageSource } from './get_html';
+import { getProcessedText } from './get_llm_input_text';
+import * as cheerio from 'cheerio';
+import TurndownService from 'turndown';
+
+async function debugTurndown() {
+    const testUrls = [
+        "https://amazon.com/",
+    ];
+
+    for (const url of testUrls) {
+        console.log(`\n${'='.repeat(70)}`);
+        console.log(`🔍 Testing URL: ${url}`);
+        console.log(`${'='.repeat(70)}`);
+        
+        try {
+            const pageSource = await getPageSource(url, {
+                wait: 3.0, // Longer wait time
+                timeout: 15000 // 15 second timeout
+            });
+            
+            if (!pageSource || pageSource.length < 100) {
+                console.error("❌ No page source received or content too short");
+                continue;
+            }
+
+            // Save raw HTML for inspection
+            const fs = await import('fs/promises');
+            const domain = new URL(url).hostname;
+            await fs.writeFile(`debug_${domain}_raw.html`, pageSource);
+            console.log(`💾 Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`);
+
+            // Parse with cheerio
+            const $ = cheerio.load(pageSource);
+            
+            // Check what's in the body
+            const bodyText = $('body').text();
+            console.log(`📄 Body text length: ${bodyText.length} chars`);
+            console.log(`📄 Body preview: ${bodyText.substring(0, 200)}...`);
+
+            // Test content extraction
+            const contentSelectors = [
+                'main', 'article', '[role="main"]', '.content', '.main-content',
+                '#content', '#main', '.post', '.article'
+            ];
+
+            let mainContent: cheerio.Cheerio<any> = $('body');
+            let foundSelector = 'body (fallback)';
+            
+            for (const selector of contentSelectors) {
+                const $content = $(selector).first();
+                if ($content.length > 0 && $content.text().trim().length > 10) {
+                    console.log(`✅ Found content with selector: ${selector}`);
+                    console.log(`📝 Content text length: ${$content.text().length}`);
+                    mainContent = $content;
+                    foundSelector = selector;
+                    break;
+                }
+            }
+
+            console.log(`🎯 Using content from: ${foundSelector}`);
+
+            // Test Turndown directly
+            console.log("\n🧪 Testing Turndown directly...");
+            const turndownService = new TurndownService();
+            
+            if (mainContent.length > 0) {
+                const contentHtml = mainContent.html() || '';
+                if (contentHtml && contentHtml.length > 10) {
+                    console.log(`📦 Content HTML length: ${contentHtml.length} chars`);
+                    
+                    try {
+                        const contentMarkdown = turndownService.turndown(contentHtml);
+                        console.log(`📝 Turndown result length: ${contentMarkdown.length} chars`);
+                        
+                        if (contentMarkdown.length > 0) {
+                            console.log(`📝 Markdown preview: ${contentMarkdown.substring(0, 300)}...`);
+                            await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown);
+                            console.log(`💾 Turndown output saved to debug_${domain}_turndown.md`);
+                        } else {
+                            console.log("❌ Turndown produced empty markdown");
+                        }
+                    } catch (turndownError) {
+                        console.error("❌ Turndown conversion failed:", turndownError);
+                    }
+                } else {
+                    console.log("❌ No HTML content found for Turndown");
+                }
+            }
+
+            // Test our full function
+            console.log("\n🧪 Testing full getProcessedText function...");
+            const result = await getProcessedText(pageSource, url, {
+                keepImages: true,
+                keepWebpageLinks: true,
+                removeScriptTag: true,
+                removeStyleTag: true,
+                formatAsMarkdown: true
+            });
+
+            console.log("📊 Result metadata:");
+            console.log(`- Markdown length: ${result.metadata.markdownLength} chars`);
+            console.log(`- Plain text length: ${result.metadata.textLength} chars`);
+            console.log(`- Has content: ${result.metadata.hasContent}`);
+            console.log(`- Content score: ${result.metadata.contentScore}/10`);
+
+            if (result.markdown && result.markdown.length > 0) {
+                console.log(`📄 Markdown preview (300 chars):`);
+                console.log(result.markdown.substring(0, 300) + '...');
+                await fs.writeFile(`debug_${domain}_full.md`, result.markdown);
+                console.log(`💾 Full output saved to debug_${domain}_full.md`);
+            } else {
+                console.log("❌ Empty markdown from full function");
+                
+                // Debug why it's empty
+                if (result.plainText && result.plainText.length > 0) {
+                    console.log("ℹ️  But plain text has content, so markdown conversion failed");
+                    await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText);
+                    console.log(`💾 Plain text saved to debug_${domain}_plain.txt`);
+                }
+            }
+
+        } catch (error) {
+            console.error(`💥 Error processing ${url}:`, error);
+        }
+        
+        // Small delay between requests
+        await new Promise(resolve => setTimeout(resolve, 1000));
+    }
+}
+
+debugTurndown().catch(console.error);
\ No newline at end of file

From 0fa5397b45e42d2280734b7a436cef0d1d9a53c9 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Tue, 18 Nov 2025 23:42:20 +0530
Subject: [PATCH 14/79] debug(temporary): test url -> llm text

---
 server/src/markdownify/test.ts | 73 ++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 server/src/markdownify/test.ts

diff --git a/server/src/markdownify/test.ts b/server/src/markdownify/test.ts
new file mode 100644
index 000000000..16e2a2855
--- /dev/null
+++ b/server/src/markdownify/test.ts
@@ -0,0 +1,73 @@
+import { urlToLlmText } from './get_llm_ready_text';
+
+async function demoDualOutput() {
+    const testUrls = [
+        "https://quotes.toscrape.com/",
+        "https://httpbin.org/html",
+        "https://example.com",
+        "https://amazon.com"
+    ];
+
+    for (const url of testUrls) {
+        console.log(`\n${'='.repeat(70)}`);
+        console.log(`Processing: ${url}`);
+        console.log(`${'='.repeat(70)}`);
+
+        try {
+            const result = await urlToLlmText(url, {
+                keepImages: true,
+                keepWebpageLinks: true,
+                removeScriptTag: true,
+                removeStyleTag: true,
+                formatAsMarkdown: true
+            });
+
+            console.log(`\n METADATA:`);
+            console.log(`Title: ${result.metadata.title}`);
+            console.log(`URL: ${result.metadata.url}`);
+            console.log(`Processed: ${result.metadata.processedAt}`);
+            console.log(`Plain text length: ${result.metadata.textLength} chars`);
+            console.log(`Markdown length: ${result.metadata.markdownLength} chars`);
+            console.log(`Content Score: ${result.metadata.contentScore}/10`);
+
+            console.log(`\nPLAIN TEXT (first 600 chars):`);
+            console.log(`${result.plainText.substring(0, 600)}${result.plainText.length > 600 ? '...' : ''}`);
+
+            console.log(`\nMARKDOWN (first 600 chars):`);
+            console.log(`${result.markdown.substring(0, 600)}${result.markdown.length > 600 ? '...' : ''}`);
+
+            // Save both formats
+            const domain = new URL(url).hostname;
+            const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+            
+            await saveToFile(result.plainText, `output/${domain}_${timestamp}_plain.txt`);
+            await saveToFile(result.markdown, `output/${domain}_${timestamp}_markdown.md`);
+            
+            // Save metadata as JSON
+            await saveToFile(JSON.stringify(result.metadata, null, 2), `output/${domain}_${timestamp}_metadata.json`);
+
+            console.log(`\nSaved to output/ directory`);
+
+        } catch (error) {
+            console.error(`Error processing ${url}:`, error);
+        }
+    }
+}
+
+async function saveToFile(content: string, filename: string) {
+    const fs = await import('fs/promises');
+    const path = await import('path');
+    
+    try {
+        // Create directory if it doesn't exist
+        const dir = path.dirname(filename);
+        await fs.mkdir(dir, { recursive: true });
+        
+        await fs.writeFile(filename, content, 'utf-8');
+    } catch (error) {
+        console.error(`Error saving to ${filename}:`, error);
+    }
+}
+
+// Run the demo
+demoDualOutput().catch(console.error);
\ No newline at end of file

From 4158896e3c20d1449f5d167c4529aad81615b510 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 22:34:18 +0530
Subject: [PATCH 15/79] chore: link replace

---
 server/src/markdownify/html-to-markdown/go.mod | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 server/src/markdownify/html-to-markdown/go.mod

diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod
new file mode 100644
index 000000000..ada1bf6dc
--- /dev/null
+++ b/server/src/markdownify/html-to-markdown/go.mod
@@ -0,0 +1,18 @@
+module html-to-markdown
+
+go 1.23.0
+
+toolchain go1.24.0
+
+require (
+	github.com/PuerkitoBio/goquery v1.10.3
+	github.com/getmaxun/html-to-markdown/v2 v2.0.6
+	golang.org/x/net v0.43.0
+)
+
+require (
+	github.com/JohannesKaufmann/dom v0.2.0 // indirect
+	github.com/andybalholm/cascadia v1.3.3 // indirect
+)
+
+replace github.com/JohannesKaufmann/html-to-markdown/v2 => github.com/getmaxun/html-to-markdown/v2 v2.0.0

From 6c8850a0a7f674e11f8250189753ddf1ac2bb2f3 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 22:35:25 +0530
Subject: [PATCH 16/79] chore: link replace

---
 .../src/markdownify/html-to-markdown/go.sum   | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 server/src/markdownify/html-to-markdown/go.sum

diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum
new file mode 100644
index 000000000..f5cc5e695
--- /dev/null
+++ b/server/src/markdownify/html-to-markdown/go.sum
@@ -0,0 +1,83 @@
+github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ=
+github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo=
+github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
+github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
+github.com/getmaxun/html-to-markdown/v2 v2.0.6 h1:SXoxwR0TCCggwdEAlarhsrvMBHIbD9YqVTSkCYnZukc=
+github.com/getmaxun/html-to-markdown/v2 v2.0.6/go.mod h1:FjUN4bMyWtxmt2EpnEEOb5zu/GUSRk3PIr5ADTE4GBg=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/sebdah/goldie/v2 v2.7.1 h1:PkBHymaYdtvEkZV7TmyqKxdmn5/Vcj+8TpATWZjnG5E=
+github.com/sebdah/goldie/v2 v2.7.1/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
+github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
+github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA=
+github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
+golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
+golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

From 7da464755d0dd77dedacc827e9f1d389ef514f32 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 22:50:46 +0530
Subject: [PATCH 17/79] wip: to markdown

---
 .../html-to-markdown/html-to-markdown.go      | 157 ++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 server/src/markdownify/html-to-markdown/html-to-markdown.go

diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go
new file mode 100644
index 000000000..dce011c13
--- /dev/null
+++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go
@@ -0,0 +1,157 @@
+package main
+
+/*
+#include <stdlib.h>
+*/
+import "C"
+
+import (
+	"strings"
+	"unsafe"
+	"unicode/utf8"
+
+	"github.com/PuerkitoBio/goquery"
+	md "github.com/getmaxun/html-to-markdown/v2"
+	"github.com/getmaxun/html-to-markdown/v2/plugin"
+	converter "github.com/getmaxun/html-to-markdown/v2/converter"
+	"golang.org/x/net/html"
+)
+
+// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
+func ConvertHTMLToMarkdown(input *C.char) *C.char {
+	engine := converter.NewConverter("", true, nil)
+	// engine.Use(plugin.GitHubFlavored())
+
+	registerPreHandler(engine)
+
+	result, err := engine.ConvertString(C.GoString(input))
+	if err != nil {
+		// Deliberately ignored: on failure, whatever is in result is still returned to the caller.
+	}
+	// C.CString makes a C-heap copy of result; the caller is expected to release it via FreeCString.
+	return C.CString(result)
+}
+
+//export FreeCString
+// FreeCString releases a C-heap string with C.free (e.g. one returned by ConvertHTMLToMarkdown).
+func FreeCString(str *C.char) {
+	C.free(unsafe.Pointer(str))
+}
+
+func main() {
+	// Intentionally empty: package main must define main; callers use the exported functions instead.
+}
+
+// registerPreHandler configures a specialized PRE/code block rule
+// to properly extract nested content and detect languages.
+func registerPreHandler(conv *converter.Converter) {
+	isNoiseNode := func(class string) bool {
+		l := strings.ToLower(class)
+		return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers")
+	}
+	// findLanguage returns the suffix of the first "language-" or "lang-" class token, or "" if none.
+	findLanguage := func(sel *goquery.Selection) string {
+		cls := strings.ToLower(sel.AttrOr("class", ""))
+		for _, chunk := range strings.Fields(cls) {
+			if strings.HasPrefix(chunk, "language-") {
+				return strings.TrimPrefix(chunk, "language-")
+			}
+			if strings.HasPrefix(chunk, "lang-") {
+				return strings.TrimPrefix(chunk, "lang-")
+			}
+		}
+		return ""
+	}
+
+	// scrape appends the text of n's subtree to out: "\n" for <br>, and "\n" after each block-level element.
+	var scrape func(n *html.Node, out *strings.Builder)
+	scrape = func(n *html.Node, out *strings.Builder) {
+		if n == nil {
+			return
+		}
+
+		switch n.Type {
+		case html.TextNode:
+			out.WriteString(n.Data)
+
+		case html.ElementNode:
+			tag := strings.ToLower(n.Data)
+
+			// Skip the entire subtree of any element whose class contains "gutter" or "line-numbers".
+			for _, attr := range n.Attr {
+				if attr.Key == "class" && isNoiseNode(attr.Val) {
+					return
+				}
+			}
+
+			if tag == "br" {
+				out.WriteString("\n")
+			}
+
+			for child := n.FirstChild; child != nil; child = child.NextSibling {
+				scrape(child, out)
+			}
+
+			switch tag {
+			case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot",
+				"section", "article", "blockquote", "pre",
+				"h1", "h2", "h3", "h4", "h5", "h6":
+				out.WriteString("\n")
+			}
+		}
+	}
+
+	// PRE blocks: emit a fenced code block, tagged with the language found on the inner <code> or on the <pre>.
+	conv.AddRules(md.Rule{
+		Filter: []string{"pre"},
+		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
+			codeTag := s.Find("code").First()
+			lang := findLanguage(codeTag)
+			if lang == "" {
+				lang = findLanguage(s)
+			}
+
+			var buf strings.Builder
+			for _, node := range s.Nodes {
+				scrape(node, &buf)
+			}
+			// Trailing newlines are dropped so the closing fence follows the last line of code.
+			raw := strings.TrimRight(buf.String(), "\n")
+
+			fRune, _ := utf8.DecodeRuneInString(opt.Fence)
+			fence := md.CalculateCodeFence(fRune, raw)
+
+			block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n"
+			return md.String(block)
+		},
+	})
+
+	// Inline <code> outside of PRE: wrap the text in one, two or three backticks depending on its content.
+	conv.AddRules(md.Rule{
+		Filter: []string{"code"},
+		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
+			// Inside a PRE this rule yields nil; the PRE rule above emits the text instead.
+			if s.ParentsFiltered("pre").Length() > 0 {
+				return nil
+			}
+
+			var buf strings.Builder
+			for _, node := range s.Nodes {
+				scrape(node, &buf)
+			}
+
+			text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n"))
+
+			fence := "`"
+			if strings.Contains(text, "`") {
+				fence = "``"
+				if strings.Contains(text, "``") {
+					fence = "```"
+				}
+			}
+
+			inline := fence + text + fence
+			return md.String(inline)
+		},
+	})
+}

From dd1a9a8a85ffac8df612b163bd7d0cb952c60c12 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 23:42:44 +0530
Subject: [PATCH 18/79] chore(deps): install koffi

---
 package.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/package.json b/package.json
index 0a8a8f6c7..330db1e3d 100644
--- a/package.json
+++ b/package.json
@@ -49,6 +49,7 @@
     "joi": "^17.6.0",
     "jsonwebtoken": "^9.0.2",
     "jwt-decode": "^4.0.0",
+    "koffi": "^2.14.1",
     "lodash": "^4.17.21",
     "loglevel": "^1.8.0",
     "loglevel-plugin-remote": "^0.6.8",

From ec49565c44b81dfe6ae4af0757b47efd7027b8bc Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 23:59:14 +0530
Subject: [PATCH 19/79] chore: ignore build files

---
 server/src/markdownify/html-to-markdown/.gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 server/src/markdownify/html-to-markdown/.gitignore

diff --git a/server/src/markdownify/html-to-markdown/.gitignore b/server/src/markdownify/html-to-markdown/.gitignore
new file mode 100644
index 000000000..909db1866
--- /dev/null
+++ b/server/src/markdownify/html-to-markdown/.gitignore
@@ -0,0 +1,3 @@
+html-to-markdown.*
+!html-to-markdown.go
+libhtml-to-markdown.*
\ No newline at end of file

From f0d6712c3e45812206fadf3c35352e9b9422fca8 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 23:59:33 +0530
Subject: [PATCH 20/79] chore: build

---
 server/src/markdownify/html-to-markdown/go.mod | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod
index ada1bf6dc..f1d4f8774 100644
--- a/server/src/markdownify/html-to-markdown/go.mod
+++ b/server/src/markdownify/html-to-markdown/go.mod
@@ -6,13 +6,13 @@ toolchain go1.24.0
 
 require (
 	github.com/PuerkitoBio/goquery v1.10.3
-	github.com/getmaxun/html-to-markdown/v2 v2.0.6
+	github.com/getmaxun/html-to-markdown v1.0.1
 	golang.org/x/net v0.43.0
 )
 
 require (
-	github.com/JohannesKaufmann/dom v0.2.0 // indirect
 	github.com/andybalholm/cascadia v1.3.3 // indirect
+	gopkg.in/yaml.v2 v2.4.0 // indirect
 )
 
-replace github.com/JohannesKaufmann/html-to-markdown/v2 => github.com/getmaxun/html-to-markdown/v2 v2.0.0
+replace github.com/JohannesKaufmann/html-to-markdown => github.com/getmaxun/html-to-markdown v1.0.1

From da48d46f2a6bb2bda5375b522637e328241df6f7 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Wed, 19 Nov 2025 23:59:39 +0530
Subject: [PATCH 21/79] chore: build

---
 .../src/markdownify/html-to-markdown/go.sum   | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum
index f5cc5e695..d2fc77e55 100644
--- a/server/src/markdownify/html-to-markdown/go.sum
+++ b/server/src/markdownify/html-to-markdown/go.sum
@@ -1,21 +1,19 @@
-github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ=
-github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo=
 github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
 github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
 github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
 github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
-github.com/getmaxun/html-to-markdown/v2 v2.0.6 h1:SXoxwR0TCCggwdEAlarhsrvMBHIbD9YqVTSkCYnZukc=
-github.com/getmaxun/html-to-markdown/v2 v2.0.6/go.mod h1:FjUN4bMyWtxmt2EpnEEOb5zu/GUSRk3PIr5ADTE4GBg=
+github.com/getmaxun/html-to-markdown v1.0.1 h1:ter2Nby2EeYx0ichgZ/Pc6uo3aBudfPPgPicLTh02VI=
+github.com/getmaxun/html-to-markdown v1.0.1/go.mod h1:ggHEOofo3wcKaTOuD/z/pf3KJnQns0nQV+Gy/R8iE3U=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/sebdah/goldie/v2 v2.7.1 h1:PkBHymaYdtvEkZV7TmyqKxdmn5/Vcj+8TpATWZjnG5E=
-github.com/sebdah/goldie/v2 v2.7.1/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
-github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
-github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
+github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
+github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
+github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
+github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA=
-github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
+github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
+github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
@@ -81,3 +79,7 @@ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
 golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
 golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=

From 713d37465dec19420cc305abf74db5385251dd40 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 00:00:56 +0530
Subject: [PATCH 22/79] feat: to markdown

---
 .../markdownify/html-to-markdown/html-to-markdown.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go
index dce011c13..8d4202a66 100644
--- a/server/src/markdownify/html-to-markdown/html-to-markdown.go
+++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go
@@ -11,16 +11,16 @@ import (
 	"unicode/utf8"
 
 	"github.com/PuerkitoBio/goquery"
-	md "github.com/getmaxun/html-to-markdown/v2"
-	"github.com/getmaxun/html-to-markdown/v2/plugin"
-	converter "github.com/getmaxun/html-to-markdown/v2/converter"
+	md "github.com/getmaxun/html-to-markdown"
+	"github.com/getmaxun/html-to-markdown/plugin"
 	"golang.org/x/net/html"
 )
 
 // ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
+// Function name changed, comment rewritten.
 func ConvertHTMLToMarkdown(input *C.char) *C.char {
-	engine := converter.NewConverter("", true, nil)
-	// engine.Use(plugin.GitHubFlavored())
+	engine := md.NewConverter("", true, nil)
+	engine.Use(plugin.GitHubFlavored())
 
 	registerPreHandler(engine)
 
@@ -44,7 +44,7 @@ func main() {
 
 // registerPreHandler configures a specialized PRE/code block rule
 // to properly extract nested content and detect languages.
-func registerPreHandler(conv *converter.Converter) {
+func registerPreHandler(conv *md.Converter) {
 	isNoiseNode := func(class string) bool {
 		l := strings.ToLower(class)
 		return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers")

From 6c93cbc9a2d462bcc1315a8fa1d8699145d3dc70 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 02:42:44 +0530
Subject: [PATCH 23/79] feat: html -> markdown

---
 server/src/markdownify/markdown.ts | 141 +++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 server/src/markdownify/markdown.ts

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
new file mode 100644
index 000000000..8666f785a
--- /dev/null
+++ b/server/src/markdownify/markdown.ts
@@ -0,0 +1,141 @@
+import koffi from "koffi";
+import dotenv from "dotenv";
+import { randomUUID } from "node:crypto";
+import { stat } from "fs/promises";
+import path from "node:path";
+import os from "node:os";
+// Shared-library file extension for the current platform: ".so" unless win32 or darwin.
+const exts: Record<string, string> = {
+  win32: ".dll",
+  darwin: ".dylib",
+  default: ".so",
+};
+
+const ext = exts[os.platform()] || exts.default;
+
+// Build path to the binary **inside the same folder**
+export const GO_MARKDOWN_PARSER_PATH = path.join(
+  __dirname,
+  `html-to-markdown${ext}`
+);
+
+dotenv.config();
+
+// ---------------------------------------------
+// Native Go binding wrapper
+// ---------------------------------------------
+// Process-wide singleton around the Go shared library, loaded through koffi.
+class NativeMarkdownBridge {
+  private static singleton: NativeMarkdownBridge;
+  private fnConvert: any;
+  private constructor() {
+    const lib = koffi.load(GO_MARKDOWN_PARSER_PATH);
+    // The converter's result type is a koffi disposable string whose free function is FreeCString.
+    const freeFn = lib.func("FreeCString", "void", ["string"]);
+    // randomUUID is imported from node:crypto; the global `crypto` is not exposed by every Node version.
+    const trackedType = "CString:" + randomUUID();
+    const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn);
+    this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, ["string"]);
+  }
+
+  // Returns the shared instance, creating it on first use; throws if the library file cannot be stat'ed.
+  static async load(): Promise<NativeMarkdownBridge> {
+    if (!NativeMarkdownBridge.singleton) {
+      try {
+        await stat(GO_MARKDOWN_PARSER_PATH);
+      } catch {
+        throw new Error("Go shared library not found");
+      }
+      NativeMarkdownBridge.singleton = new NativeMarkdownBridge();
+    }
+    return NativeMarkdownBridge.singleton;
+  }
+
+  // Converts `html` via the native function's async call, adapting its callback to a Promise.
+  async run(html: string): Promise<string> {
+    return new Promise((resolve, reject) => {
+      this.fnConvert.async(html, (err: Error, output: string) => {
+        err ? reject(err) : resolve(output);
+      });
+    });
+  }
+}
+
+// ---------------------------------------------
+// Main exposed function
+// ---------------------------------------------
+// Converts an HTML string to Markdown: native Go converter first, Turndown as the fallback.
+// Returns "" for empty input or when the fallback conversion throws as well.
+export async function parseMarkdown(
+  html: string | null | undefined,
+): Promise<string> {
+  // Nothing to convert for null, undefined or empty input.
+  if (!html) return "";
+
+  // Preferred path: the native Go converter followed by the two link clean-ups.
+  // A throw anywhere in this block is logged and handled by the JS fallback below.
+  try {
+    const bridge = await NativeMarkdownBridge.load();
+    const converted = await bridge.run(html);
+    return stripSkipLinks(fixBrokenLinks(converted));
+  } catch (err: any) {
+    if (err?.message === "Go shared library not found") {
+      console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
+    } else {
+      console.log("Go markdown parser failed, falling back to JS parser:", err);
+    }
+  }
+
+  // Fallback path: Turndown plus the GFM plugin, both required only when needed.
+  const TurndownService = require("turndown");
+  const { gfm } = require("joplin-turndown-plugin-gfm");
+
+  const turndown = new TurndownService();
+  // Custom rule applied to <a> elements with an href when linkStyle is "inlined".
+  turndown.addRule("inlineLink", {
+    filter: (node: any, opts: any) =>
+      opts.linkStyle === "inlined" &&
+      node.nodeName === "A" &&
+      node.getAttribute("href"),
+    replacement: (content: string, node: any) => {
+      const target = node.getAttribute("href").trim();
+      const titlePart = node.title ? ` "${node.title}"` : "";
+      return `[${content.trim()}](${target}${titlePart})\n`;
+    },
+  });
+
+  turndown.use(gfm);
+
+  // Same clean-ups as the native path; a failure here is logged and yields "".
+  try {
+    const converted = await turndown.turndown(html);
+    return stripSkipLinks(fixBrokenLinks(converted));
+  } catch (err) {
+    console.error("HTML→Markdown failed", { err });
+    return "";
+  }
+}
+
+// ---------------------------------------------
+// Helpers
+// ---------------------------------------------
+function fixBrokenLinks(md: string): string {
+  // Number of "[" seen without a matching "]"; never drops below zero.
+  let openBrackets = 0;
+  const pieces: string[] = [];
+
+  for (const char of md) {
+    if (char === "[") {
+      openBrackets += 1;
+    } else if (char === "]" && openBrackets > 0) {
+      openBrackets -= 1;
+    }
+    // A newline met while a bracket is still open gets a backslash in front of it.
+    pieces.push(openBrackets > 0 && char === "\n" ? "\\\n" : char);
+  }
+  return pieces.join("");
+}
+// Removes every "[Skip to Content](#...)" link (case-insensitive, target starting with "#") from `md`.
+function stripSkipLinks(md: string): string {
+  return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
+}

From 0837ac50b90b6da79c1aca08055b6dc4351c5f4e Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 02:44:05 +0530
Subject: [PATCH 24/79] fix: go parser path

---
 server/src/markdownify/markdown.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index 8666f785a..c4709e6e5 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -16,6 +16,7 @@ const ext =
 // Build path to the binary **inside the same folder**
 export const GO_MARKDOWN_PARSER_PATH = path.join(
   __dirname,
+  "html-to-markdown",
   `html-to-markdown${ext}`
 );
 

From 66d829128293c08224a35f5bf108ec7c0ac69a3c Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 02:47:20 +0530
Subject: [PATCH 25/79] fix: export convert fxn

---
 server/src/markdownify/html-to-markdown/html-to-markdown.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go
index 8d4202a66..43c658a2f 100644
--- a/server/src/markdownify/html-to-markdown/html-to-markdown.go
+++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go
@@ -16,9 +16,9 @@ import (
 	"golang.org/x/net/html"
 )
 
-// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
-// Function name changed, comment rewritten.
+//export ConvertHTMLToMarkdown
 func ConvertHTMLToMarkdown(input *C.char) *C.char {
+	// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
 	engine := md.NewConverter("", true, nil)
 	engine.Use(plugin.GitHubFlavored())
 

From 1d65f900339831d081a336bd44998442a6f24f97 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:01:18 +0530
Subject: [PATCH 26/79] feat: use parser to scrape

---
 server/src/markdownify/scrape.ts | 57 ++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 server/src/markdownify/scrape.ts

diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
new file mode 100644
index 000000000..39d1fa3a3
--- /dev/null
+++ b/server/src/markdownify/scrape.ts
@@ -0,0 +1,57 @@
+import { chromium } from "playwright";
+import { parseMarkdown } from "./markdown";
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean Markdown using parser.
+ */
+export async function convertPageToMarkdown(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  const page = await browser.newPage();
+
+  await page.goto(url, { waitUntil: "networkidle" });
+
+  await page.addInitScript(() => {
+    const selectors = [
+      "script",
+      "style",
+      "link[rel='stylesheet']",
+      "noscript",
+      "meta",
+      "svg",
+      "img",
+      "picture",
+      "source",
+      "video",
+      "audio",
+      "iframe",
+      "object",
+      "embed"
+    ];
+
+    selectors.forEach(sel => {
+      document.querySelectorAll(sel).forEach(e => e.remove());
+    });
+
+    // Remove inline event handlers (onclick, onload…)
+    const all = document.querySelectorAll("*");
+    all.forEach(el => {
+      [...el.attributes].forEach(attr => {
+        if (attr.name.startsWith("on")) {
+          el.removeAttribute(attr.name);
+        }
+      });
+    });
+  });
+
+  // Re-extract HTML after cleanup
+  const cleanedHtml = await page.evaluate(() => {
+    return document.documentElement.outerHTML;
+  });
+
+  await browser.close();
+
+  // Convert cleaned HTML → Markdown
+  const markdown = await parseMarkdown(cleanedHtml || "");
+  return markdown;
+}

From 3fd9bb5e0ea521d64e01998887ddb291ef718340 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:01:42 +0530
Subject: [PATCH 27/79] chore(debug): test

---
 server/src/markdownify/test.ts | 77 +++-------------------------------
 1 file changed, 5 insertions(+), 72 deletions(-)

diff --git a/server/src/markdownify/test.ts b/server/src/markdownify/test.ts
index 16e2a2855..48db37dc2 100644
--- a/server/src/markdownify/test.ts
+++ b/server/src/markdownify/test.ts
@@ -1,73 +1,6 @@
-import { urlToLlmText } from './get_llm_ready_text';
+import { convertPageToMarkdown } from "./scrape";
 
-async function demoDualOutput() {
-    const testUrls = [
-        "https://quotes.toscrape.com/",
-        "https://httpbin.org/html",
-        "https://example.com",
-        "https://amazon.com"
-    ];
-
-    for (const url of testUrls) {
-        console.log(`\n${'='.repeat(70)}`);
-        console.log(`Processing: ${url}`);
-        console.log(`${'='.repeat(70)}`);
-
-        try {
-            const result = await urlToLlmText(url, {
-                keepImages: true,
-                keepWebpageLinks: true,
-                removeScriptTag: true,
-                removeStyleTag: true,
-                formatAsMarkdown: true
-            });
-
-            console.log(`\n METADATA:`);
-            console.log(`Title: ${result.metadata.title}`);
-            console.log(`URL: ${result.metadata.url}`);
-            console.log(`Processed: ${result.metadata.processedAt}`);
-            console.log(`Plain text length: ${result.metadata.textLength} chars`);
-            console.log(`Markdown length: ${result.metadata.markdownLength} chars`);
-            console.log(`Content Score: ${result.metadata.contentScore}/10`);
-
-            console.log(`\nPLAIN TEXT (first 600 chars):`);
-            console.log(`${result.plainText.substring(0, 600)}${result.plainText.length > 600 ? '...' : ''}`);
-
-            console.log(`\nMARKDOWN (first 600 chars):`);
-            console.log(`${result.markdown.substring(0, 600)}${result.markdown.length > 600 ? '...' : ''}`);
-
-            // Save both formats
-            const domain = new URL(url).hostname;
-            const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
-            
-            await saveToFile(result.plainText, `output/${domain}_${timestamp}_plain.txt`);
-            await saveToFile(result.markdown, `output/${domain}_${timestamp}_markdown.md`);
-            
-            // Save metadata as JSON
-            await saveToFile(JSON.stringify(result.metadata, null, 2), `output/${domain}_${timestamp}_metadata.json`);
-
-            console.log(`\nSaved to output/ directory`);
-
-        } catch (error) {
-            console.error(`Error processing ${url}:`, error);
-        }
-    }
-}
-
-async function saveToFile(content: string, filename: string) {
-    const fs = await import('fs/promises');
-    const path = await import('path');
-    
-    try {
-        // Create directory if it doesn't exist
-        const dir = path.dirname(filename);
-        await fs.mkdir(dir, { recursive: true });
-        
-        await fs.writeFile(filename, content, 'utf-8');
-    } catch (error) {
-        console.error(`Error saving to ${filename}:`, error);
-    }
-}
-
-// Run the demo
-demoDualOutput().catch(console.error);
\ No newline at end of file
+(async () => {
+  const md = await convertPageToMarkdown("https://quotes.toscrape.com/");
+  console.log(md);
+})();

From 1a291c22b6b70aa0c65ca88b884b7a6be7a04b13 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:38:19 +0530
Subject: [PATCH 28/79] chore: cleanup

---
 server/src/markdownify/get_html.ts           |  55 --
 server/src/markdownify/get_llm_input_text.ts | 530 -------------------
 server/src/markdownify/get_llm_ready_text.ts |  48 --
 3 files changed, 633 deletions(-)
 delete mode 100644 server/src/markdownify/get_html.ts
 delete mode 100644 server/src/markdownify/get_llm_input_text.ts
 delete mode 100644 server/src/markdownify/get_llm_ready_text.ts

diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts
deleted file mode 100644
index dbf6a8a93..000000000
--- a/server/src/markdownify/get_html.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import { chromium, Browser, BrowserContext, Page } from 'playwright';
-
-export interface GetPageSourceOptions {
-  wait?: number;
-  headless?: boolean;
-  userAgent?: string;
-}
-
-export async function getPageSource(
-  url: string,
-  options: GetPageSourceOptions = {}
-): Promise<string> {
-  const {
-    wait = 1.5,
-    headless = true,
-    userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
-  } = options;
-
-  let browser: Browser | null = null;
-  let context: BrowserContext | null = null;
-  let page: Page | null = null;
-
-  try {
-    browser = await chromium.launch({ 
-      headless,
-      args: ['--no-sandbox', '--disable-dev-shm-usage']
-    });
-    
-    context = await browser.newContext({ userAgent });
-    page = await context.newPage();
-    
-    // Convert wait time to milliseconds
-    const waitMs = wait * 1000;
-    
-    // Set default timeout and navigate to URL
-    await page.setDefaultTimeout(waitMs);
-    await page.goto(url, { waitUntil: 'domcontentloaded' });
-    
-    // Wait for additional time if specified
-    if (waitMs > 0) {
-      await page.waitForTimeout(waitMs);
-    }
-    
-    const pageSource = await page.content();
-    return pageSource;
-    
-  } catch (error) {
-    console.error('Error while getting page source: ', error);
-    return ''; // Explicitly return empty string on error
-  } finally {
-    if (page) await page.close();
-    if (context) await context.close();
-    if (browser) await browser.close();
-  }
-}
\ No newline at end of file
diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts
deleted file mode 100644
index 3e600140f..000000000
--- a/server/src/markdownify/get_llm_input_text.ts
+++ /dev/null
@@ -1,530 +0,0 @@
-import * as cheerio from 'cheerio';
-import { URL } from 'url';
-
-export interface ProcessTextOptions {
-  keepImages?: boolean;
-  removeSvgImage?: boolean;
-  removeGifImage?: boolean;
-  removeImageTypes?: string[];
-  keepWebpageLinks?: boolean;
-  removeScriptTag?: boolean;
-  removeStyleTag?: boolean;
-  removeTags?: string[];
-  formatAsMarkdown?: boolean;
-  maxContentLength?: number;
-  preserveLineBreaks?: boolean;
-  includeMetadata?: boolean;
-}
-
-export interface ProcessedResult {
-  markdown: string;
-  plainText: string;
-  metadata: {
-    title: string;
-    description: string;
-    url: string;
-    processedAt: string;
-    textLength: number;
-    markdownLength: number;
-    hasContent: boolean;
-    language?: string;
-    wordCount: number;
-    linkCount: number;
-    imageCount: number;
-  };
-}
-
-// Global cheerio instance for helper functions
-let $: cheerio.CheerioAPI;
-
-export async function getProcessedText(
-  pageSource: string,
-  baseUrl: string,
-  options: ProcessTextOptions = {}
-): Promise<ProcessedResult> {
-  const {
-    keepImages = true,
-    removeSvgImage = true,
-    removeGifImage = true,
-    removeImageTypes = [],
-    keepWebpageLinks = true,
-    removeScriptTag = true,
-    removeStyleTag = true,
-    removeTags = [],
-    formatAsMarkdown = true,
-    maxContentLength = 100000,
-    preserveLineBreaks = true,
-    includeMetadata = true
-  } = options;
-
-  try {
-    // Initialize cheerio without problematic options
-    $ = cheerio.load(pageSource);
-    
-    // Remove unwanted tags completely
-    const tagsToRemove: string[] = [];
-    if (removeScriptTag) tagsToRemove.push('script');
-    if (removeStyleTag) tagsToRemove.push('style');
-    if (removeScriptTag) tagsToRemove.push('noscript');
-    tagsToRemove.push(...removeTags);
-    
-    const uniqueTags = [...new Set(tagsToRemove)];
-    uniqueTags.forEach(tag => {
-      $(tag).remove();
-    });
-
-    // Remove common unwanted elements
-    $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
-    
-    // Extract metadata
-    const title = extractTitle();
-    const description = extractDescription();
-    const language = extractLanguage();
-
-    // Generate both formats
-    const markdown = formatAsMarkdown ? 
-      convertToMarkdown(baseUrl, options) : 
-      '';
-    
-    const plainText = convertToPlainText(baseUrl, options);
-
-    // Truncate if necessary
-    const finalMarkdown = markdown.substring(0, maxContentLength);
-    const finalPlainText = plainText.substring(0, maxContentLength);
-
-    // Count elements
-    const linkCount = $('a[href]').length;
-    const imageCount = $('img').length;
-    const wordCount = countWords(finalPlainText);
-
-    const result: ProcessedResult = {
-      markdown: finalMarkdown,
-      plainText: finalPlainText,
-      metadata: {
-        title,
-        description,
-        url: baseUrl,
-        processedAt: new Date().toISOString(),
-        textLength: finalPlainText.length,
-        markdownLength: finalMarkdown.length,
-        hasContent: finalPlainText.length > 0,
-        language,
-        wordCount,
-        linkCount,
-        imageCount
-      }
-    };
-
-    return result;
-
-  } catch (error) {
-    console.error('Error while getting processed text: ', error);
-    return createEmptyResult(baseUrl);
-  }
-}
-
-function extractTitle(): string {
-  return $('title').text()?.trim() || 
-         $('meta[property="og:title"]').attr('content')?.trim() ||
-         $('h1').first().text()?.trim() || 
-         'Untitled';
-}
-
-function extractDescription(): string {
-  return $('meta[name="description"]').attr('content')?.trim() ||
-         $('meta[property="og:description"]').attr('content')?.trim() ||
-         '';
-}
-
-function extractLanguage(): string {
-  return $('html').attr('lang') || 'en';
-}
-
-function countWords(text: string): number {
-  return text.split(/\s+/).filter(word => word.length > 0).length;
-}
-
-function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string {
-  const { keepImages, keepWebpageLinks, preserveLineBreaks } = options;
-  
-  // Start with metadata if available
-  let markdown = '';
-  const title = extractTitle();
-  if (title && title !== 'Untitled') {
-    markdown += `# ${title}\n\n`;
-  }
-
-  const description = extractDescription();
-  if (description) {
-    markdown += `> ${description}\n\n`;
-  }
-
-  // Clone the body to avoid modifying the original
-  const $body = $('body').clone();
-  
-  // Remove unwanted elements from the clone
-  $body.find('script, style, noscript, meta, link').remove();
-  $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
-
-  // Process in order of importance
-  const sections: string[] = [];
-
-  // Process main content areas first
-  const contentSelectors = [
-    'main', 'article', '[role="main"]', '.content', '.main', 
-    '#content', '#main', '.post', '.article'
-  ];
-
-  let mainContent = '';
-  for (const selector of contentSelectors) {
-    const $content = $body.find(selector).first();
-    if ($content.length > 0) {
-      mainContent = processElementToMarkdown($content, baseUrl, options, 0);
-      if (mainContent.trim().length > 100) { // Only use if substantial content
-        sections.push(mainContent);
-        $content.remove(); // Remove from body to avoid duplication
-        break;
-      }
-    }
-  }
-
-  // Process headers and structure
-  sections.push(processElementToMarkdown($body, baseUrl, options, 0));
-
-  // Combine sections
-  markdown += sections.filter(s => s.trim().length > 0).join('\n\n');
-
-  // Final cleanup
-  markdown = cleanMarkdown(markdown, preserveLineBreaks);
-  
-  return markdown;
-}
-
-function processElementToMarkdown($element: cheerio.Cheerio<any>, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string {
-  if (depth > 10) return ''; // Prevent infinite recursion
-  
-  const { keepImages, keepWebpageLinks } = options;
-  let markdown = '';
-
-  $element.contents().each((index, node) => {
-    if (node.type === 'text') {
-      const text = $(node).text().trim();
-      if (text) {
-        markdown += text + ' ';
-      }
-    } else if (node.type === 'tag') {
-      const $node = $(node);
-      const tagName = node.name?.toLowerCase() || '';
-
-      switch (tagName) {
-        case 'h1':
-          markdown += `\n# ${$node.text().trim()}\n\n`;
-          break;
-        case 'h2':
-          markdown += `\n## ${$node.text().trim()}\n\n`;
-          break;
-        case 'h3':
-          markdown += `\n### ${$node.text().trim()}\n\n`;
-          break;
-        case 'h4':
-          markdown += `\n#### ${$node.text().trim()}\n\n`;
-          break;
-        case 'h5':
-          markdown += `\n##### ${$node.text().trim()}\n\n`;
-          break;
-        case 'h6':
-          markdown += `\n###### ${$node.text().trim()}\n\n`;
-          break;
-        case 'p':
-          const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1);
-          if (paragraphText.trim()) {
-            markdown += `\n${paragraphText.trim()}\n\n`;
-          }
-          break;
-        case 'br':
-          markdown += '\n';
-          break;
-        case 'hr':
-          markdown += '\n---\n\n';
-          break;
-        case 'strong':
-        case 'b':
-          const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1);
-          if (strongText.trim()) {
-            markdown += `**${strongText.trim()}**`;
-          }
-          break;
-        case 'em':
-        case 'i':
-          const emText = processElementToMarkdown($node, baseUrl, options, depth + 1);
-          if (emText.trim()) {
-            markdown += `*${emText.trim()}*`;
-          }
-          break;
-        case 'code':
-          if (!$node.closest('pre').length) {
-            const codeText = $node.text().trim();
-            if (codeText) {
-              markdown += `\`${codeText}\``;
-            }
-          }
-          break;
-        case 'pre':
-          const preText = $node.text().trim();
-          if (preText) {
-            const codeClass = $node.find('code').attr('class');
-            const language = codeClass ? codeClass.replace('language-', '') : '';
-            markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`;
-          }
-          break;
-        case 'blockquote':
-          const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1);
-          if (quoteText.trim()) {
-            const lines = quoteText.trim().split('\n');
-            markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n';
-          }
-          break;
-        case 'ul':
-          const listItems: string[] = [];
-          $node.find('> li').each((_, li) => {
-            const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1);
-            if (itemText.trim()) {
-              listItems.push(`- ${itemText.trim()}`);
-            }
-          });
-          if (listItems.length > 0) {
-            markdown += '\n' + listItems.join('\n') + '\n\n';
-          }
-          break;
-        case 'ol':
-          const olItems: string[] = [];
-          $node.find('> li').each((i, li) => {
-            const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1);
-            if (itemText.trim()) {
-              olItems.push(`${i + 1}. ${itemText.trim()}`);
-            }
-          });
-          if (olItems.length > 0) {
-            markdown += '\n' + olItems.join('\n') + '\n\n';
-          }
-          break;
-        case 'a':
-          if (keepWebpageLinks) {
-            const href = $node.attr('href');
-            const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim();
-            if (href && linkText) {
-              try {
-                const absoluteUrl = new URL(href, baseUrl).toString();
-                markdown += `[${linkText}](${absoluteUrl})`;
-              } catch {
-                markdown += linkText;
-              }
-            } else if (linkText) {
-              markdown += linkText;
-            }
-          } else {
-            markdown += processElementToMarkdown($node, baseUrl, options, depth + 1);
-          }
-          break;
-        case 'img':
-          if (keepImages) {
-            const src = $node.attr('src');
-            const alt = $node.attr('alt') || $node.attr('title') || '';
-            if (src && !shouldRemoveImage(src, options)) {
-              try {
-                const absoluteUrl = new URL(src, baseUrl).toString();
-                markdown += `![${alt}](${absoluteUrl})`;
-              } catch {
-                // Ignore invalid URLs
-              }
-            }
-          }
-          break;
-        case 'table':
-          markdown += processTableToMarkdown($node);
-          break;
-        case 'div':
-        case 'section':
-        case 'article':
-        case 'header':
-        case 'footer':
-        case 'nav':
-        case 'aside':
-          // Process block-level elements with their content
-          const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1);
-          if (blockContent.trim()) {
-            markdown += `\n${blockContent.trim()}\n\n`;
-          }
-          break;
-        default:
-          // For other tags, just process their content
-          markdown += processElementToMarkdown($node, baseUrl, options, depth + 1);
-          break;
-      }
-    }
-  });
-
-  return markdown;
-}
-
-function processTableToMarkdown($table: cheerio.Cheerio<any>): string {
-  const rows: string[][] = [];
-  let maxColumns = 0;
-
-  $table.find('tr').each((_, row) => {
-    const $row = $(row);
-    const cells: string[] = [];
-    
-    $row.find('th, td').each((_, cell) => {
-      const $cell = $(cell);
-      const text = $cell.text().trim();
-      const colspan = parseInt($cell.attr('colspan') || '1');
-      
-      cells.push(text);
-      // Add empty cells for colspan
-      for (let i = 1; i < colspan; i++) {
-        cells.push('');
-      }
-    });
-    
-    if (cells.length > 0) {
-      rows.push(cells);
-      maxColumns = Math.max(maxColumns, cells.length);
-    }
-  });
-
-  if (rows.length === 0) return '';
-
-  let markdownTable = '\n';
-  
-  // Header row
-  if (rows.length > 0) {
-    markdownTable += `| ${rows[0].join(' | ')} |\n`;
-    markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`;
-    
-    // Data rows
-    for (let i = 1; i < rows.length; i++) {
-      markdownTable += `| ${rows[i].join(' | ')} |\n`;
-    }
-  }
-  
-  return markdownTable + '\n';
-}
-
-function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string {
-  const { keepImages, keepWebpageLinks } = options;
-  
-  const $body = $('body').clone();
-  
-  // Remove unwanted elements
-  $body.find('script, style, noscript, meta, link').remove();
-  $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
-
-  // Process images
-  if (keepImages) {
-    $body.find('img').each((_, element) => {
-      const $img = $(element);
-      const src = $img.attr('src');
-      const alt = $img.attr('alt') || '';
-      
-      if (src && !shouldRemoveImage(src, options)) {
-        try {
-          const absoluteUrl = new URL(src, baseUrl).toString();
-          $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`);
-        } catch {
-          $img.remove();
-        }
-      } else {
-        $img.remove();
-      }
-    });
-  } else {
-    $body.find('img').remove();
-  }
-
-  // Process links
-  if (keepWebpageLinks) {
-    $body.find('a[href]').each((_, element) => {
-      const $link = $(element);
-      const href = $link.attr('href');
-      const text = $link.text().trim();
-      
-      if (href && text) {
-        try {
-          const absoluteUrl = new URL(href, baseUrl).toString();
-          $link.replaceWith(`${text} (${absoluteUrl})`);
-        } catch {
-          $link.replaceWith(text);
-        }
-      }
-    });
-  } else {
-    $body.find('a[href]').each((_, element) => {
-      const $link = $(element);
-      $link.replaceWith($link.text().trim());
-    });
-  }
-
-  let text = $body.text();
-  text = cleanText(text);
-  
-  return text;
-}
-
-function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean {
-  const { removeSvgImage, removeGifImage, removeImageTypes = [] } = options;
-  
-  const imageTypesToRemove: string[] = [];
-  if (removeSvgImage) imageTypesToRemove.push('.svg');
-  if (removeGifImage) imageTypesToRemove.push('.gif');
-  imageTypesToRemove.push(...removeImageTypes);
-  
-  return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase()));
-}
-
-function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string {
-  return markdown
-    // Normalize line breaks
-    .replace(/\r\n/g, '\n')
-    // Remove excessive empty lines (keep max 2)
-    .replace(/\n{3,}/g, '\n\n')
-    // Clean up spaces around headers
-    .replace(/\n\s*(#+)\s*/g, '\n$1 ')
-    // Remove spaces at start of lines
-    .replace(/^\s+/gm, '')
-    // Remove trailing whitespace
-    .replace(/[ \t]+$/gm, '')
-    // Fix multiple spaces
-    .replace(/[ ]{2,}/g, ' ')
-    // Ensure proper spacing after paragraphs
-    .replace(/([^\n])\n([^\n])/g, '$1\n\n$2')
-    .trim();
-}
-
-function cleanText(text: string): string {
-  return text
-    .replace(/\r\n/g, '\n')
-    .replace(/\s+/g, ' ')
-    .replace(/\n\s*\n/g, '\n\n')
-    .replace(/[ ]{2,}/g, ' ')
-    .trim();
-}
-
-function createEmptyResult(url: string): ProcessedResult {
-  return {
-    markdown: '',
-    plainText: '',
-    metadata: {
-      title: '',
-      description: '',
-      url: url,
-      processedAt: new Date().toISOString(),
-      textLength: 0,
-      markdownLength: 0,
-      hasContent: false,
-      wordCount: 0,
-      linkCount: 0,
-      imageCount: 0
-    }
-  };
-}
\ No newline at end of file
diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts
deleted file mode 100644
index 025fb52d6..000000000
--- a/server/src/markdownify/get_llm_ready_text.ts
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: MIT
-
-import { getPageSource, GetPageSourceOptions } from './get_html';
-import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text';
-
-export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {}
-
-export async function urlToLlmText(
-  url: string,
-  options: UrlToLlmTextOptions = {}
-): Promise<ProcessedResult> {
-  try {
-    const pageSource = await getPageSource(url, options);
-    
-    if (!pageSource) {
-      return createEmptyResult(url);
-    }
-
-    const result = await getProcessedText(pageSource, url, options);
-    return result;
-    
-  } catch (error) {
-    console.error('Error while scraping url: ', error);
-    return createEmptyResult(url);
-  }
-}
-
-function createEmptyResult(url: string): ProcessedResult {
-  return {
-    markdown: '',
-    plainText: '',
-    metadata: {
-      title: '',
-      description: '',
-      url: url,
-      processedAt: new Date().toISOString(),
-      textLength: 0,
-      markdownLength: 0,
-      hasContent: false,
-      language: 'en',
-      wordCount: 0,
-      linkCount: 0,
-      imageCount: 0
-    }
-  };
-}
-
-export { getPageSource, getProcessedText };
\ No newline at end of file

From ecaa23f4b620542d5d560d9ae244c080921645f8 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:39:45 +0530
Subject: [PATCH 29/79] chore: install scrape plugins

---
 package.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/package.json b/package.json
index 330db1e3d..1303410d7 100644
--- a/package.json
+++ b/package.json
@@ -47,6 +47,7 @@
     "idcac-playwright": "^0.1.3",
     "ioredis": "^5.4.1",
     "joi": "^17.6.0",
+    "joplin-turndown-plugin-gfm": "^1.0.12",
     "jsonwebtoken": "^9.0.2",
     "jwt-decode": "^4.0.0",
     "koffi": "^2.14.1",

From 767fa5fe4fd80500ee0baedba71a94cda0316d65 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:48:30 +0530
Subject: [PATCH 30/79] chore: del go

---
 .../markdownify/html-to-markdown/.gitignore   |   3 -
 .../src/markdownify/html-to-markdown/go.mod   |  18 --
 .../src/markdownify/html-to-markdown/go.sum   |  85 ----------
 .../html-to-markdown/html-to-markdown.go      | 157 ------------------
 4 files changed, 263 deletions(-)
 delete mode 100644 server/src/markdownify/html-to-markdown/.gitignore
 delete mode 100644 server/src/markdownify/html-to-markdown/go.mod
 delete mode 100644 server/src/markdownify/html-to-markdown/go.sum
 delete mode 100644 server/src/markdownify/html-to-markdown/html-to-markdown.go

diff --git a/server/src/markdownify/html-to-markdown/.gitignore b/server/src/markdownify/html-to-markdown/.gitignore
deleted file mode 100644
index 909db1866..000000000
--- a/server/src/markdownify/html-to-markdown/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-html-to-markdown.*
-!html-to-markdown.go
-libhtml-to-markdown.*
\ No newline at end of file
diff --git a/server/src/markdownify/html-to-markdown/go.mod b/server/src/markdownify/html-to-markdown/go.mod
deleted file mode 100644
index f1d4f8774..000000000
--- a/server/src/markdownify/html-to-markdown/go.mod
+++ /dev/null
@@ -1,18 +0,0 @@
-module html-to-markdown
-
-go 1.23.0
-
-toolchain go1.24.0
-
-require (
-	github.com/PuerkitoBio/goquery v1.10.3
-	github.com/getmaxun/html-to-markdown v1.0.1
-	golang.org/x/net v0.43.0
-)
-
-require (
-	github.com/andybalholm/cascadia v1.3.3 // indirect
-	gopkg.in/yaml.v2 v2.4.0 // indirect
-)
-
-replace github.com/JohannesKaufmann/html-to-markdown => github.com/getmaxun/html-to-markdown v1.0.1
diff --git a/server/src/markdownify/html-to-markdown/go.sum b/server/src/markdownify/html-to-markdown/go.sum
deleted file mode 100644
index d2fc77e55..000000000
--- a/server/src/markdownify/html-to-markdown/go.sum
+++ /dev/null
@@ -1,85 +0,0 @@
-github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
-github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
-github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
-github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
-github.com/getmaxun/html-to-markdown v1.0.1 h1:ter2Nby2EeYx0ichgZ/Pc6uo3aBudfPPgPicLTh02VI=
-github.com/getmaxun/html-to-markdown v1.0.1/go.mod h1:ggHEOofo3wcKaTOuD/z/pf3KJnQns0nQV+Gy/R8iE3U=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
-github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
-github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
-github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
-github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
-github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
-golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
-golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
-golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
-golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
-golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
-golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
-golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
-golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
-golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
-golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
-golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
-golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
-golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
-golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
-golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
-golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
-golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
-golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
-golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
-golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
-golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
-golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
-golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
-golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
-golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
-golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
-golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
-golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
-golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
-golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
-golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
-golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
-golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
-gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go
deleted file mode 100644
index 43c658a2f..000000000
--- a/server/src/markdownify/html-to-markdown/html-to-markdown.go
+++ /dev/null
@@ -1,157 +0,0 @@
-package main
-
-/*
-#include <stdlib.h>
-*/
-import "C"
-
-import (
-	"strings"
-	"unsafe"
-	"unicode/utf8"
-
-	"github.com/PuerkitoBio/goquery"
-	md "github.com/getmaxun/html-to-markdown"
-	"github.com/getmaxun/html-to-markdown/plugin"
-	"golang.org/x/net/html"
-)
-
-//export ConvertHTMLToMarkdown
-func ConvertHTMLToMarkdown(input *C.char) *C.char {
-	// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
-	engine := md.NewConverter("", true, nil)
-	engine.Use(plugin.GitHubFlavored())
-
-	registerPreHandler(engine)
-
-	result, err := engine.ConvertString(C.GoString(input))
-	if err != nil {
-		// swallow conversion error (same as original)
-	}
-
-	return C.CString(result)
-}
-
-//export FreeCString
-// Frees C string memory.
-func FreeCString(str *C.char) {
-	C.free(unsafe.Pointer(str))
-}
-
-func main() {
-	// Required empty main for CGO.
-}
-
-// registerPreHandler configures a specialized PRE/code block rule
-// to properly extract nested content and detect languages.
-func registerPreHandler(conv *md.Converter) {
-	isNoiseNode := func(class string) bool {
-		l := strings.ToLower(class)
-		return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers")
-	}
-
-	findLanguage := func(sel *goquery.Selection) string {
-		cls := strings.ToLower(sel.AttrOr("class", ""))
-		for _, chunk := range strings.Fields(cls) {
-			if strings.HasPrefix(chunk, "language-") {
-				return strings.TrimPrefix(chunk, "language-")
-			}
-			if strings.HasPrefix(chunk, "lang-") {
-				return strings.TrimPrefix(chunk, "lang-")
-			}
-		}
-		return ""
-	}
-
-	// Walk nodes and extract visible text, injecting newlines at block boundaries.
-	var scrape func(n *html.Node, out *strings.Builder)
-	scrape = func(n *html.Node, out *strings.Builder) {
-		if n == nil {
-			return
-		}
-
-		switch n.Type {
-		case html.TextNode:
-			out.WriteString(n.Data)
-
-		case html.ElementNode:
-			tag := strings.ToLower(n.Data)
-
-			// skip gutter/line number elements
-			for _, attr := range n.Attr {
-				if attr.Key == "class" && isNoiseNode(attr.Val) {
-					return
-				}
-			}
-
-			if tag == "br" {
-				out.WriteString("\n")
-			}
-
-			for child := n.FirstChild; child != nil; child = child.NextSibling {
-				scrape(child, out)
-			}
-
-			switch tag {
-			case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot",
-				"section", "article", "blockquote", "pre",
-				"h1", "h2", "h3", "h4", "h5", "h6":
-				out.WriteString("\n")
-			}
-		}
-	}
-
-	// PRE blocks
-	conv.AddRules(md.Rule{
-		Filter: []string{"pre"},
-		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
-			codeTag := s.Find("code").First()
-			lang := findLanguage(codeTag)
-			if lang == "" {
-				lang = findLanguage(s)
-			}
-
-			var buf strings.Builder
-			for _, node := range s.Nodes {
-				scrape(node, &buf)
-			}
-
-			raw := strings.TrimRight(buf.String(), "\n")
-
-			fRune, _ := utf8.DecodeRuneInString(opt.Fence)
-			fence := md.CalculateCodeFence(fRune, raw)
-
-			block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n"
-			return md.String(block)
-		},
-	})
-
-	// Inline code rule
-	conv.AddRules(md.Rule{
-		Filter: []string{"code"},
-		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
-			// do nothing when inside PRE
-			if s.ParentsFiltered("pre").Length() > 0 {
-				return nil
-			}
-
-			var buf strings.Builder
-			for _, node := range s.Nodes {
-				scrape(node, &buf)
-			}
-
-			text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n"))
-
-			fence := "`"
-			if strings.Contains(text, "`") {
-				fence = "``"
-				if strings.Contains(text, "``") {
-					fence = "```"
-				}
-			}
-
-			inline := fence + text + fence
-			return md.String(inline)
-		},
-	})
-}

From b4644ba1065dce096e9f07f8d91eebb75d90e8aa Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:51:27 +0530
Subject: [PATCH 31/79] feat: use turndown

---
 server/src/markdownify/markdown.ts | 96 +++---------------------------
 1 file changed, 8 insertions(+), 88 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index c4709e6e5..d13992974 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -1,109 +1,27 @@
-import koffi from "koffi";
-import dotenv from "dotenv";
-import { stat } from "fs/promises";
-import path from "node:path";
-import os from "node:os";
+import TurndownService from "turndown";
+import { gfm } from "joplin-turndown-plugin-gfm";
 
-const exts = {
-  win32: ".dll",
-  darwin: ".dylib",
-  default: ".so",
-};
-
-const ext =
-  exts[os.platform() as keyof typeof exts] || exts.default;
-
-// Build path to the binary **inside the same folder**
-export const GO_MARKDOWN_PARSER_PATH = path.join(
-  __dirname,
-  "html-to-markdown",
-  `html-to-markdown${ext}`
-);
-
-dotenv.config();
-
-// ---------------------------------------------
-// Native Go binding wrapper
-// ---------------------------------------------
-class NativeMarkdownBridge {
-  private static singleton: NativeMarkdownBridge;
-  private fnConvert: any;
-
-  private constructor() {
-    const lib = koffi.load(GO_MARKDOWN_PARSER_PATH);
-
-    const freeFn = lib.func("FreeCString", "void", ["string"]);
-    const trackedType = "CString:" + crypto.randomUUID();
-    const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn);
-
-    this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [
-      "string",
-    ]);
-  }
-
-  static async load(): Promise<NativeMarkdownBridge> {
-    if (!NativeMarkdownBridge.singleton) {
-      try {
-        await stat(GO_MARKDOWN_PARSER_PATH);
-      } catch {
-        throw new Error("Go shared library not found");
-      }
-      NativeMarkdownBridge.singleton = new NativeMarkdownBridge();
-    }
-    return NativeMarkdownBridge.singleton;
-  }
-
-  async run(html: string): Promise<string> {
-    return new Promise((resolve, reject) => {
-      this.fnConvert.async(html, (err: Error, output: string) => {
-        err ? reject(err) : resolve(output);
-      });
-    });
-  }
-}
-
-// ---------------------------------------------
-// Main exposed function
-// ---------------------------------------------
 export async function parseMarkdown(
   html: string | null | undefined,
 ): Promise<string> {
   if (!html) return "";
 
-  // Try Go library first (if enabled)
-  try {
-      const engine = await NativeMarkdownBridge.load();
-      let md = await engine.run(html);
-
-      md = fixBrokenLinks(md);
-      md = stripSkipLinks(md);
-
-      return md;
-  } catch (err: any) {
-    if (err?.message !== "Go shared library not found") {
-        console.log("Go markdown parser failed, falling back to JS parser:", err);
-    } else {
-      console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
-    }
-  }
-
-  // Fallback parser
-  const TurndownService = require("turndown");
-  const { gfm } = require("joplin-turndown-plugin-gfm");
-
   const t = new TurndownService();
+
+  // Custom rule for inline links
   t.addRule("inlineLink", {
     filter: (node: any, opts: any) =>
       opts.linkStyle === "inlined" &&
       node.nodeName === "A" &&
       node.getAttribute("href"),
     replacement: (content: string, node: any) => {
-      const href = node.getAttribute("href").trim();
+      const href = node.getAttribute("href")?.trim() || "";
       const title = node.title ? ` "${node.title}"` : "";
       return `[${content.trim()}](${href}${title})\n`;
     },
   });
 
+  // GitHub-flavored markdown features
   t.use(gfm);
 
   try {
@@ -134,9 +52,11 @@ function fixBrokenLinks(md: string): string {
       result += ch;
     }
   }
+
   return result;
 }
 
 function stripSkipLinks(md: string): string {
   return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
 }
+

From b14d84d83ab24f9c35cbca7da1d05165af8e9b53 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 03:51:53 +0530
Subject: [PATCH 32/79] fix: -rm debug turndown

---
 server/src/markdownify/debug_turndown.ts | 132 -----------------------
 1 file changed, 132 deletions(-)
 delete mode 100644 server/src/markdownify/debug_turndown.ts

diff --git a/server/src/markdownify/debug_turndown.ts b/server/src/markdownify/debug_turndown.ts
deleted file mode 100644
index 1d62b109f..000000000
--- a/server/src/markdownify/debug_turndown.ts
+++ /dev/null
@@ -1,132 +0,0 @@
-import { getPageSource } from './get_html';
-import { getProcessedText } from './get_llm_input_text';
-import * as cheerio from 'cheerio';
-import TurndownService from 'turndown';
-
-async function debugTurndown() {
-    const testUrls = [
-        "https://amazon.com/",
-    ];
-
-    for (const url of testUrls) {
-        console.log(`\n${'='.repeat(70)}`);
-        console.log(`🔍 Testing URL: ${url}`);
-        console.log(`${'='.repeat(70)}`);
-        
-        try {
-            const pageSource = await getPageSource(url, {
-                wait: 3.0, // Longer wait time
-                timeout: 15000 // 15 second timeout
-            });
-            
-            if (!pageSource || pageSource.length < 100) {
-                console.error("❌ No page source received or content too short");
-                continue;
-            }
-
-            // Save raw HTML for inspection
-            const fs = await import('fs/promises');
-            const domain = new URL(url).hostname;
-            await fs.writeFile(`debug_${domain}_raw.html`, pageSource);
-            console.log(`💾 Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`);
-
-            // Parse with cheerio
-            const $ = cheerio.load(pageSource);
-            
-            // Check what's in the body
-            const bodyText = $('body').text();
-            console.log(`📄 Body text length: ${bodyText.length} chars`);
-            console.log(`📄 Body preview: ${bodyText.substring(0, 200)}...`);
-
-            // Test content extraction
-            const contentSelectors = [
-                'main', 'article', '[role="main"]', '.content', '.main-content',
-                '#content', '#main', '.post', '.article'
-            ];
-
-            let mainContent: cheerio.Cheerio<any> = $('body');
-            let foundSelector = 'body (fallback)';
-            
-            for (const selector of contentSelectors) {
-                const $content = $(selector).first();
-                if ($content.length > 0 && $content.text().trim().length > 10) {
-                    console.log(`✅ Found content with selector: ${selector}`);
-                    console.log(`📝 Content text length: ${$content.text().length}`);
-                    mainContent = $content;
-                    foundSelector = selector;
-                    break;
-                }
-            }
-
-            console.log(`🎯 Using content from: ${foundSelector}`);
-
-            // Test Turndown directly
-            console.log("\n🧪 Testing Turndown directly...");
-            const turndownService = new TurndownService();
-            
-            if (mainContent.length > 0) {
-                const contentHtml = mainContent.html() || '';
-                if (contentHtml && contentHtml.length > 10) {
-                    console.log(`📦 Content HTML length: ${contentHtml.length} chars`);
-                    
-                    try {
-                        const contentMarkdown = turndownService.turndown(contentHtml);
-                        console.log(`📝 Turndown result length: ${contentMarkdown.length} chars`);
-                        
-                        if (contentMarkdown.length > 0) {
-                            console.log(`📝 Markdown preview: ${contentMarkdown.substring(0, 300)}...`);
-                            await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown);
-                            console.log(`💾 Turndown output saved to debug_${domain}_turndown.md`);
-                        } else {
-                            console.log("❌ Turndown produced empty markdown");
-                        }
-                    } catch (turndownError) {
-                        console.error("❌ Turndown conversion failed:", turndownError);
-                    }
-                } else {
-                    console.log("❌ No HTML content found for Turndown");
-                }
-            }
-
-            // Test our full function
-            console.log("\n🧪 Testing full getProcessedText function...");
-            const result = await getProcessedText(pageSource, url, {
-                keepImages: true,
-                keepWebpageLinks: true,
-                removeScriptTag: true,
-                removeStyleTag: true,
-                formatAsMarkdown: true
-            });
-
-            console.log("📊 Result metadata:");
-            console.log(`- Markdown length: ${result.metadata.markdownLength} chars`);
-            console.log(`- Plain text length: ${result.metadata.textLength} chars`);
-            console.log(`- Has content: ${result.metadata.hasContent}`);
-            console.log(`- Content score: ${result.metadata.contentScore}/10`);
-
-            if (result.markdown && result.markdown.length > 0) {
-                console.log(`📄 Markdown preview (300 chars):`);
-                console.log(result.markdown.substring(0, 300) + '...');
-                await fs.writeFile(`debug_${domain}_full.md`, result.markdown);
-                console.log(`💾 Full output saved to debug_${domain}_full.md`);
-            } else {
-                console.log("❌ Empty markdown from full function");
-                
-                // Debug why it's empty
-                if (result.plainText && result.plainText.length > 0) {
-                    console.log("ℹ️  But plain text has content, so markdown conversion failed");
-                    await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText);
-                    console.log(`💾 Plain text saved to debug_${domain}_plain.txt`);
-                }
-            }
-
-        } catch (error) {
-            console.error(`💥 Error processing ${url}:`, error);
-        }
-        
-        // Small delay between requests
-        await new Promise(resolve => setTimeout(resolve, 1000));
-    }
-}
-
-debugTurndown().catch(console.error);
\ No newline at end of file

From 839f9fa5ce1a9e3e4f9f0a9e79a816089d8e0fb5 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 04:10:12 +0530
Subject: [PATCH 33/79] fix: plugin imports

---
 server/src/markdownify/markdown.ts | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index d13992974..0aeca9e71 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -1,27 +1,22 @@
-import TurndownService from "turndown";
-import { gfm } from "joplin-turndown-plugin-gfm";
-
 export async function parseMarkdown(
   html: string | null | undefined,
 ): Promise<string> {
-  if (!html) return "";
+  const TurndownService = require("turndown");
+  const { gfm } = require("joplin-turndown-plugin-gfm");
 
   const t = new TurndownService();
-
-  // Custom rule for inline links
   t.addRule("inlineLink", {
     filter: (node: any, opts: any) =>
       opts.linkStyle === "inlined" &&
       node.nodeName === "A" &&
       node.getAttribute("href"),
     replacement: (content: string, node: any) => {
-      const href = node.getAttribute("href")?.trim() || "";
+      const href = node.getAttribute("href").trim();
       const title = node.title ? ` "${node.title}"` : "";
       return `[${content.trim()}](${href}${title})\n`;
     },
   });
 
-  // GitHub-flavored markdown features
   t.use(gfm);
 
   try {
@@ -52,7 +47,6 @@ function fixBrokenLinks(md: string): string {
       result += ch;
     }
   }
-
   return result;
 }
 
@@ -60,3 +54,4 @@ function stripSkipLinks(md: string): string {
   return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
 }
 
+

From 0a7a1eb9b839d764eb3757486e7b7e827cceabde Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 04:21:41 +0530
Subject: [PATCH 34/79] fix: make baseUrl optional param

---
 server/src/markdownify/markdown.ts | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index 0aeca9e71..9ee7c7205 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -1,8 +1,10 @@
 export async function parseMarkdown(
   html: string | null | undefined,
+  baseUrl?: string | null
 ): Promise<string> {
   const TurndownService = require("turndown");
   const { gfm } = require("joplin-turndown-plugin-gfm");
+  const { URL } = require('url');
 
   const t = new TurndownService();
   t.addRule("inlineLink", {
@@ -11,7 +13,18 @@ export async function parseMarkdown(
       node.nodeName === "A" &&
       node.getAttribute("href"),
     replacement: (content: string, node: any) => {
-      const href = node.getAttribute("href").trim();
+      let href = node.getAttribute("href").trim();
+      
+      // Convert relative URLs to absolute if baseUrl is provided
+      if (baseUrl && isRelativeUrl(href)) {
+        try {
+          const url = new URL(href, baseUrl);
+          href = url.toString();
+        } catch (err) {
+          // If URL construction fails, keep the original href
+        }
+      }
+      
       const title = node.title ? ` "${node.title}"` : "";
       return `[${content.trim()}](${href}${title})\n`;
     },
@@ -30,6 +43,10 @@ export async function parseMarkdown(
   }
 }
 
+function isRelativeUrl(url: string): boolean {
+  return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:');
+}
+
 // ---------------------------------------------
 // Helpers
 // ---------------------------------------------

From 9257b1564e24d3d81a8877e3da0fe3cdcb18e815 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 04:22:06 +0530
Subject: [PATCH 35/79] feat: pass url param

---
 server/src/markdownify/scrape.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
index 39d1fa3a3..b58265a24 100644
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -52,6 +52,6 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
   await browser.close();
 
   // Convert cleaned HTML → Markdown
-  const markdown = await parseMarkdown(cleanedHtml || "");
+  const markdown = await parseMarkdown(cleanedHtml, url);
   return markdown;
 }

From d1f13cf10ef8d12e206ccb1e9cff85f7fbe96d77 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:17:09 +0530
Subject: [PATCH 36/79] feat: add robot markdown creation section ui

---
 src/components/robot/pages/RobotCreate.tsx   | 144 ++++++++++++++++++-
 src/components/robot/pages/RobotEditPage.tsx |   7 +-
 2 files changed, 140 insertions(+), 11 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 70058642c..0e76fac44 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -13,21 +13,47 @@ import {
   Card,
   CircularProgress,
   Container,
-  CardContent
+  CardContent,
+  Tabs,
+  Tab
 } from '@mui/material';
-import { ArrowBack, PlayCircleOutline, Article } from '@mui/icons-material';
+import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
 import { useGlobalInfoStore } from '../../../context/globalInfo';
 import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
 import { AuthContext } from '../../../context/auth';
 import { GenericModal } from '../../ui/GenericModal';
 
 
+interface TabPanelProps {
+  children?: React.ReactNode;
+  index: number;
+  value: number;
+}
+
+function TabPanel(props: TabPanelProps) {
+  const { children, value, index, ...other } = props;
+
+  return (
+    <div
+      role="tabpanel"
+      hidden={value !== index}
+      id={`robot-tabpanel-${index}`}
+      aria-labelledby={`robot-tab-${index}`}
+      {...other}
+    >
+      {value === index && <Box>{children}</Box>}
+    </div>
+  );
+}
+
 const RobotCreate: React.FC = () => {
   const { t } = useTranslation();
   const navigate = useNavigate();
-  const { setBrowserId, setRecordingUrl, notify, setRecordingId } = useGlobalInfoStore();
+  const { setBrowserId, setRecordingUrl, notify, setRecordingId, setRerenderRobots } = useGlobalInfoStore();
 
+  const [tabValue, setTabValue] = useState(0);
   const [url, setUrl] = useState('');
+  const [markdownRobotName, setMarkdownRobotName] = useState('');
   const [needsLogin, setNeedsLogin] = useState(false);
   const [isLoading, setIsLoading] = useState(false);
   const [isWarningModalOpen, setWarningModalOpen] = useState(false);
@@ -36,6 +62,10 @@ const RobotCreate: React.FC = () => {
   const { state } = React.useContext(AuthContext);
   const { user } = state;
 
+  const handleTabChange = (event: React.SyntheticEvent, newValue: number) => {
+    setTabValue(newValue);
+  };
+
 
   const handleStartRecording = async () => {
     if (!url.trim()) {
@@ -146,11 +176,31 @@ const RobotCreate: React.FC = () => {
             <ArrowBack />
           </IconButton>
           <Typography variant="h5" component="h1">
-            New Data Extraction Robot
+            Create New Robot
           </Typography>
         </Box>
 
-        <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
+        <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 3 }}>
+          <Tabs value={tabValue} onChange={handleTabChange} aria-label="robot type tabs">
+            <Tab
+              icon={<Code />}
+              iconPosition="start"
+              label="Data Extraction Robot"
+              id="robot-tab-0"
+              aria-controls="robot-tabpanel-0"
+            />
+            <Tab
+              icon={<Description />}
+              iconPosition="start"
+              label="Markdown Robot"
+              id="robot-tab-1"
+              aria-controls="robot-tabpanel-1"
+            />
+          </Tabs>
+        </Box>
+
+        <TabPanel value={tabValue} index={0}>
+          <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
           <Box display="flex" flexDirection="column" alignItems="center">
             {/* Logo (kept as original) */}
             <img
@@ -295,6 +345,90 @@ const RobotCreate: React.FC = () => {
             </Grid>
           </Grid>
         </Box>
+        </TabPanel>
+
+        <TabPanel value={tabValue} index={1}>
+          <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
+            <Box display="flex" flexDirection="column" alignItems="center">
+              <img
+                src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
+                width={73}
+                height={65}
+                style={{
+                  borderRadius: '5px',
+                  marginBottom: '30px'
+                }}
+                alt="Maxun Logo"
+              />
+
+              <Typography variant="h6" gutterBottom>
+                Create Markdown Robot
+              </Typography>
+              <Typography variant="body2" color="text.secondary" mb={3}>
+                Convert any webpage to clean markdown format
+              </Typography>
+
+              <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
+                <TextField
+                  placeholder="Example: My Blog Article Robot"
+                  variant="outlined"
+                  fullWidth
+                  value={markdownRobotName}
+                  onChange={(e) => setMarkdownRobotName(e.target.value)}
+                  label="Robot Name"
+                  sx={{ mb: 2 }}
+                />
+                <TextField
+                  placeholder="Example: https://example.com/blog/article"
+                  variant="outlined"
+                  fullWidth
+                  value={url}
+                  onChange={(e) => setUrl(e.target.value)}
+                  label="URL to convert"
+                />
+              </Box>
+
+              <Button
+                variant="contained"
+                fullWidth
+                onClick={async () => {
+                  if (!url.trim()) {
+                    notify('error', 'Please enter a valid URL');
+                    return;
+                  }
+                  if (!markdownRobotName.trim()) {
+                    notify('error', 'Please enter a robot name');
+                    return;
+                  }
+                  setIsLoading(true);
+                  const { createMarkdownRobot } = await import('../../../api/storage');
+                  const result = await createMarkdownRobot(url, markdownRobotName);
+                  setIsLoading(false);
+
+                  if (result) {
+                    setRerenderRobots(true);
+                    notify('success', `${markdownRobotName} created successfully!`);
+                    navigate('/robots');
+                  } else {
+                    notify('error', 'Failed to create markdown robot');
+                  }
+                }}
+                disabled={!url.trim() || !markdownRobotName.trim() || isLoading}
+                sx={{
+                  bgcolor: '#ff00c3',
+                  py: 1.4,
+                  fontSize: '1rem',
+                  textTransform: 'none',
+                  maxWidth: 700,
+                  borderRadius: 2
+                }}
+                startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
+              >
+                {isLoading ? 'Creating...' : 'Create Markdown Robot'}
+              </Button>
+            </Box>
+          </Card>
+        </TabPanel>
       </Box>
 
 
diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx
index 80671c1fb..d5e7cb2d5 100644
--- a/src/components/robot/pages/RobotEditPage.tsx
+++ b/src/components/robot/pages/RobotEditPage.tsx
@@ -795,11 +795,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
     navigate(basePath);
   };
 
-  const lastPair =
-    robot?.recording.workflow[robot?.recording.workflow.length - 1];
-  const targetUrl = lastPair?.what.find((action) => action.action === "goto")
-    ?.args?.[0];
-
   return (
     <RobotConfigPage
       title={t("robot_edit.title")}
@@ -826,7 +821,7 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
               <TextField
                 label={t("robot_duplication.fields.target_url")}
                 key={t("robot_duplication.fields.target_url")}
-                value={targetUrl || ""}
+                value={getTargetUrl() || ""}
                 onChange={(e) => handleTargetUrlChange(e.target.value)}
                 style={{ marginBottom: "20px" }}
               />

From c1373d8ca138f74a9506c24b19fc08ea7636865b Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:18:01 +0530
Subject: [PATCH 37/79] feat: display separate field md content

---
 src/components/run/RunContent.tsx | 87 +++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/components/run/RunContent.tsx b/src/components/run/RunContent.tsx
index 2cc1bb861..3a676a003 100644
--- a/src/components/run/RunContent.tsx
+++ b/src/components/run/RunContent.tsx
@@ -37,6 +37,7 @@ interface RunContentProps {
 export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => {
   const { t } = useTranslation();
   const [tab, setTab] = React.useState<string>('output');
+  const [markdownContent, setMarkdownContent] = useState<string>('');
 
   const [schemaData, setSchemaData] = useState<any[]>([]);
   const [schemaColumns, setSchemaColumns] = useState<string[]>([]);
@@ -63,6 +64,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
     setTab(tab);
   }, [interpretationInProgress]);
 
+  useEffect(() => {
+    if (row.serializableOutput?.markdown && Array.isArray(row.serializableOutput.markdown)) {
+      const markdownData = row.serializableOutput.markdown[0];
+      if (markdownData && markdownData.content) {
+        setMarkdownContent(markdownData.content);
+      }
+    }
+  }, [row.serializableOutput]);
+
   useEffect(() => {
     if (row.status === 'running' || row.status === 'queued' || row.status === 'scheduled') {
       setSchemaData([]);
@@ -374,6 +384,22 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
     }, 100);
   };
 
+  const downloadMarkdown = (content: string, filename: string) => {
+    const blob = new Blob([content], { type: 'text/markdown;charset=utf-8;' });
+    const url = URL.createObjectURL(blob);
+
+    const link = document.createElement("a");
+    link.href = url;
+    link.setAttribute("download", filename);
+    document.body.appendChild(link);
+    link.click();
+    document.body.removeChild(link);
+
+    setTimeout(() => {
+      URL.revokeObjectURL(url);
+    }, 100);
+  };
+
 
   const renderDataTable = (
     data: any[],
@@ -636,11 +662,70 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
 
   const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0;
   const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0;
+  const hasMarkdown = markdownContent.length > 0;
 
   return (
     <Box sx={{ width: '100%' }}>
       <TabContext value={tab}>
         <TabPanel value='output' sx={{ width: '100%', maxWidth: '900px' }}>
+          {hasMarkdown ? (
+            <Box>
+              <Accordion defaultExpanded sx={{ mb: 2 }}>
+                <AccordionSummary expandIcon={<ExpandMoreIcon />}>
+                  <Box sx={{ display: 'flex', alignItems: 'center' }}>
+                    <Typography variant='h6'>
+                      Markdown Output
+                    </Typography>
+                  </Box>
+                </AccordionSummary>
+                <AccordionDetails>
+                  <Paper
+                    sx={{
+                      p: 2,
+                      maxHeight: '500px',
+                      overflow: 'auto',
+                      backgroundColor: (theme) => theme.palette.mode === 'dark' ? '#1e1e1e' : '#f5f5f5'
+                    }}
+                  >
+                    <Typography
+                      component="pre"
+                      sx={{
+                        whiteSpace: 'pre-wrap',
+                        wordBreak: 'break-word',
+                        fontFamily: 'monospace',
+                        fontSize: '0.875rem'
+                      }}
+                    >
+                      {markdownContent}
+                    </Typography>
+                  </Paper>
+                  <Box sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', mb: 2, mt: 2 }}>
+                    <Box>
+                      <Button
+                        component="a"
+                        onClick={() => downloadMarkdown(markdownContent, 'output.md')}
+                        sx={{
+                          color: '#FF00C3',
+                          textTransform: 'none',
+                          p: 0,
+                          minWidth: 'auto',
+                          backgroundColor: 'transparent',
+                          '&:hover': {
+                            backgroundColor: 'transparent',
+                            textDecoration: 'underline',
+                          },
+                        }}
+                      >
+                        Download Markdown
+                      </Button>
+                    </Box>
+                  </Box>
+                </AccordionDetails>
+              </Accordion>
+            </Box>
+          ) : (
+            // Traditional robot output
+            <>
           {row.status === 'running' || row.status === 'queued' ? (
             <>
               <Box sx={{ display: 'flex', alignItems: 'center', mb: 2 }}>
@@ -939,6 +1024,8 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
               </AccordionDetails>
             </Accordion>
           )}
+          </>
+          )}
         </TabPanel>
       </TabContext>
     </Box>

From 0d45d1d7f1f4bbc4ee0e2eedd832797a050a86cf Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:19:12 +0530
Subject: [PATCH 38/79] feat: markdownify manual, scheduled, api runs

---
 server/src/api/record.ts                      | 110 ++++++++++++++++-
 server/src/pgboss-worker.ts                   |  95 +++++++++++++-
 .../workflow-management/scheduler/index.ts    | 116 +++++++++++++++++-
 3 files changed, 316 insertions(+), 5 deletions(-)

diff --git a/server/src/api/record.ts b/server/src/api/record.ts
index 29d1f2615..f55e2b3fb 100644
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -344,7 +344,8 @@ function formatRunResponse(run: any) {
         runByAPI: run.runByAPI,
         data: {
             textData: {},
-            listData: {}
+            listData: {},
+            markdown: ''
         },
         screenshots: [] as any[],
     };
@@ -359,6 +360,10 @@ function formatRunResponse(run: any) {
         formattedRun.data.listData = output.scrapeList;
     }
 
+    if (output.markdown && Array.isArray(output.markdown)) {
+        formattedRun.data.markdown = output.markdown[0]?.content || '';
+    }
+
     if (run.binaryOutput) {
         Object.keys(run.binaryOutput).forEach(key => {
             if (run.binaryOutput[key]) {
@@ -651,6 +656,106 @@ async function executeRun(id: string, userId: string) {
             };
         }
 
+        if (recording.recording_meta.type === 'markdown') {
+            logger.log('info', `Executing markdown robot for API run ${id}`);
+
+            await run.update({
+                status: 'running',
+                log: 'Converting page to markdown'
+            });
+
+            try {
+                const { convertPageToMarkdown } = await import('../markdownify/scrape');
+                const url = recording.recording_meta.url;
+
+                if (!url) {
+                    throw new Error('No URL specified for markdown robot');
+                }
+
+                const markdown = await convertPageToMarkdown(url);
+
+                await run.update({
+                    status: 'success',
+                    finishedAt: new Date().toLocaleString(),
+                    log: 'Markdown conversion completed successfully',
+                    serializableOutput: {
+                        markdown: [{ content: markdown }]
+                    },
+                    binaryOutput: {},
+                });
+
+                logger.log('info', `Markdown robot execution completed for API run ${id}`);
+
+                try {
+                    const completionData = {
+                        runId: plainRun.runId,
+                        robotMetaId: plainRun.robotMetaId,
+                        robotName: recording.recording_meta.name,
+                        status: 'success',
+                        finishedAt: new Date().toLocaleString()
+                    };
+
+                    serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
+                } catch (socketError: any) {
+                    logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
+                }
+
+                const webhookPayload = {
+                    robot_id: plainRun.robotMetaId,
+                    run_id: plainRun.runId,
+                    robot_name: recording.recording_meta.name,
+                    status: 'success',
+                    started_at: plainRun.startedAt,
+                    finished_at: new Date().toLocaleString(),
+                    markdown: markdown,
+                    metadata: {
+                        browser_id: plainRun.browserId,
+                        user_id: userId,
+                    }
+                };
+
+                try {
+                    await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+                    logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
+                } catch (webhookError: any) {
+                    logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
+                }
+
+                await destroyRemoteBrowser(plainRun.browserId, userId);
+
+                return {
+                    success: true,
+                    interpretationInfo: run.toJSON()
+                };
+            } catch (error: any) {
+                logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);
+
+                await run.update({
+                    status: 'failed',
+                    finishedAt: new Date().toLocaleString(),
+                    log: `Markdown conversion failed: ${error.message}`,
+                });
+
+                try {
+                    const failureData = {
+                        runId: plainRun.runId,
+                        robotMetaId: plainRun.robotMetaId,
+                        robotName: recording.recording_meta.name,
+                        status: 'failed',
+                        finishedAt: new Date().toLocaleString()
+                    };
+
+                    serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
+                } catch (socketError: any) {
+                    logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
+                }
+
+                await destroyRemoteBrowser(plainRun.browserId, userId);
+
+                throw error;
+            }
+        }
+
         plainRun.status = 'running';
 
         browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -889,12 +994,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
         if (!run) throw new Error('Run not found');
 
         if (run.status === 'success') {
-            return run.toJSON();
+            return run;
         } else if (run.status === 'failed') {
             throw new Error('Run failed');
         }
 
-        // Wait for the next polling interval
         await new Promise(resolve => setTimeout(resolve, interval));
     }
 }
diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts
index b9f411008..0fcd7f650 100644
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -187,7 +187,110 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
       if (!recording) {
         throw new Error(`Recording for run ${data.runId} not found`);
       }
-      
+
+      // Markdown robots: convert the robot's URL to markdown, store it on the run,
+      // notify sockets and webhooks, then return (or rethrow on failure) so the
+      // rest of this function is skipped.
+      if (recording.recording_meta.type === 'markdown') {
+        logger.log('info', `Executing markdown robot for run ${data.runId}`);
+
+        await run.update({
+          status: 'running',
+          log: 'Converting page to markdown'
+        });
+
+        try {
+          const { convertPageToMarkdown } = await import('./markdownify/scrape');
+          const url = recording.recording_meta.url;
+
+          if (!url) {
+            throw new Error('No URL specified for markdown robot');
+          }
+
+          const markdown = await convertPageToMarkdown(url);
+
+          await run.update({
+            status: 'success',
+            finishedAt: new Date().toLocaleString(),
+            log: 'Markdown conversion completed successfully',
+            serializableOutput: {
+              markdown: [{ content: markdown }]
+            },
+            binaryOutput: {},
+          });
+
+          logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
+
+          try {
+            const completionData = {
+              runId: data.runId,
+              robotMetaId: plainRun.robotMetaId,
+              robotName: recording.recording_meta.name,
+              status: 'success',
+              finishedAt: new Date().toLocaleString()
+            };
+
+            serverIo.of(browserId).emit('run-completed', completionData);
+            serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
+          } catch (socketError: any) {
+            logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
+          }
+
+          try {
+            // Same snake_case schema as the markdown webhooks sent from api/record.ts
+            // and the scheduler, so consumers see one payload shape per event.
+            const webhookPayload = {
+              robot_id: plainRun.robotMetaId,
+              run_id: data.runId,
+              robot_name: recording.recording_meta.name,
+              status: 'success',
+              started_at: plainRun.startedAt,
+              finished_at: new Date().toLocaleString(),
+              markdown: markdown,
+              metadata: {
+                browser_id: browserId,
+                user_id: data.userId,
+              }
+            };
+            await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+            logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
+          } catch (webhookError: any) {
+            logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
+          }
+
+          await destroyRemoteBrowser(browserId, data.userId);
+
+          return { success: true };
+        } catch (error: any) {
+          logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);
+
+          await run.update({
+            status: 'failed',
+            finishedAt: new Date().toLocaleString(),
+            log: `Markdown conversion failed: ${error.message}`,
+          });
+
+          try {
+            const failureData = {
+              runId: data.runId,
+              robotMetaId: plainRun.robotMetaId,
+              robotName: recording.recording_meta.name,
+              status: 'failed',
+              finishedAt: new Date().toLocaleString()
+            };
+
+            serverIo.of(browserId).emit('run-completed', failureData);
+            serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
+          } catch (socketError: any) {
+            logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
+          }
+
+          await destroyRemoteBrowser(browserId, data.userId);
+
+          throw error;
+        }
+      }
+
       const isRunAborted = async (): Promise<boolean> => {
         try {
           const currentRun = await Run.findOne({ where: { runId: data.runId } });
diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts
index 899cb7f61..ba47b3e0e 100644
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -207,6 +207,120 @@ async function executeRun(id: string, userId: string) {
       }
     }
 
+    if (recording.recording_meta.type === 'markdown') {
+      logger.log('info', `Executing markdown robot for scheduled run ${id}`);
+
+      await run.update({
+        status: 'running',
+        log: 'Converting page to markdown'
+      });
+
+      try {
+        const runStartedData = {
+          runId: plainRun.runId,
+          robotMetaId: plainRun.robotMetaId,
+          robotName: recording.recording_meta.name,
+          status: 'running',
+          startedAt: plainRun.startedAt
+        };
+
+        serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
+        logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
+      } catch (socketError: any) {
+        logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
+      }
+
+      try {
+        const { convertPageToMarkdown } = await import('../../markdownify/scrape');
+        const url = recording.recording_meta.url;
+
+        if (!url) {
+          throw new Error('No URL specified for markdown robot');
+        }
+
+        const markdown = await convertPageToMarkdown(url);
+
+        await run.update({
+          status: 'success',
+          finishedAt: new Date().toLocaleString(),
+          log: 'Markdown conversion completed successfully',
+          serializableOutput: {
+            markdown: [{ content: markdown }]
+          },
+          binaryOutput: {},
+        });
+
+        logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
+
+        try {
+          const completionData = {
+            runId: plainRun.runId,
+            robotMetaId: plainRun.robotMetaId,
+            robotName: recording.recording_meta.name,
+            status: 'success',
+            finishedAt: new Date().toLocaleString()
+          };
+
+          serverIo.of(plainRun.browserId).emit('run-completed', completionData);
+          serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
+        } catch (socketError: any) {
+          logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
+        }
+
+        const webhookPayload = {
+          robot_id: plainRun.robotMetaId,
+          run_id: plainRun.runId,
+          robot_name: recording.recording_meta.name,
+          status: 'success',
+          started_at: plainRun.startedAt,
+          finished_at: new Date().toLocaleString(),
+          markdown: markdown,
+          metadata: {
+            browser_id: plainRun.browserId,
+            user_id: userId,
+          }
+        };
+
+        try {
+          await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+          logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
+        } catch (webhookError: any) {
+          logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
+        }
+
+        await destroyRemoteBrowser(plainRun.browserId, userId);
+
+        return true;
+      } catch (error: any) {
+        logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
+
+        await run.update({
+          status: 'failed',
+          finishedAt: new Date().toLocaleString(),
+          log: `Markdown conversion failed: ${error.message}`,
+        });
+
+        try {
+          const failureData = {
+            runId: plainRun.runId,
+            robotMetaId: plainRun.robotMetaId,
+            robotName: recording.recording_meta.name,
+            status: 'failed',
+            finishedAt: new Date().toLocaleString()
+          };
+
+          serverIo.of(plainRun.browserId).emit('run-completed', failureData);
+          serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
+        } catch (socketError: any) {
+          logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
+        }
+
+        await destroyRemoteBrowser(plainRun.browserId, userId);
+
+        throw error;
+      }
+    }
+
     plainRun.status = 'running';
 
     try {
@@ -217,7 +331,7 @@ async function executeRun(id: string, userId: string) {
         status: 'running',
         startedAt: plainRun.startedAt
       };
-      
+
       serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
       logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
     } catch (socketError: any) {

From b19e02f13775b4ceb71f6850f334d1ec95489915 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:22:54 +0530
Subject: [PATCH 39/79] feat: add markdown route

---
 server/src/routes/storage.ts | 81 +++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts
index 89872d6ae..ee23ee442 100644
--- a/server/src/routes/storage.ts
+++ b/server/src/routes/storage.ts
@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
     }
 
     if (targetUrl) {
+      robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
+
       const updatedWorkflow = [...robot.recording.workflow];
+      let foundGoto = false;
 
       for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
         const step = updatedWorkflow[i];
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
 
             robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
             robot.changed('recording', true);
+            foundGoto = true;
             i = -1;
             break;
           }
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
       }
     };
 
-    if (name) {
+    if (name || targetUrl) {
       updates.recording_meta = {
         ...robot.recording_meta,
-        name
+        ...(name && { name }),
+        ...(targetUrl && { url: targetUrl })
       };
     }
 
@@ -432,6 +437,88 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
   }
 });
 
+/**
+ * POST endpoint for creating a markdown robot.
+ *
+ * Body: `url` (required) and `name` (optional). Only absolute http(s) URLs are
+ * accepted: `new URL()` on its own also parses file:, data: and javascript: URLs,
+ * and the stored URL is later handed to convertPageToMarkdown when the robot runs.
+ */
+router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
+  try {
+    const { url, name } = req.body;
+
+    if (!url || typeof url !== 'string') {
+      return res.status(400).json({ error: 'The "url" field is required.' });
+    }
+
+    if (!req.user) {
+      return res.status(401).send({ error: 'Unauthorized' });
+    }
+
+    // Validate URL format
+    let parsedUrl: URL;
+    try {
+      parsedUrl = new URL(url);
+    } catch (err) {
+      return res.status(400).json({ error: 'Invalid URL format' });
+    }
+
+    // Reject non-web schemes before the URL is persisted
+    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
+      return res.status(400).json({ error: 'Only http and https URLs are supported' });
+    }
+
+    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
+    const currentTimestamp = new Date().toLocaleString();
+    const robotId = uuid();
+
+    const newRobot = await Robot.create({
+      id: uuid(),
+      userId: req.user.id,
+      recording_meta: {
+        name: robotName,
+        id: robotId,
+        createdAt: currentTimestamp,
+        updatedAt: currentTimestamp,
+        pairs: 0,
+        params: [],
+        type: 'markdown',
+        url: url,
+      },
+      recording: { workflow: [] },
+      google_sheet_email: null,
+      google_sheet_name: null,
+      google_sheet_id: null,
+      google_access_token: null,
+      google_refresh_token: null,
+      schedule: null,
+    });
+
+    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
+    capture(
+      'maxun-oss-markdown-robot-created',
+      {
+        robot_meta: newRobot.recording_meta,
+        url: url,
+      }
+    );
+
+    return res.status(201).json({
+      message: 'Markdown robot created successfully.',
+      robot: newRobot,
+    });
+  } catch (error) {
+    if (error instanceof Error) {
+      logger.log('error', `Error creating markdown robot: ${error.message}`);
+      return res.status(500).json({ error: error.message });
+    } else {
+      logger.log('error', 'Unknown error creating markdown robot');
+      return res.status(500).json({ error: 'An unknown error occurred.' });
+    }
+  }
+});
+
 /**
  * DELETE endpoint for deleting a recording from the storage.
  */

From 05d2d1b7fef11db4bb7691976dc63cba1adb54ad Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:25:43 +0530
Subject: [PATCH 40/79] feat: add optional type and url fields

---
 server/src/models/Robot.ts                        | 2 ++
 src/components/robot/pages/RobotDuplicatePage.tsx | 7 +------
 src/components/robot/pages/RobotEditPage.tsx      | 7 +------
 src/components/robot/pages/RobotSettingsPage.tsx  | 1 +
 src/context/globalInfo.tsx                        | 2 ++
 5 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts
index eae9438ec..5acbdf133 100644
--- a/server/src/models/Robot.ts
+++ b/server/src/models/Robot.ts
@@ -9,6 +9,8 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
+  type?: 'traditional' | 'markdown';
+  url?: string;
 }
 
 interface RobotWorkflow {
diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx
index b02cecdeb..7c45c8e83 100644
--- a/src/components/robot/pages/RobotDuplicatePage.tsx
+++ b/src/components/robot/pages/RobotDuplicatePage.tsx
@@ -24,12 +24,7 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: string;
-  description?: string;
-  usedByUsers?: number[];
-  subscriptionLevel?: number;
-  access?: string;
-  sample?: any[];
+  type?: 'traditional' | 'markdown';
   url?: string;
 }
 
diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx
index d5e7cb2d5..19b9e43b2 100644
--- a/src/components/robot/pages/RobotEditPage.tsx
+++ b/src/components/robot/pages/RobotEditPage.tsx
@@ -24,12 +24,7 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: string;
-  description?: string;
-  usedByUsers?: number[];
-  subscriptionLevel?: number;
-  access?: string;
-  sample?: any[];
+  type?: 'traditional' | 'markdown';
   url?: string;
 }
 
diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx
index 118329358..96b7d3ecf 100644
--- a/src/components/robot/pages/RobotSettingsPage.tsx
+++ b/src/components/robot/pages/RobotSettingsPage.tsx
@@ -16,6 +16,7 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
+  type?: 'traditional' | 'markdown';
   url?: string;
 }
 
diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx
index 69969a09c..a0c79622a 100644
--- a/src/context/globalInfo.tsx
+++ b/src/context/globalInfo.tsx
@@ -27,6 +27,8 @@ interface RobotMeta {
     pairs: number;
     updatedAt: string;
     params: any[];
+    type?: 'traditional' | 'markdown';
+    url?: string;
 }
 
 interface RobotWorkflow {

From d444756f673173f8e87e796963e320a6c82f41a8 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 13:33:10 +0530
Subject: [PATCH 41/79] chore: add static markdown import

---
 server/src/api/record.ts                          | 2 +-
 server/src/pgboss-worker.ts                       | 4 ++--
 server/src/workflow-management/scheduler/index.ts | 2 +-
 src/components/robot/pages/RobotCreate.tsx        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/server/src/api/record.ts b/server/src/api/record.ts
index f55e2b3fb..fd7376abc 100644
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
 import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
 import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
 import { sendWebhook } from "../routes/webhook";
+import { convertPageToMarkdown } from '../markdownify/scrape';
 
 chromium.use(stealthPlugin());
 
@@ -665,7 +666,6 @@ async function executeRun(id: string, userId: string) {
             });
 
             try {
-                const { convertPageToMarkdown } = await import('../markdownify/scrape');
                 const url = recording.recording_meta.url;
 
                 if (!url) {
diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts
index 0fcd7f650..b2d5bdb30 100644
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
 import { io as serverIo } from "./server";
 import { sendWebhook } from './routes/webhook';
 import { BinaryOutputService } from './storage/mino';
+import { convertPageToMarkdown } from './markdownify/scrape';
 
 if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
     throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -183,7 +184,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
     try {  
       // Find the recording
       const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
-      
+
       if (!recording) {
         throw new Error(`Recording for run ${data.runId} not found`);
       }
@@ -197,7 +198,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
         });
 
         try {
-          const { convertPageToMarkdown } = await import('./markdownify/scrape');
           const url = recording.recording_meta.url;
 
           if (!url) {
diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts
index ba47b3e0e..7c2cb4085 100644
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
 import { Page } from "playwright";
 import { sendWebhook } from "../../routes/webhook";
 import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
+import { convertPageToMarkdown } from "../../markdownify/scrape";
 chromium.use(stealthPlugin());
 
 async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -231,7 +232,6 @@ async function executeRun(id: string, userId: string) {
       }
 
       try {
-        const { convertPageToMarkdown } = await import('../../markdownify/scrape');
         const url = recording.recording_meta.url;
 
         if (!url) {
diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 0e76fac44..4bec52d88 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -20,6 +20,7 @@ import {
 import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
 import { useGlobalInfoStore } from '../../../context/globalInfo';
 import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
+import { createMarkdownRobot } from "../../../api/storage";
 import { AuthContext } from '../../../context/auth';
 import { GenericModal } from '../../ui/GenericModal';
 
@@ -401,7 +402,6 @@ const RobotCreate: React.FC = () => {
                     return;
                   }
                   setIsLoading(true);
-                  const { createMarkdownRobot } = await import('../../../api/storage');
                   const result = await createMarkdownRobot(url, markdownRobotName);
                   setIsLoading(false);
 

From ddcb3dfe4b9c99144b9b14cd0b197f52c3d81d6b Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 15:35:31 +0530
Subject: [PATCH 42/79] feat: extend turndown + clean

---
 server/src/markdownify/markdown.ts | 164 +++++++++++++++++++++++++----
 1 file changed, 141 insertions(+), 23 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index 9ee7c7205..e660679d9 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -4,36 +4,76 @@ export async function parseMarkdown(
 ): Promise<string> {
   const TurndownService = require("turndown");
   const { gfm } = require("joplin-turndown-plugin-gfm");
-  const { URL } = require('url');
+  const cheerio = require("cheerio");
+  const { URL } = require("url");
+
+  if (!html) return "";
+
+  const tidiedHtml = tidyHtml(html);
 
   const t = new TurndownService();
+
+  // Remove irrelevant tags 
+  const elementsToRemove = [
+    "meta",
+    "style",
+    "script",
+    "noscript",
+    "link",
+    "textarea",
+  ];
+
+  t.addRule("remove-irrelevant", {
+    filter: elementsToRemove,
+    replacement: () => "",
+  });
+
+  t.addRule("truncate-svg", {
+    filter: "svg",
+    replacement: () => "",
+  });
+
+  t.addRule("improved-paragraph", {
+    filter: "p",
+    replacement: (innerText: string) => {
+      const trimmed = innerText.trim();
+      if (!trimmed) return "";
+      return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
+    },
+  });
+
   t.addRule("inlineLink", {
     filter: (node: any, opts: any) =>
       opts.linkStyle === "inlined" &&
       node.nodeName === "A" &&
       node.getAttribute("href"),
+
     replacement: (content: string, node: any) => {
       let href = node.getAttribute("href").trim();
-      
-      // Convert relative URLs to absolute if baseUrl is provided
+
+      // Relative → absolute
       if (baseUrl && isRelativeUrl(href)) {
         try {
-          const url = new URL(href, baseUrl);
-          href = url.toString();
-        } catch (err) {
-          // If URL construction fails, keep the original href
-        }
+          const u = new URL(href, baseUrl);
+          href = u.toString();
+        } catch {}
       }
-      
-      const title = node.title ? ` "${node.title}"` : "";
+
+      // Clean URL
+      href = cleanUrl(href);
+
+      const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
       return `[${content.trim()}](${href}${title})\n`;
     },
   });
 
   t.use(gfm);
 
+  // ---------------------------------------------------
+  // Convert
+  // ---------------------------------------------------
   try {
-    let out = await t.turndown(html);
+    let out = await t.turndown(tidiedHtml);
     out = fixBrokenLinks(out);
     out = stripSkipLinks(out);
     return out;
@@ -43,13 +83,98 @@ export async function parseMarkdown(
   }
 }
 
-function isRelativeUrl(url: string): boolean {
-  return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:');
-}
-
 // ---------------------------------------------
 // Helpers
 // ---------------------------------------------
+function isRelativeUrl(url: string): boolean {
+  return !/^[a-z][a-z0-9+.-]*:/i.test(url); // relative = no leading URI scheme; "://" inside a path or query does not count
+}
+
+function cleanUrl(u: string): string {
+  try {
+    return u;
+  } catch {
+    return u;
+  }
+}
+
+// CODE 1: attribute cleaner
+function cleanAttribute(attr: string) {
+  return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
+}
+
+// ---------------------------------------------------------
+// CODE 1: Full tidyHtml cleaning logic (ported verbatim)
+// ---------------------------------------------------------
+function tidyHtml(html: string): string {
+  const cheerio = require("cheerio");
+  const $ = cheerio.load(html);
+
+  // Fix broken attributes
+  $("*").each(function (this: any) {
+    const element = $(this);
+    const attributes = Object.keys(this.attribs);
+
+    for (let i = 0; i < attributes.length; i++) {
+      let attr = attributes[i];
+      if (attr.includes('"')) {
+        element.remove();
+      }
+    }
+  });
+
+  const manuallyCleanedElements = [
+    "aside",
+    "embed",
+    "head",
+    "iframe",
+    "menu",
+    "object",
+    "script",
+    "applet",
+    "audio",
+    "canvas",
+    "map",
+    "svg",
+    "video",
+    "area",
+    "blink",
+    "datalist",
+    "dialog",
+    "frame",
+    "frameset",
+    "link",
+    "input",
+    "ins",
+    "legend",
+    "marquee",
+    "math",
+    "menuitem",
+    "nav",
+    "noscript",
+    "optgroup",
+    "output",
+    "param",
+    "progress",
+    "rp",
+    "rt",
+    "rtc",
+    "source",
+    "style",
+    "track",
+    "textarea",
+    "time",
+    "use",
+    "img",
+    "picture",
+    "figure",
+  ];
+
+  manuallyCleanedElements.forEach((tag) => $(tag).remove());
+  return $("body").html();
+}
+
+
 function fixBrokenLinks(md: string): string {
   let depth = 0;
   let result = "";
@@ -57,12 +182,7 @@ function fixBrokenLinks(md: string): string {
   for (const ch of md) {
     if (ch === "[") depth++;
     if (ch === "]") depth = Math.max(0, depth - 1);
-
-    if (depth > 0 && ch === "\n") {
-      result += "\\\n";
-    } else {
-      result += ch;
-    }
+    result += depth > 0 && ch === "\n" ? "\\\n" : ch;
   }
   return result;
 }
@@ -70,5 +190,3 @@ function fixBrokenLinks(md: string): string {
 function stripSkipLinks(md: string): string {
   return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
 }
-
-

From 8346c9637a694b97067ee6a7bf28d26869905956 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 15:37:26 +0530
Subject: [PATCH 43/79] chore: cleanup

---
 server/src/markdownify/markdown.ts | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index e660679d9..92551b51b 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -98,14 +98,10 @@ function cleanUrl(u: string): string {
   }
 }
 
-// CODE 1: attribute cleaner
 function cleanAttribute(attr: string) {
   return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
 }
 
-// ---------------------------------------------------------
-// CODE 1: Full tidyHtml cleaning logic (ported verbatim)
-// ---------------------------------------------------------
 function tidyHtml(html: string): string {
   const cheerio = require("cheerio");
   const $ = cheerio.load(html);

From 924d687e20fb20cd321f242bfa6c8a0666fc52b0 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 15:54:39 +0530
Subject: [PATCH 44/79] feat: add create markdown api

---
 src/api/storage.ts | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/api/storage.ts b/src/api/storage.ts
index b5dc32ded..ca4d975cf 100644
--- a/src/api/storage.ts
+++ b/src/api/storage.ts
@@ -28,6 +28,27 @@ export const getStoredRecordings = async (): Promise<string[] | null> => {
   }
 };
 
+export const createMarkdownRobot = async (url: string, name?: string): Promise<any> => {
+  try {
+    const response = await axios.post(`${apiUrl}/storage/recordings/markdown`, {
+      url,
+      name,
+    }, {
+      headers: { 'Content-Type': 'application/json' },
+      withCredentials: true
+    });
+
+    if (response.status === 201) {
+      return response.data;
+    } else {
+      throw new Error('Failed to create markdown robot');
+    }
+  } catch (error: any) {
+    console.error('Error creating markdown robot:', error);
+    return null;
+  }
+};
+
 export const updateRecording = async (id: string, data: { 
   name?: string; 
   limits?: Array<{pairIndex: number, actionIndex: number, argIndex: number, limit: number}>;

From e711326c0e17a809c6a195420cd14e1fd1508815 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:00:02 +0530
Subject: [PATCH 45/79] feat: rename "Data Extraction Robot" tab label to "Extract"

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 4bec52d88..de1ae6c06 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -186,7 +186,7 @@ const RobotCreate: React.FC = () => {
             <Tab
               icon={<Code />}
               iconPosition="start"
-              label="Data Extraction Robot"
+              label="Extract"
               id="robot-tab-0"
               aria-controls="robot-tabpanel-0"
             />

From 51a0c3a769fcb0a8300087d2b77c2287e5c68203 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:03:36 +0530
Subject: [PATCH 46/79] chore: remove icon

---
 src/components/robot/pages/RobotCreate.tsx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index de1ae6c06..40ef4cc17 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -184,8 +184,6 @@ const RobotCreate: React.FC = () => {
         <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 3 }}>
           <Tabs value={tabValue} onChange={handleTabChange} aria-label="robot type tabs">
             <Tab
-              icon={<Code />}
-              iconPosition="start"
               label="Extract"
               id="robot-tab-0"
               aria-controls="robot-tabpanel-0"

From 672a1822cb110e777ecadc55ba068fd78adb7150 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:03:55 +0530
Subject: [PATCH 47/79] feat: rename extract tab id and aria-controls to extract-robot

---
 src/components/robot/pages/RobotCreate.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 40ef4cc17..359fc7006 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -185,8 +185,8 @@ const RobotCreate: React.FC = () => {
           <Tabs value={tabValue} onChange={handleTabChange} aria-label="robot type tabs">
             <Tab
               label="Extract"
-              id="robot-tab-0"
-              aria-controls="robot-tabpanel-0"
+              id="extract-robot"
+              aria-controls="extract-robot"
             />
             <Tab
               icon={<Description />}

From d0b8d0c6d77db1a61ac108e5e0a9a132a6fa37d1 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:05:02 +0530
Subject: [PATCH 48/79] chore: remove icon

---
 src/components/robot/pages/RobotCreate.tsx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 359fc7006..9fb927126 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -189,8 +189,6 @@ const RobotCreate: React.FC = () => {
               aria-controls="extract-robot"
             />
             <Tab
-              icon={<Description />}
-              iconPosition="start"
               label="Markdown Robot"
               id="robot-tab-1"
               aria-controls="robot-tabpanel-1"

From 53bf9eb09234787538e97bb2ad345e85a4c72b6e Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:07:30 +0530
Subject: [PATCH 49/79] feat: rename markdown tab id and aria-controls to scrape-robot

---
 src/components/robot/pages/RobotCreate.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 9fb927126..bc38ca0b5 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -190,8 +190,8 @@ const RobotCreate: React.FC = () => {
             />
             <Tab
               label="Markdown Robot"
-              id="robot-tab-1"
-              aria-controls="robot-tabpanel-1"
+              id="scrape-robot"
+              aria-controls="scrape-robot"
             />
           </Tabs>
         </Box>

From 8428314a2cf6342a9630833d09bccfb9f3ff012d Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:08:05 +0530
Subject: [PATCH 50/79] feat: rename "Markdown Robot" tab label to "Scrape"

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index bc38ca0b5..3f3c88f9b 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -189,7 +189,7 @@ const RobotCreate: React.FC = () => {
               aria-controls="extract-robot"
             />
             <Tab
-              label="Markdown Robot"
+              label="Scrape"
               id="scrape-robot"
               aria-controls="scrape-robot"
             />

From 6de6c3b04294dfdfad67311a00197390f046e507 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:08:51 +0530
Subject: [PATCH 51/79] feat: remove header

---
 src/components/robot/pages/RobotCreate.tsx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 3f3c88f9b..c75760021 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -358,9 +358,6 @@ const RobotCreate: React.FC = () => {
                 alt="Maxun Logo"
               />
 
-              <Typography variant="h6" gutterBottom>
-                Create Markdown Robot
-              </Typography>
               <Typography variant="body2" color="text.secondary" mb={3}>
                 Convert any webpage to clean markdown format
               </Typography>

From ef4311606673a2590cc1b302630c0ce9860381c5 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:10:05 +0530
Subject: [PATCH 52/79] feat: update markdown tab tagline copy

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index c75760021..98feb5fe6 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -359,7 +359,7 @@ const RobotCreate: React.FC = () => {
               />
 
               <Typography variant="body2" color="text.secondary" mb={3}>
-                Convert any webpage to clean markdown format
+                Turn websites into LLM-ready Markdown content for AI apps.
               </Typography>
 
               <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>

From f745089d9ac84228d70ad035c02c811e2eea4f38 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:17:27 +0530
Subject: [PATCH 53/79] feat: add extract tagline and update markdown form placeholders and labels

---
 src/components/robot/pages/RobotCreate.tsx | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 98feb5fe6..b014586cc 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -211,6 +211,10 @@ const RobotCreate: React.FC = () => {
               alt="Maxun Logo"
             />
 
+              <Typography variant="body2" color="text.secondary" mb={3}>
+                Extract structured data from websites in a few clicks.
+              </Typography>
+
             {/* Origin URL Input */}
             <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
               <TextField
@@ -364,21 +368,20 @@ const RobotCreate: React.FC = () => {
 
               <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
                 <TextField
-                  placeholder="Example: My Blog Article Robot"
+                  placeholder="Example: YC Companies Scraper"
                   variant="outlined"
                   fullWidth
                   value={markdownRobotName}
                   onChange={(e) => setMarkdownRobotName(e.target.value)}
-                  label="Robot Name"
                   sx={{ mb: 2 }}
                 />
                 <TextField
-                  placeholder="Example: https://example.com/blog/article"
+                  placeholder="Example: https://www.ycombinator.com/companies/"
                   variant="outlined"
                   fullWidth
                   value={url}
                   onChange={(e) => setUrl(e.target.value)}
-                  label="URL to convert"
+                  label="Website URL"
                 />
               </Box>
 

From 81d69a44c1a861643f4779e28a71635730b3645f Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:17:45 +0530
Subject: [PATCH 54/79] chore: lint

---
 src/components/robot/pages/RobotCreate.tsx | 260 ++++++++++-----------
 1 file changed, 129 insertions(+), 131 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index b014586cc..5cd4e74a6 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -198,154 +198,152 @@ const RobotCreate: React.FC = () => {
 
         <TabPanel value={tabValue} index={0}>
           <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
-          <Box display="flex" flexDirection="column" alignItems="center">
-            {/* Logo (kept as original) */}
-            <img
-              src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
-              width={73}
-              height={65}
-              style={{
-                borderRadius: '5px',
-                marginBottom: '30px'
-              }}
-              alt="Maxun Logo"
-            />
+            <Box display="flex" flexDirection="column" alignItems="center">
+              {/* Logo (kept as original) */}
+              <img
+                src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
+                width={73}
+                height={65}
+                style={{
+                  borderRadius: '5px',
+                  marginBottom: '30px'
+                }}
+                alt="Maxun Logo"
+              />
 
               <Typography variant="body2" color="text.secondary" mb={3}>
                 Extract structured data from websites in a few clicks.
               </Typography>
 
-            {/* Origin URL Input */}
-            <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
-              <TextField
-                placeholder="Example: https://www.ycombinator.com/companies/"
-                variant="outlined"
-                fullWidth
-                value={url}
-                onChange={(e) => setUrl(e.target.value)}
-              />
-            </Box>
-
-            {/* Checkbox */}
-            <Box sx={{ width: '100%', maxWidth: 700, mb: 3, textAlign: 'left' }}>
-              <FormControlLabel
-                control={
-                  <Checkbox
-                    checked={needsLogin}
-                    onChange={(e) => setNeedsLogin(e.target.checked)}
-                    color="primary"
-                  />
-                }
-                label="This website needs logging in."
-              />
-            </Box>
-
-            {/* Button */}
-            <Button
-              variant="contained"
-              fullWidth
-              onClick={handleStartRecording}
-              disabled={!url.trim() || isLoading}
-              sx={{
-                bgcolor: '#ff00c3',
-                py: 1.4,
-                fontSize: '1rem',
-                textTransform: 'none',
-                maxWidth: 700,
-                borderRadius: 2
-              }}
-              startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
-            >
-              {isLoading ? 'Starting...' : 'Start Recording'}
-            </Button>
-          </Box>
-        </Card>
-
-
-
-        <Box mt={6} textAlign="center">
-          <Typography variant="h6" gutterBottom>
-            First time creating a robot?
-          </Typography>
-          <Typography variant="body2" color="text.secondary" mb={3}>
-            Get help and learn how to use Maxun effectively.
-          </Typography>
+              {/* Origin URL Input */}
+              <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
+                <TextField
+                  placeholder="Example: https://www.ycombinator.com/companies/"
+                  variant="outlined"
+                  fullWidth
+                  value={url}
+                  onChange={(e) => setUrl(e.target.value)}
+                />
+              </Box>
 
-          <Grid container spacing={3} justifyContent="center">
+              {/* Checkbox */}
+              <Box sx={{ width: '100%', maxWidth: 700, mb: 3, textAlign: 'left' }}>
+                <FormControlLabel
+                  control={
+                    <Checkbox
+                      checked={needsLogin}
+                      onChange={(e) => setNeedsLogin(e.target.checked)}
+                      color="primary"
+                    />
+                  }
+                  label="This website needs logging in."
+                />
+              </Box>
 
-            {/* YouTube Tutorials */}
-            <Grid item xs={12} sm={6} md={4}>
-              <Card
+              {/* Button */}
+              <Button
+                variant="contained"
+                fullWidth
+                onClick={handleStartRecording}
+                disabled={!url.trim() || isLoading}
                 sx={{
-                  height: 140,
-                  cursor: "pointer",
+                  bgcolor: '#ff00c3',
+                  py: 1.4,
+                  fontSize: '1rem',
+                  textTransform: 'none',
+                  maxWidth: 700,
+                  borderRadius: 2
                 }}
-                onClick={() => window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")}
+                startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
               >
-                <CardContent
+                {isLoading ? 'Starting...' : 'Start Recording'}
+              </Button>
+            </Box>
+          </Card>
+
+          <Box mt={6} textAlign="center">
+            <Typography variant="h6" gutterBottom>
+              First time creating a robot?
+            </Typography>
+            <Typography variant="body2" color="text.secondary" mb={3}>
+              Get help and learn how to use Maxun effectively.
+            </Typography>
+
+            <Grid container spacing={3} justifyContent="center">
+
+              {/* YouTube Tutorials */}
+              <Grid item xs={12} sm={6} md={4}>
+                <Card
                   sx={{
-                    display: "flex",
-                    flexDirection: "column",
-                    alignItems: "center",
-                    justifyContent: "center", // center content
-                    height: "100%",
-                    textAlign: "center",
-                    p: 2,
-                    color: (theme) =>
-                      theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
+                    height: 140,
+                    cursor: "pointer",
                   }}
+                  onClick={() => window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")}
                 >
-                  <PlayCircleOutline sx={{ fontSize: "32px", mb: 2 }} />
-
-                  <Box sx={{ textAlign: "center" }}>
-                    <Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
-                      Video Tutorials
-                    </Typography>
-                    <Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
-                      Watch step-by-step guides
-                    </Typography>
-                  </Box>
-                </CardContent>
-              </Card>
-            </Grid>
-
-            {/* Documentation */}
-            <Grid item xs={12} sm={6} md={4}>
-              <Card
-                sx={{
-                  height: 140,
-                  cursor: "pointer",
-                }}
-                onClick={() => window.open("https://docs.maxun.dev", "_blank")}
-              >
-                <CardContent
+                  <CardContent
+                    sx={{
+                      display: "flex",
+                      flexDirection: "column",
+                      alignItems: "center",
+                      justifyContent: "center", // center content
+                      height: "100%",
+                      textAlign: "center",
+                      p: 2,
+                      color: (theme) =>
+                        theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
+                    }}
+                  >
+                    <PlayCircleOutline sx={{ fontSize: "32px", mb: 2 }} />
+
+                    <Box sx={{ textAlign: "center" }}>
+                      <Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
+                        Video Tutorials
+                      </Typography>
+                      <Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
+                        Watch step-by-step guides
+                      </Typography>
+                    </Box>
+                  </CardContent>
+                </Card>
+              </Grid>
+
+              {/* Documentation */}
+              <Grid item xs={12} sm={6} md={4}>
+                <Card
                   sx={{
-                    display: "flex",
-                    flexDirection: "column",
-                    alignItems: "center",
-                    justifyContent: "center", // center everything
-                    height: "100%",
-                    textAlign: "center",
-                    p: 2,
-                    color: (theme) =>
-                      theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
+                    height: 140,
+                    cursor: "pointer",
                   }}
+                  onClick={() => window.open("https://docs.maxun.dev", "_blank")}
                 >
-                  <Article sx={{ fontSize: "32px", mb: 2 }} />
-
-                  <Box sx={{ textAlign: "center" }}>
-                    <Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
-                      Documentation
-                    </Typography>
-                    <Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
-                      Explore detailed guides
-                    </Typography>
-                  </Box>
-                </CardContent>
-              </Card>
+                  <CardContent
+                    sx={{
+                      display: "flex",
+                      flexDirection: "column",
+                      alignItems: "center",
+                      justifyContent: "center", // center everything
+                      height: "100%",
+                      textAlign: "center",
+                      p: 2,
+                      color: (theme) =>
+                        theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
+                    }}
+                  >
+                    <Article sx={{ fontSize: "32px", mb: 2 }} />
+
+                    <Box sx={{ textAlign: "center" }}>
+                      <Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
+                        Documentation
+                      </Typography>
+                      <Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
+                        Explore detailed guides
+                      </Typography>
+                    </Box>
+                  </CardContent>
+                </Card>
+              </Grid>
             </Grid>
-          </Grid>
-        </Box>
+          </Box>
         </TabPanel>
 
         <TabPanel value={tabValue} index={1}>

From eb86b6eb175269553c4fd0166675a7793c51e968 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:18:17 +0530
Subject: [PATCH 55/79] feat: restore "Robot Name" label on markdown form

---
 src/components/robot/pages/RobotCreate.tsx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 5cd4e74a6..62b7c9e61 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -372,6 +372,7 @@ const RobotCreate: React.FC = () => {
                   value={markdownRobotName}
                   onChange={(e) => setMarkdownRobotName(e.target.value)}
                   sx={{ mb: 2 }}
+                  label="Robot Name"
                 />
                 <TextField
                   placeholder="Example: https://www.ycombinator.com/companies/"

From febc6c119e5a55d4199aee006dc2e1c9887da8cc Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:21:09 +0530
Subject: [PATCH 56/79] feat: rename "Scrape" tab to "Markdown"

---
 src/components/robot/pages/RobotCreate.tsx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 62b7c9e61..50f1d9ed1 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -189,9 +189,9 @@ const RobotCreate: React.FC = () => {
               aria-controls="extract-robot"
             />
             <Tab
-              label="Scrape"
-              id="scrape-robot"
-              aria-controls="scrape-robot"
+              label="Markdown"
+              id="markdown-robot"
+              aria-controls="markdown-robot"
             />
           </Tabs>
         </Box>

From dbb6c8728978f45ae6adf553b8d616de3f50fc80 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:24:19 +0530
Subject: [PATCH 57/79] feat: change mui default tabs

---
 src/components/robot/pages/RobotCreate.tsx | 37 ++++++++++++++--------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 50f1d9ed1..0cc1491c7 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -181,20 +181,29 @@ const RobotCreate: React.FC = () => {
           </Typography>
         </Box>
 
-        <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 3 }}>
-          <Tabs value={tabValue} onChange={handleTabChange} aria-label="robot type tabs">
-            <Tab
-              label="Extract"
-              id="extract-robot"
-              aria-controls="extract-robot"
-            />
-            <Tab
-              label="Markdown"
-              id="markdown-robot"
-              aria-controls="markdown-robot"
-            />
-          </Tabs>
-        </Box>
+        <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2 }}>
+  <Tabs
+    value={tabValue}
+    onChange={handleTabChange}
+    aria-label="robot type tabs"
+    sx={{
+      minHeight: 36,
+      '& .MuiTab-root': {
+        minHeight: 36,
+        paddingX: 2,
+        paddingY: 0.5,
+        minWidth: 0,
+      },
+      '& .MuiTabs-indicator': {
+        height: 2,
+      },
+    }}
+  >
+    <Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
+    <Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
+  </Tabs>
+</Box>
+
 
         <TabPanel value={tabValue} index={0}>
           <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>

From 606790e483a1a06f903ba6bb9147d345058ad00d Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:24:29 +0530
Subject: [PATCH 58/79] chore: lint

---
 src/components/robot/pages/RobotCreate.tsx | 42 +++++++++++-----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 0cc1491c7..014312e64 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -182,27 +182,27 @@ const RobotCreate: React.FC = () => {
         </Box>
 
         <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2 }}>
-  <Tabs
-    value={tabValue}
-    onChange={handleTabChange}
-    aria-label="robot type tabs"
-    sx={{
-      minHeight: 36,
-      '& .MuiTab-root': {
-        minHeight: 36,
-        paddingX: 2,
-        paddingY: 0.5,
-        minWidth: 0,
-      },
-      '& .MuiTabs-indicator': {
-        height: 2,
-      },
-    }}
-  >
-    <Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
-    <Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
-  </Tabs>
-</Box>
+          <Tabs
+            value={tabValue}
+            onChange={handleTabChange}
+            aria-label="robot type tabs"
+            sx={{
+              minHeight: 36,
+              '& .MuiTab-root': {
+                minHeight: 36,
+                paddingX: 2,
+                paddingY: 0.5,
+                minWidth: 0,
+              },
+              '& .MuiTabs-indicator': {
+                height: 2,
+              },
+            }}
+          >
+            <Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
+            <Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
+          </Tabs>
+        </Box>
 
 
         <TabPanel value={tabValue} index={0}>

From 3dac1a09fbb57b6d563ba1bda18fa2a88ba1517b Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:26:24 +0530
Subject: [PATCH 59/79] feat: increase tab vertical padding

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 014312e64..8863838ff 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -191,7 +191,7 @@ const RobotCreate: React.FC = () => {
               '& .MuiTab-root': {
                 minHeight: 36,
                 paddingX: 2,
-                paddingY: 0.5,
+                paddingY: 1.5,
                 minWidth: 0,
               },
               '& .MuiTabs-indicator': {

From 96019058e96196e80ce7516cc9f81ef5a152ff41 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:36:43 +0530
Subject: [PATCH 60/79] feat: turn to markdown

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 8863838ff..88daa49b4 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -428,7 +428,7 @@ const RobotCreate: React.FC = () => {
                 }}
                 startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
               >
-                {isLoading ? 'Creating...' : 'Create Markdown Robot'}
+                {isLoading ? 'Turning...' : 'Turn to Markdown'}
               </Button>
             </Box>
           </Card>

From 930c7b6c7490bbdaafab60f07cb56fa73417d2ba Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 16:56:43 +0530
Subject: [PATCH 61/79] fix: relax HTML element stripping in markdown conversion

---
 server/src/markdownify/markdown.ts | 83 ++++--------------------------
 1 file changed, 11 insertions(+), 72 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index 92551b51b..439a3a622 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -13,21 +13,6 @@ export async function parseMarkdown(
 
   const t = new TurndownService();
 
-  // Remove irrelevant tags 
-  const elementsToRemove = [
-    "meta",
-    "style",
-    "script",
-    "noscript",
-    "link",
-    "textarea",
-  ];
-
-  t.addRule("remove-irrelevant", {
-    filter: elementsToRemove,
-    replacement: () => "",
-  });
-
   t.addRule("truncate-svg", {
     filter: "svg",
     replacement: () => "",
@@ -106,64 +91,18 @@ function tidyHtml(html: string): string {
   const cheerio = require("cheerio");
   const $ = cheerio.load(html);
 
-  // Fix broken attributes
-  $("*").each(function (this: any) {
-    const element = $(this);
-    const attributes = Object.keys(this.attribs);
-
-    for (let i = 0; i < attributes.length; i++) {
-      let attr = attributes[i];
-      if (attr.includes('"')) {
-        element.remove();
-      }
-    }
-  });
-
   const manuallyCleanedElements = [
-    "aside",
-    "embed",
-    "head",
-    "iframe",
-    "menu",
-    "object",
-    "script",
-    "applet",
-    "audio",
-    "canvas",
-    "map",
-    "svg",
-    "video",
-    "area",
-    "blink",
-    "datalist",
-    "dialog",
-    "frame",
-    "frameset",
-    "link",
-    "input",
-    "ins",
-    "legend",
-    "marquee",
-    "math",
-    "menuitem",
-    "nav",
-    "noscript",
-    "optgroup",
-    "output",
-    "param",
-    "progress",
-    "rp",
-    "rt",
-    "rtc",
-    "source",
-    "style",
-    "track",
-    "textarea",
-    "time",
-    "use",
-    "img",
-    "picture",
-    "figure",
+  "script",
+  "style",
+  "iframe",
+  "noscript",
+  "meta",
+  "link",
+  "object",
+  "embed",
+  "canvas",
+  "audio",
+  "video"
   ];
 
   manuallyCleanedElements.forEach((tag) => $(tag).remove());

From 691dedc351d88f6daece4f19960ee1744cd172de Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:22:33 +0530
Subject: [PATCH 62/79] fix: lesser restrictions

---
 server/src/markdownify/markdown.ts | 91 ++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 29 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index 439a3a622..f50136f68 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -11,13 +11,34 @@ export async function parseMarkdown(
 
   const tidiedHtml = tidyHtml(html);
 
-  const t = new TurndownService();
+  const t = new TurndownService({
+    headingStyle: "atx", // ensures #### instead of ------
+    codeBlockStyle: "fenced",
+  });
+
+  // ---------------------------------------------
+  // Fix 1: Proper ATX headings #### instead of underline-style
+  // ---------------------------------------------
+  t.addRule("forceAtxHeadings", {
+    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
+    replacement: (content: string, node: any) => {
+      const level = Number(node.nodeName.charAt(1));
+      const clean = content.trim();
+      return `\n${"#".repeat(level)} ${clean}\n`;
+    },
+  });
 
+  // ---------------------------------------------
+  // Remove SVGs
+  // ---------------------------------------------
   t.addRule("truncate-svg", {
     filter: "svg",
     replacement: () => "",
   });
 
+  // ---------------------------------------------
+  // Improved paragraph cleanup
+  // ---------------------------------------------
   t.addRule("improved-paragraph", {
     filter: "p",
     replacement: (innerText: string) => {
@@ -27,16 +48,28 @@ export async function parseMarkdown(
     },
   });
 
+  // ---------------------------------------------
+  // Fix 2: Inline link with fallback text
+  // ---------------------------------------------
   t.addRule("inlineLink", {
     filter: (node: any, opts: any) =>
-      opts.linkStyle === "inlined" &&
-      node.nodeName === "A" &&
-      node.getAttribute("href"),
+      node.nodeName === "A" && node.getAttribute("href"),
 
     replacement: (content: string, node: any) => {
+      let text = content.trim();
+
+      // Fallback: aria-label → title → domain
+      if (!text) {
+        text =
+          node.getAttribute("aria-label")?.trim() ||
+          node.getAttribute("title")?.trim() ||
+          getDomainFromUrl(node.getAttribute("href")) ||
+          "link";
+      }
+
       let href = node.getAttribute("href").trim();
 
-      // Relative → absolute
+      // relative → absolute
       if (baseUrl && isRelativeUrl(href)) {
         try {
           const u = new URL(href, baseUrl);
@@ -44,45 +77,46 @@ export async function parseMarkdown(
         } catch {}
       }
 
-      // Clean URL
       href = cleanUrl(href);
 
-      const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
-      return `[${content.trim()}](${href}${title})\n`;
+      return `[${text}](${href})`;
     },
   });
 
   t.use(gfm);
 
-  // ---------------------------------------------------
-  // Convert
-  // ---------------------------------------------------
+  // Convert HTML → Markdown
   try {
     let out = await t.turndown(tidiedHtml);
     out = fixBrokenLinks(out);
     out = stripSkipLinks(out);
-    return out;
+    return out.trim();
   } catch (err) {
     console.error("HTML→Markdown failed", { err });
     return "";
   }
 }
 
-// ---------------------------------------------
+// -----------------------------------------------------
 // Helpers
-// ---------------------------------------------
+// -----------------------------------------------------
 function isRelativeUrl(url: string): boolean {
   return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
 }
 
-function cleanUrl(u: string): string {
+function getDomainFromUrl(url: string): string | null {
   try {
-    return u;
+    const u = new URL(url);
+    return u.hostname.replace(/^www\./, ""); // strip only a leading "www." label
   } catch {
-    return u;
+    return null;
   }
 }
 
+function cleanUrl(u: string): string {
+  return u;
+}
+
 function cleanAttribute(attr: string) {
   return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
 }
@@ -92,24 +126,23 @@ function tidyHtml(html: string): string {
   const $ = cheerio.load(html);
 
   const manuallyCleanedElements = [
-  "script",
-  "style",
-  "iframe",
-  "noscript",
-  "meta",
-  "link",
-  "object",
-  "embed",
-  "canvas",
-  "audio",
-  "video"
+    "script",
+    "style",
+    "iframe",
+    "noscript",
+    "meta",
+    "link",
+    "object",
+    "embed",
+    "canvas",
+    "audio",
+    "video",
   ];
 
   manuallyCleanedElements.forEach((tag) => $(tag).remove());
   return $("body").html();
 }
 
-
 function fixBrokenLinks(md: string): string {
   let depth = 0;
   let result = "";

From 7f48e276f1370fc28cba8d9f98582516d84243e0 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:23:04 +0530
Subject: [PATCH 63/79] chore: lint

---
 server/src/markdownify/markdown.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts
index f50136f68..eb4567f3b 100644
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -17,7 +17,7 @@ export async function parseMarkdown(
   });
 
   // ---------------------------------------------
-  // Fix 1: Proper ATX headings #### instead of underline-style
+  // Proper ATX headings #### instead of underline-style
   // ---------------------------------------------
   t.addRule("forceAtxHeadings", {
     filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
@@ -49,7 +49,7 @@ export async function parseMarkdown(
   });
 
   // ---------------------------------------------
-  // Fix 2: Inline link with fallback text
+  // Inline link with fallback text
   // ---------------------------------------------
   t.addRule("inlineLink", {
     filter: (node: any, opts: any) =>
@@ -74,7 +74,7 @@ export async function parseMarkdown(
         try {
           const u = new URL(href, baseUrl);
           href = u.toString();
-        } catch {}
+        } catch { }
       }
 
       href = cleanUrl(href);

From fef038b8cfb4dd8db1cd84c0303bab991e8ea64d Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:24:15 +0530
Subject: [PATCH 64/79] chore: cleanup unwanted deps

---
 package.json | 2 --
 1 file changed, 2 deletions(-)

diff --git a/package.json b/package.json
index 1303410d7..405571436 100644
--- a/package.json
+++ b/package.json
@@ -28,7 +28,6 @@
     "bcrypt": "^5.1.1",
     "body-parser": "^1.20.3",
     "buffer": "^6.0.3",
-    "cheerio": "^1.1.2",
     "connect-pg-simple": "^10.0.0",
     "cookie-parser": "^1.4.6",
     "cors": "^2.8.5",
@@ -50,7 +49,6 @@
     "joplin-turndown-plugin-gfm": "^1.0.12",
     "jsonwebtoken": "^9.0.2",
     "jwt-decode": "^4.0.0",
-    "koffi": "^2.14.1",
     "lodash": "^4.17.21",
     "loglevel": "^1.8.0",
     "loglevel-plugin-remote": "^0.6.8",

From 5aafe6eaaf3feb0a40402542742be2037f0f7793 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:55:47 +0530
Subject: [PATCH 65/79] feat: add html

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 88daa49b4..9d88279bb 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -370,7 +370,7 @@ const RobotCreate: React.FC = () => {
               />
 
               <Typography variant="body2" color="text.secondary" mb={3}>
-                Turn websites into LLM-ready Markdown content for AI apps.
+                Turn websites into LLM-ready Markdown & clean HTML for AI apps.
               </Typography>
 
               <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>

From 418100c1698446dd7e6ec7f9d6b99ffed1067919 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:56:37 +0530
Subject: [PATCH 66/79] feat: scrape robot

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 9d88279bb..ec01b60b9 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -200,7 +200,7 @@ const RobotCreate: React.FC = () => {
             }}
           >
             <Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
-            <Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
+            <Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
           </Tabs>
         </Box>
 

From f3c79bd30322285e6d183e2bbf2e144cff0112a5 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 17:59:30 +0530
Subject: [PATCH 67/79] feat: scrape robot

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index ec01b60b9..119e9be91 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -428,7 +428,7 @@ const RobotCreate: React.FC = () => {
                 }}
                 startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
               >
-                {isLoading ? 'Turning...' : 'Turn to Markdown'}
+                {isLoading ? 'Creating...' : 'Create Robot'}
               </Button>
             </Box>
           </Card>

From e90cd9961e1b78cd475a50bf5ff15857142d7fc5 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 18:49:39 +0530
Subject: [PATCH 68/79] feat: add html scrape support

---
 server/src/api/record.ts                      | 122 ++++++++++++++----
 server/src/markdownify/scrape.ts              |  54 ++++++++
 server/src/models/Robot.ts                    |   3 +-
 server/src/pgboss-worker.ts                   |  47 +++++--
 server/src/routes/storage.ts                  |  19 ++-
 .../workflow-management/scheduler/index.ts    |  75 ++++++++---
 src/components/robot/RecordingsTable.tsx      |  65 ++++++----
 src/components/robot/pages/RobotCreate.tsx    |  74 +++++++++--
 .../robot/pages/RobotDuplicatePage.tsx        |   3 +-
 src/components/robot/pages/RobotEditPage.tsx  |   3 +-
 .../robot/pages/RobotSettingsPage.tsx         |   3 +-
 src/context/globalInfo.tsx                    |   3 +-
 12 files changed, 366 insertions(+), 105 deletions(-)

diff --git a/server/src/api/record.ts b/server/src/api/record.ts
index fd7376abc..cbf4f67e4 100644
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -18,7 +18,7 @@ import { WorkflowFile } from "maxun-core";
 import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
 import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
 import { sendWebhook } from "../routes/webhook";
-import { convertPageToMarkdown } from '../markdownify/scrape';
+import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
 
 chromium.use(stealthPlugin());
 
@@ -346,7 +346,8 @@ function formatRunResponse(run: any) {
         data: {
             textData: {},
             listData: {},
-            markdown: ''
+            markdown: '',
+            html: ''
         },
         screenshots: [] as any[],
     };
@@ -365,6 +366,10 @@ function formatRunResponse(run: any) {
         formattedRun.data.markdown = output.markdown[0]?.content || '';
     }
 
+    if (output.html && Array.isArray(output.html)) {
+        formattedRun.data.html = output.html[0]?.content || '';
+    }
+
     if (run.binaryOutput) {
         Object.keys(run.binaryOutput).forEach(key => {
             if (run.binaryOutput[key]) {
@@ -575,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
   }
 }
 
-async function readyForRunHandler(browserId: string, id: string, userId: string){
+async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
     try {
-        const result = await executeRun(id, userId);
+        const result = await executeRun(id, userId, requestedFormats);
 
         if (result && result.success) {
             logger.log('info', `Interpretation of ${id} succeeded`);
@@ -614,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
     return copy;
 };
 
-async function executeRun(id: string, userId: string) {
+async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
     let browser: any = null;
     
     try {
@@ -657,12 +662,19 @@ async function executeRun(id: string, userId: string) {
             };
         }
 
-        if (recording.recording_meta.type === 'markdown') {
-            logger.log('info', `Executing markdown robot for API run ${id}`);
+        if (recording.recording_meta.type === 'scrape') {
+            logger.log('info', `Executing scrape robot for API run ${id}`);
+
+            let formats = recording.recording_meta.formats || ['markdown'];
+
+            // Override if API request defines formats
+            if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
+                formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
+            }
 
             await run.update({
                 status: 'running',
-                log: 'Converting page to markdown'
+                log: `Converting page to: ${formats.join(', ')}`
             });
 
             try {
@@ -672,20 +684,33 @@ async function executeRun(id: string, userId: string) {
                     throw new Error('No URL specified for markdown robot');
                 }
 
-                const markdown = await convertPageToMarkdown(url);
+                let markdown = '';
+                let html = '';
+                const serializableOutput: any = {};
+
+                // Markdown conversion
+                if (formats.includes('markdown')) {
+                    markdown = await convertPageToMarkdown(url);
+                    serializableOutput.markdown = [{ content: markdown }];
+                }
+
+                // HTML conversion
+                if (formats.includes('html')) {
+                    html = await convertPageToHTML(url);
+                    serializableOutput.html = [{ content: html }];
+                }
 
                 await run.update({
                     status: 'success',
                     finishedAt: new Date().toLocaleString(),
-                    log: 'Markdown conversion completed successfully',
-                    serializableOutput: {
-                        markdown: [{ content: markdown }]
-                    },
+                    log: `${formats.join(', ')} conversion completed successfully`,
+                    serializableOutput,
                     binaryOutput: {},
                 });
 
                 logger.log('info', `Markdown robot execution completed for API run ${id}`);
 
+                // Push success socket event
                 try {
                     const completionData = {
                         runId: plainRun.runId,
@@ -695,30 +720,45 @@ async function executeRun(id: string, userId: string) {
                         finishedAt: new Date().toLocaleString()
                     };
 
-                    serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
+                    serverIo
+                        .of('/queued-run')
+                        .to(`user-${userId}`)
+                        .emit('run-completed', completionData);
                 } catch (socketError: any) {
-                    logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
+                    logger.log(
+                        'warn',
+                        `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
+                    );
                 }
 
-                const webhookPayload = {
+                // Build webhook payload
+                const webhookPayload: any = {
                     robot_id: plainRun.robotMetaId,
                     run_id: plainRun.runId,
                     robot_name: recording.recording_meta.name,
                     status: 'success',
                     started_at: plainRun.startedAt,
                     finished_at: new Date().toLocaleString(),
-                    markdown: markdown,
                     metadata: {
                         browser_id: plainRun.browserId,
                         user_id: userId,
-                    }
+                    },
                 };
 
+                if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+                if (formats.includes('html')) webhookPayload.html = html;
+
                 try {
                     await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
-                    logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
+                    logger.log(
+                        'info',
+                        `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
+                    );
                 } catch (webhookError: any) {
-                    logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
+                    logger.log(
+                        'warn',
+                        `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
+                    );
                 }
 
                 await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -728,14 +768,18 @@ async function executeRun(id: string, userId: string) {
                     interpretationInfo: run.toJSON()
                 };
             } catch (error: any) {
-                logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);
+                logger.log(
+                    'error',
+                    `${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
+                );
 
                 await run.update({
                     status: 'failed',
                     finishedAt: new Date().toLocaleString(),
-                    log: `Markdown conversion failed: ${error.message}`,
+                    log: `${formats.join(', ')} conversion failed: ${error.message}`,
                 });
 
+                // Send failure socket event
                 try {
                     const failureData = {
                         runId: plainRun.runId,
@@ -745,9 +789,15 @@ async function executeRun(id: string, userId: string) {
                         finishedAt: new Date().toLocaleString()
                     };
 
-                    serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
+                    serverIo
+                        .of('/queued-run')
+                        .to(`user-${userId}`)
+                        .emit('run-completed', failureData);
                 } catch (socketError: any) {
-                    logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
+                    logger.log(
+                        'warn',
+                        `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
+                    );
                 }
 
                 await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -953,7 +1003,7 @@ async function executeRun(id: string, userId: string) {
     }
 }
 
-export async function handleRunRecording(id: string, userId: string) {
+export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
     try {
         const result = await createWorkflowAndStoreMetadata(id, userId);
         const { browserId, runId: newRunId } = result;
@@ -967,7 +1017,7 @@ export async function handleRunRecording(id: string, userId: string) {
             rejectUnauthorized: false
         });
 
-        socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
+        socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
 
         logger.log('info', `Running Robot: ${id}`);
 
@@ -1018,6 +1068,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
  *           type: string
  *         required: true
  *         description: The ID of the robot to run.
+ *     requestBody:
+ *       required: false
+ *       content:
+ *         application/json:
+ *           schema:
+ *             type: object
+ *             properties:
+ *               formats:
+ *                 type: array
+ *                 items:
+ *                   type: string
+ *                   enum: [markdown, html]
+ *                 description: Optional override formats for this run.
+ *           example:
+ *             formats: ["html"]
  *     responses:
  *       200:
  *         description: Robot run started successfully.
@@ -1076,7 +1141,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
         if (!req.user) {
             return res.status(401).json({ ok: false, error: 'Unauthorized' });
         }
-        const runId = await handleRunRecording(req.params.id, req.user.id);
+
+        const requestedFormats = req.body.formats;
+
+        const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
 
         if (!runId) {
             throw new Error('Run ID is undefined');
diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
index b58265a24..935fa0cb6 100644
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
   const markdown = await parseMarkdown(cleanedHtml, url);
   return markdown;
 }
+
+/**
+ * Loads `url` in a freshly launched Chromium browser, strips scripts, styles,
+ * media elements and inline event handlers from the loaded DOM, and returns
+ * what is left as an HTML string.
+ * @param url - Address of the page to load (navigation waits for network idle).
+ * @returns The serialized `<html>` element after cleanup.
+ * @throws Rejects if the browser cannot be launched or navigation fails.
+ */
+export async function convertPageToHTML(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  try {
+    const page = await browser.newPage();
+    await page.goto(url, { waitUntil: "networkidle" });
+
+    // Clean the already-loaded document. `addInitScript` only applies to
+    // later navigations, so it would never touch the page fetched above.
+    return await page.evaluate(() => {
+      const selectors = [
+        "script",
+        "style",
+        "link[rel='stylesheet']",
+        "noscript",
+        "meta",
+        "svg",
+        "img",
+        "picture",
+        "source",
+        "video",
+        "audio",
+        "iframe",
+        "object",
+        "embed"
+      ];
+
+      selectors.forEach(sel => {
+        document.querySelectorAll(sel).forEach(e => e.remove());
+      });
+
+      // Remove inline event handlers (onclick, onload…)
+      document.querySelectorAll("*").forEach(el => {
+        [...el.attributes].forEach(attr => {
+          if (attr.name.startsWith("on")) {
+            el.removeAttribute(attr.name);
+          }
+        });
+      });
+
+      return document.documentElement.outerHTML;
+    });
+  } finally {
+    // Always release the browser, including when navigation throws.
+    await browser.close();
+  }
+}
diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts
index 5acbdf133..39218de24 100644
--- a/server/src/models/Robot.ts
+++ b/server/src/models/Robot.ts
@@ -9,8 +9,9 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: 'traditional' | 'markdown';
+  type?: 'extract' | 'scrape';
   url?: string;
+  formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {
diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts
index b2d5bdb30..66e852b85 100644
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -20,7 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
 import { io as serverIo } from "./server";
 import { sendWebhook } from './routes/webhook';
 import { BinaryOutputService } from './storage/mino';
-import { convertPageToMarkdown } from './markdownify/scrape';
+import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
 
 if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
     throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -189,12 +189,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
         throw new Error(`Recording for run ${data.runId} not found`);
       }
 
-      if (recording.recording_meta.type === 'markdown') {
-        logger.log('info', `Executing markdown robot for run ${data.runId}`);
+      if (recording.recording_meta.type === 'scrape') {
+        logger.log('info', `Executing scrape robot for run ${data.runId}`);
+
+        const formats = recording.recording_meta.formats || ['markdown'];
 
         await run.update({
           status: 'running',
-          log: 'Converting page to markdown'
+          log: `Converting page to ${formats.join(', ')}`
         });
 
         try {
@@ -204,20 +206,34 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
             throw new Error('No URL specified for markdown robot');
           }
 
-          const markdown = await convertPageToMarkdown(url);
+          let markdown = '';
+          let html = '';
+          const serializableOutput: any = {};
+
+          // Markdown conversion
+          if (formats.includes('markdown')) {
+            markdown = await convertPageToMarkdown(url);
+            serializableOutput.markdown = [{ content: markdown }];
+          }
+
+          // HTML conversion
+          if (formats.includes('html')) {
+            html = await convertPageToHTML(url);
+            serializableOutput.html = [{ content: html }];
+          }
 
+          // Success update
           await run.update({
             status: 'success',
             finishedAt: new Date().toLocaleString(),
-            log: 'Markdown conversion completed successfully',
-            serializableOutput: {
-              markdown: [{ content: markdown }]
-            },
+            log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
+            serializableOutput,
             binaryOutput: {},
           });
 
           logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
 
+          // Notify sockets
           try {
             const completionData = {
               runId: data.runId,
@@ -233,15 +249,19 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
             logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
           }
 
+          // Webhooks
           try {
-            const webhookPayload = {
+            const webhookPayload: any = {
               runId: data.runId,
               robotId: plainRun.robotMetaId,
               robotName: recording.recording_meta.name,
               status: 'success',
               finishedAt: new Date().toLocaleString(),
-              markdown: markdown
             };
+
+            if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+            if (formats.includes('html')) webhookPayload.html = html;
+
             await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
             logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
           } catch (webhookError: any) {
@@ -251,13 +271,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
           await destroyRemoteBrowser(browserId, data.userId);
 
           return { success: true };
+
         } catch (error: any) {
-          logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);
+          logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
 
           await run.update({
             status: 'failed',
             finishedAt: new Date().toLocaleString(),
-            log: `Markdown conversion failed: ${error.message}`,
+            log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
           });
 
           try {
diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts
index ee23ee442..44279e9cd 100644
--- a/server/src/routes/storage.ts
+++ b/server/src/routes/storage.ts
@@ -440,9 +440,9 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
 /**
  * POST endpoint for creating a markdown robot
  */
-router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
+router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
   try {
-    const { url, name } = req.body;
+    const { url, name, formats } = req.body;
 
     if (!url) {
       return res.status(400).json({ error: 'The "url" field is required.' });
@@ -459,6 +459,18 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
       return res.status(400).json({ error: 'Invalid URL format' });
     }
 
+    // Validate format
+    const validFormats = ['markdown', 'html'];
+
+    if (!Array.isArray(formats) || formats.length === 0) {
+      return res.status(400).json({ error: 'At least one output format must be selected.' });
+    }
+
+    const invalid = formats.filter(f => !validFormats.includes(f));
+    if (invalid.length > 0) {
+      return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
+    }
+
     const robotName = name || `Markdown Robot - ${new URL(url).hostname}`;
     const currentTimestamp = new Date().toLocaleString();
     const robotId = uuid();
@@ -473,8 +485,9 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
         updatedAt: currentTimestamp,
         pairs: 0,
         params: [],
-        type: 'markdown',
+        type: 'scrape',
         url: url,
+        formats: formats,
       },
       recording: { workflow: [] },
       google_sheet_email: null,
diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts
index 7c2cb4085..d5ba76f40 100644
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core";
 import { Page } from "playwright";
 import { sendWebhook } from "../../routes/webhook";
 import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
-import { convertPageToMarkdown } from "../../markdownify/scrape";
+import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
 chromium.use(stealthPlugin());
 
 async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) {
       }
     }
 
-    if (recording.recording_meta.type === 'markdown') {
-      logger.log('info', `Executing markdown robot for scheduled run ${id}`);
+    if (recording.recording_meta.type === 'scrape') {
+      logger.log('info', `Executing scrape robot for scheduled run ${id}`);
+
+      const formats = recording.recording_meta.formats || ['markdown'];
 
       await run.update({
         status: 'running',
-        log: 'Converting page to markdown'
+        log: `Converting page to: ${formats.join(', ')}`
       });
 
       try {
@@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) {
         };
 
         serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
-        logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
+        logger.log(
+          'info',
+          `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
+        );
       } catch (socketError: any) {
-        logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
+        logger.log(
+          'warn',
+          `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
+        );
       }
 
       try {
@@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) {
           throw new Error('No URL specified for markdown robot');
         }
 
-        const markdown = await convertPageToMarkdown(url);
+        let markdown = '';
+        let html = '';
+        const serializableOutput: any = {};
+
+        // Markdown conversion
+        if (formats.includes('markdown')) {
+          markdown = await convertPageToMarkdown(url);
+          serializableOutput.markdown = [{ content: markdown }];
+        }
+
+        // HTML conversion
+        if (formats.includes('html')) {
+          html = await convertPageToHTML(url);
+          serializableOutput.html = [{ content: html }];
+        }
 
         await run.update({
           status: 'success',
           finishedAt: new Date().toLocaleString(),
-          log: 'Markdown conversion completed successfully',
-          serializableOutput: {
-            markdown: [{ content: markdown }]
-          },
+          log: `${formats.join(', ')} conversion completed successfully`,
+          serializableOutput,
           binaryOutput: {},
         });
 
         logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
 
+        // Run-completed socket notifications
         try {
           const completionData = {
             runId: plainRun.runId,
@@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) {
           serverIo.of(plainRun.browserId).emit('run-completed', completionData);
           serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
         } catch (socketError: any) {
-          logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
+          logger.log(
+            'warn',
+            `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
+          );
         }
 
-        const webhookPayload = {
+        // Webhook payload
+        const webhookPayload: any = {
           robot_id: plainRun.robotMetaId,
           run_id: plainRun.runId,
           robot_name: recording.recording_meta.name,
           status: 'success',
           started_at: plainRun.startedAt,
           finished_at: new Date().toLocaleString(),
-          markdown: markdown,
           metadata: {
             browser_id: plainRun.browserId,
             user_id: userId,
           }
         };
 
+        if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+        if (formats.includes('html')) webhookPayload.html = html;
+
         try {
           await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
-          logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
+          logger.log(
+            'info',
+            `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
+          );
         } catch (webhookError: any) {
-          logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
+          logger.log(
+            'warn',
+            `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
+          );
         }
 
         await destroyRemoteBrowser(plainRun.browserId, userId);
 
         return true;
+
       } catch (error: any) {
-        logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
+        logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
 
         await run.update({
           status: 'failed',
           finishedAt: new Date().toLocaleString(),
-          log: `Markdown conversion failed: ${error.message}`,
+          log: `${formats.join(', ')} conversion failed: ${error.message}`,
         });
 
         try {
@@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) {
           serverIo.of(plainRun.browserId).emit('run-completed', failureData);
           serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
         } catch (socketError: any) {
-          logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
+          logger.log(
+            'warn',
+            `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
+          );
         }
 
         await destroyRemoteBrowser(plainRun.browserId, userId);
diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx
index f06270ed3..79319b923 100644
--- a/src/components/robot/RecordingsTable.tsx
+++ b/src/components/robot/RecordingsTable.tsx
@@ -110,7 +110,10 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
             case 'integrate':
               return (
                 <MemoizedTableCell key={column.id} align={column.align}>
-                  <MemoizedIntegrateButton handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} />
+                  <MemoizedIntegrateButton
+                    handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])}
+                    robotType={row.type}
+                  />
                 </MemoizedTableCell>
               );
             case 'options':
@@ -121,6 +124,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
                     handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])}
                     handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])}
                     handleDelete={() => handlers.handleDelete(row.id)}
+                    robotType={row.type}
                   />
                 </MemoizedTableCell>
               );
@@ -709,13 +713,22 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => {
 
 interface IntegrateButtonProps {
   handleIntegrate: () => void;
+  robotType: string;
 }
 
-const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => {
+const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => {
+  const isDisabled = robotType === 'scrape';
+
   return (
-    <IconButton aria-label="add" size="small" onClick={() => {
-      handleIntegrate();
-    }}
+    <IconButton
+      aria-label="integrate"
+      size="small"
+      onClick={isDisabled ? undefined : handleIntegrate}
+      disabled={isDisabled}
+      sx={{
+        opacity: isDisabled ? 0.4 : 1,
+        cursor: isDisabled ? 'not-allowed' : 'pointer',
+      }}
     >
       <Power />
     </IconButton>
@@ -742,9 +755,10 @@ interface OptionsButtonProps {
   handleEdit: () => void;
   handleDelete: () => void;
   handleDuplicate: () => void;
+  robotType: string;
 }
 
-const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => {
+const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => {
   const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null);
 
   const handleClick = (event: React.MouseEvent<HTMLElement>) => {
@@ -771,34 +785,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat
         open={Boolean(anchorEl)}
         onClose={handleClose}
       >
-        <MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
-          <ListItemIcon>
-            <Refresh fontSize="small" />
-          </ListItemIcon>
-          <ListItemText>{t('recordingtable.retrain')}</ListItemText>
-        </MenuItem>
+        {robotType !== 'scrape' && (
+          <MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
+            <ListItemIcon>
+              <Refresh fontSize="small" />
+            </ListItemIcon>
+            <ListItemText>Retrain</ListItemText>
+          </MenuItem>
+        )}
 
         <MenuItem onClick={() => { handleEdit(); handleClose(); }}>
-          <ListItemIcon>
-            <Edit fontSize="small" />
-          </ListItemIcon>
-          <ListItemText>{t('recordingtable.edit')}</ListItemText>
+          <ListItemIcon><Edit fontSize="small" /></ListItemIcon>
+          <ListItemText>Edit</ListItemText>
         </MenuItem>
 
         <MenuItem onClick={() => { handleDelete(); handleClose(); }}>
-          <ListItemIcon>
-            <DeleteForever fontSize="small" />
-          </ListItemIcon>
-          <ListItemText>{t('recordingtable.delete')}</ListItemText>
+          <ListItemIcon><DeleteForever fontSize="small" /></ListItemIcon>
+          <ListItemText>Delete</ListItemText>
         </MenuItem>
 
-        <MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
-          <ListItemIcon>
-            <ContentCopy fontSize="small" />
-          </ListItemIcon>
-          <ListItemText>{t('recordingtable.duplicate')}</ListItemText>
-        </MenuItem>
+        {robotType !== 'scrape' && (
+          <MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
+            <ListItemIcon><ContentCopy fontSize="small" /></ListItemIcon>
+            <ListItemText>Duplicate</ListItemText>
+          </MenuItem>
+        )}
       </Menu>
+
     </>
   );
 };
diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 88daa49b4..312d7bae7 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -15,12 +15,16 @@ import {
   Container,
   CardContent,
   Tabs,
-  Tab
+  Tab,
+  RadioGroup,
+  Radio,
+  FormControl,
+  FormLabel
 } from '@mui/material';
 import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
 import { useGlobalInfoStore } from '../../../context/globalInfo';
 import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
-import { createMarkdownRobot } from "../../../api/storage";
+import { createScrapeRobot } from "../../../api/storage";
 import { AuthContext } from '../../../context/auth';
 import { GenericModal } from '../../ui/GenericModal';
 
@@ -54,11 +58,12 @@ const RobotCreate: React.FC = () => {
 
   const [tabValue, setTabValue] = useState(0);
   const [url, setUrl] = useState('');
-  const [markdownRobotName, setMarkdownRobotName] = useState('');
+  const [scrapeRobotName, setScrapeRobotName] = useState('');
   const [needsLogin, setNeedsLogin] = useState(false);
   const [isLoading, setIsLoading] = useState(false);
   const [isWarningModalOpen, setWarningModalOpen] = useState(false);
   const [activeBrowserId, setActiveBrowserId] = useState('');
+  const [outputFormats, setOutputFormats] = useState<string[]>([]);
 
   const { state } = React.useContext(AuthContext);
   const { user } = state;
@@ -200,7 +205,7 @@ const RobotCreate: React.FC = () => {
             }}
           >
             <Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
-            <Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
+            <Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
           </Tabs>
         </Box>
 
@@ -370,7 +375,7 @@ const RobotCreate: React.FC = () => {
               />
 
               <Typography variant="body2" color="text.secondary" mb={3}>
-                Turn websites into LLM-ready Markdown content for AI apps.
+                Turn websites into LLM-ready Markdown or clean HTML content for AI apps.
               </Typography>
 
               <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
@@ -378,8 +383,8 @@ const RobotCreate: React.FC = () => {
                   placeholder="Example: YC Companies Scraper"
                   variant="outlined"
                   fullWidth
-                  value={markdownRobotName}
-                  onChange={(e) => setMarkdownRobotName(e.target.value)}
+                  value={scrapeRobotName}
+                  onChange={(e) => setScrapeRobotName(e.target.value)}
                   sx={{ mb: 2 }}
                   label="Robot Name"
                 />
@@ -390,7 +395,44 @@ const RobotCreate: React.FC = () => {
                   value={url}
                   onChange={(e) => setUrl(e.target.value)}
                   label="Website URL"
+                  sx={{ mb: 2 }}
                 />
+
+                <FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
+                  <FormLabel component="legend" sx={{ mb: 1 }}>Output Format (Select at least one)</FormLabel>
+
+                  <FormControlLabel
+                    control={
+                      <Checkbox
+                        checked={outputFormats.includes('markdown')}
+                        onChange={(e) => {
+                          if (e.target.checked) {
+                            setOutputFormats([...outputFormats, 'markdown']);
+                          } else {
+                            setOutputFormats(outputFormats.filter(f => f !== 'markdown'));
+                          }
+                        }}
+                      />
+                    }
+                    label="Markdown"
+                  />
+
+                  <FormControlLabel
+                    control={
+                      <Checkbox
+                        checked={outputFormats.includes('html')}
+                        onChange={(e) => {
+                          if (e.target.checked) {
+                            setOutputFormats([...outputFormats, 'html']);
+                          } else {
+                            setOutputFormats(outputFormats.filter(f => f !== 'html'));
+                          }
+                        }}
+                      />
+                    }
+                    label="HTML"
+                  />
+                </FormControl>
               </Box>
 
               <Button
@@ -401,23 +443,28 @@ const RobotCreate: React.FC = () => {
                     notify('error', 'Please enter a valid URL');
                     return;
                   }
-                  if (!markdownRobotName.trim()) {
+                  if (!scrapeRobotName.trim()) {
                     notify('error', 'Please enter a robot name');
                     return;
                   }
+                  if (outputFormats.length === 0) {
+                    notify('error', 'Please select at least one output format');
+                    return;
+                  }
+
                   setIsLoading(true);
-                  const result = await createMarkdownRobot(url, markdownRobotName);
+                  const result = await createScrapeRobot(url, scrapeRobotName, outputFormats);
                   setIsLoading(false);
 
                   if (result) {
                     setRerenderRobots(true);
-                    notify('success', `${markdownRobotName} created successfully!`);
+                    notify('success', `${scrapeRobotName} created successfully!`);
                     navigate('/robots');
                   } else {
                     notify('error', 'Failed to create markdown robot');
                   }
                 }}
-                disabled={!url.trim() || !markdownRobotName.trim() || isLoading}
+                disabled={!url.trim() || !scrapeRobotName.trim() || outputFormats.length === 0 || isLoading}
                 sx={{
                   bgcolor: '#ff00c3',
                   py: 1.4,
@@ -428,7 +475,10 @@ const RobotCreate: React.FC = () => {
                 }}
                 startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
               >
-                {isLoading ? 'Turning...' : 'Turn to Markdown'}
+                {isLoading
+                  ? "Creating..."
+                  : `Create ${outputFormats.join(" + ").toUpperCase()} Robot`
+                }
               </Button>
             </Box>
           </Card>
diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx
index 7c45c8e83..ac602f8e1 100644
--- a/src/components/robot/pages/RobotDuplicatePage.tsx
+++ b/src/components/robot/pages/RobotDuplicatePage.tsx
@@ -24,8 +24,9 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: 'traditional' | 'markdown';
+  type?: 'extract' | 'scrape';
   url?: string;
+  formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {
diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx
index 19b9e43b2..53424bb2f 100644
--- a/src/components/robot/pages/RobotEditPage.tsx
+++ b/src/components/robot/pages/RobotEditPage.tsx
@@ -24,8 +24,9 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: 'traditional' | 'markdown';
+  type?: 'extract' | 'scrape';
   url?: string;
+  formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {
diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx
index 96b7d3ecf..f0f2f6ae0 100644
--- a/src/components/robot/pages/RobotSettingsPage.tsx
+++ b/src/components/robot/pages/RobotSettingsPage.tsx
@@ -16,8 +16,9 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: 'traditional' | 'markdown';
+  type?: 'extract' | 'scrape';
   url?: string;
+  formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {
diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx
index a0c79622a..973714b79 100644
--- a/src/context/globalInfo.tsx
+++ b/src/context/globalInfo.tsx
@@ -27,8 +27,9 @@ interface RobotMeta {
     pairs: number;
     updatedAt: string;
     params: any[];
-    type?: 'traditional' | 'markdown';
+    type?: 'extract' | 'scrape';
     url?: string;
+    formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {

From c89b2afed624df52c1f76cd0129f4265528cef16 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 18:52:28 +0530
Subject: [PATCH 69/79] feat: modify scrape api to support html

---
 src/api/storage.ts | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/api/storage.ts b/src/api/storage.ts
index ca4d975cf..d2b28d5e2 100644
--- a/src/api/storage.ts
+++ b/src/api/storage.ts
@@ -28,15 +28,24 @@ export const getStoredRecordings = async (): Promise<string[] | null> => {
   }
 };
 
-export const createMarkdownRobot = async (url: string, name?: string): Promise<any> => {
+export const createScrapeRobot = async (
+  url: string,
+  name?: string,
+  formats: string[] = ['markdown']
+): Promise<any> => {
   try {
-    const response = await axios.post(`${apiUrl}/storage/recordings/markdown`, {
-      url,
-      name,
-    }, {
-      headers: { 'Content-Type': 'application/json' },
-      withCredentials: true
-    });
+    const response = await axios.post(
+      `${apiUrl}/storage/recordings/scrape`,
+      {
+        url,
+        name,
+        formats,
+      },
+      {
+        headers: { 'Content-Type': 'application/json' },
+        withCredentials: true,
+      }
+    );
 
     if (response.status === 201) {
       return response.data;

From 0987183bac73f5875c98798e9639428474cc3379 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 18:59:32 +0530
Subject: [PATCH 70/79] chore: increase scrape goto timeout to 100s

---
 server/src/markdownify/scrape.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
index 935fa0cb6..6821bfdb7 100644
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -9,7 +9,7 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
   const browser = await chromium.launch();
   const page = await browser.newPage();
 
-  await page.goto(url, { waitUntil: "networkidle" });
+  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
 
   await page.addInitScript(() => {
     const selectors = [
@@ -64,7 +64,7 @@ export async function convertPageToHTML(url: string): Promise<string> {
   const browser = await chromium.launch();
   const page = await browser.newPage();
 
-  await page.goto(url, { waitUntil: "networkidle" });
+  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
 
   await page.addInitScript(() => {
     const selectors = [

From ac0c70ebfe6f4f24b49677f8e32965ed1f2d9f90 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 19:18:11 +0530
Subject: [PATCH 71/79] feat: disable sheets and airtable for scrape robots

---
 src/components/robot/RecordingsTable.tsx      | 22 ++-----
 .../robot/pages/RobotIntegrationPage.tsx      | 59 ++++++++++++++-----
 2 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx
index 79319b923..aed9ea749 100644
--- a/src/components/robot/RecordingsTable.tsx
+++ b/src/components/robot/RecordingsTable.tsx
@@ -110,10 +110,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
             case 'integrate':
               return (
                 <MemoizedTableCell key={column.id} align={column.align}>
-                  <MemoizedIntegrateButton
-                    handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])}
-                    robotType={row.type}
-                  />
+                  <MemoizedIntegrateButton handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} />
                 </MemoizedTableCell>
               );
             case 'options':
@@ -713,22 +710,13 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => {
 
 interface IntegrateButtonProps {
   handleIntegrate: () => void;
-  robotType: string;
 }
 
-const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => {
-  const isDisabled = robotType === 'scrape';
-
+const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => {
   return (
-    <IconButton
-      aria-label="integrate"
-      size="small"
-      onClick={isDisabled ? undefined : handleIntegrate}
-      disabled={isDisabled}
-      sx={{
-        opacity: isDisabled ? 0.4 : 1,
-        cursor: isDisabled ? 'not-allowed' : 'pointer',
-      }}
+    <IconButton aria-label="add" size="small" onClick={() => {
+      handleIntegrate();
+    }}
     >
       <Power />
     </IconButton>
diff --git a/src/components/robot/pages/RobotIntegrationPage.tsx b/src/components/robot/pages/RobotIntegrationPage.tsx
index 3c8425901..9bedf3a54 100644
--- a/src/components/robot/pages/RobotIntegrationPage.tsx
+++ b/src/components/robot/pages/RobotIntegrationPage.tsx
@@ -128,6 +128,8 @@ export const RobotIntegrationPage = ({
     "googleSheets" | "airtable" | "webhook" | null
   >(integrationType);
 
+  const isScrapeRobot = recording?.recording_meta?.type === "scrape";
+
   const authenticateWithGoogle = () => {
     if (!recordingId) {
       console.error("Cannot authenticate: recordingId is null");
@@ -729,26 +731,55 @@ export const RobotIntegrationPage = ({
                 width: "100%",
               }}
             >
-              <Button variant="outlined" onClick={() => {
-                if (!recordingId) return;
-                setSelectedIntegrationType("googleSheets");
-                setSettings({ ...settings, integrationType: "googleSheets" });
-                const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
-                navigate(`${basePath}/${recordingId}/integrate/googleSheets`);
-              }} style={{ display: "flex", flexDirection: "column", alignItems: "center", background: 'white', color: '#ff00c3' }}>
+              <Button
+                variant="outlined"
+                disabled={isScrapeRobot}
+                onClick={() => {
+                  if (isScrapeRobot) return;
+                  if (!recordingId) return;
+                  setSelectedIntegrationType("googleSheets");
+                  setSettings({ ...settings, integrationType: "googleSheets" });
+                  const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
+                  navigate(`${basePath}/${recordingId}/integrate/googleSheets`);
+                }}
+                style={{
+                  display: "flex",
+                  flexDirection: "column",
+                  alignItems: "center",
+                  background: 'white',
+                  color: isScrapeRobot ? "#aaa" : "#ff00c3",
+                  opacity: isScrapeRobot ? 0.5 : 1,
+                  cursor: isScrapeRobot ? "not-allowed" : "pointer",
+                }}
+              >
                 <img src="https://ik.imagekit.io/ys1blv5kv/gsheet.svg" alt="Google Sheets" style={{ margin: "6px" }} />
                 Google Sheets
               </Button>
-              <Button variant="outlined" onClick={() => {
-                if (!recordingId) return;
-                setSelectedIntegrationType("airtable");
-                setSettings({ ...settings, integrationType: "airtable" });
-                const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
-                navigate(`${basePath}/${recordingId}/integrate/airtable`);
-              }} style={{ display: "flex", flexDirection: "column", alignItems: "center", background: 'white', color: '#ff00c3' }}>
+              <Button
+                variant="outlined"
+                disabled={isScrapeRobot}
+                onClick={() => {
+                  if (isScrapeRobot) return;
+                  if (!recordingId) return;
+                  setSelectedIntegrationType("airtable");
+                  setSettings({ ...settings, integrationType: "airtable" });
+                  const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
+                  navigate(`${basePath}/${recordingId}/integrate/airtable`);
+                }}
+                style={{
+                  display: "flex",
+                  flexDirection: "column",
+                  alignItems: "center",
+                  background: 'white',
+                  color: isScrapeRobot ? "#aaa" : "#ff00c3",
+                  opacity: isScrapeRobot ? 0.5 : 1,
+                  cursor: isScrapeRobot ? "not-allowed" : "pointer",
+                }}
+              >
                 <img src="https://ik.imagekit.io/ys1blv5kv/airtable.svg" alt="Airtable" style={{ margin: "6px" }} />
                 Airtable
               </Button>
+
               <Button variant="outlined" onClick={() => {
                 if (!recordingId) return;
                 setSelectedIntegrationType("webhook");

From f646713fe9106eda37b3d791e42f3f59092c3586 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 19:22:39 +0530
Subject: [PATCH 72/79] fix: format

---
 src/components/robot/pages/RobotCreate.tsx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 65423a994..b7f1a2e9e 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -400,7 +400,6 @@ const RobotCreate: React.FC = () => {
 
                 <FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
                   <FormLabel component="legend" sx={{ mb: 1 }}>Output Format (Select at least one)</FormLabel>
-
                   <FormControlLabel
                     control={
                       <Checkbox

From 174a09fd64848a9feee37c1d996ed1c663c6116e Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 19:29:50 +0530
Subject: [PATCH 73/79] fix: use p instead of formlabel

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index b7f1a2e9e..1b2fc0a8d 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -399,7 +399,7 @@ const RobotCreate: React.FC = () => {
                 />
 
                 <FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
-                  <FormLabel component="legend" sx={{ mb: 1 }}>Output Format (Select at least one)</FormLabel>
+                  <p style={{ marginBottom: 1 }}>Output Format (Select at least one)</p>
                   <FormControlLabel
                     control={
                       <Checkbox

From 565c858aff3f50ed945b154f2038d52a340190fe Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 19:30:20 +0530
Subject: [PATCH 74/79] fix: remove margin

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 1b2fc0a8d..13d4a2471 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -399,7 +399,7 @@ const RobotCreate: React.FC = () => {
                 />
 
                 <FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
-                  <p style={{ marginBottom: 1 }}>Output Format (Select at least one)</p>
+                  <p >Output Format (Select at least one)</p>
                   <FormControlLabel
                     control={
                       <Checkbox

From 6477feeaea100c2c8c1067a8476cba0bc729ad10 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 19:32:08 +0530
Subject: [PATCH 75/79] fix: don't show selected output format

---
 src/components/robot/pages/RobotCreate.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 13d4a2471..794ccd8a7 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -399,7 +399,7 @@ const RobotCreate: React.FC = () => {
                 />
 
                 <FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
-                  <p >Output Format (Select at least one)</p>
+                  <p>Output Format (Select at least one)</p>
                   <FormControlLabel
                     control={
                       <Checkbox
@@ -476,7 +476,7 @@ const RobotCreate: React.FC = () => {
               >
                 {isLoading
                   ? "Creating..."
-                  : `Create ${outputFormats.join(" + ").toUpperCase()} Robot`
+                  : `Create Robot`
                 }
               </Button>
             </Box>

From b2b5a914e7826077ad619fa295e197c943547766 Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 19:40:48 +0530
Subject: [PATCH 76/79] chore: add telemetry for scrape robots and runs

---
 server/src/api/record.ts                         | 16 ++++++++++++++++
 server/src/pgboss-worker.ts                      | 16 ++++++++++++++++
 server/src/routes/storage.ts                     |  6 +++---
 .../src/workflow-management/scheduler/index.ts   | 16 ++++++++++++++++
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/server/src/api/record.ts b/server/src/api/record.ts
index cbf4f67e4..7c665001e 100644
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -761,6 +761,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
                     );
                 }
 
+                capture("maxun-oss-run-created-api", {
+                    runId: plainRun.runId,
+                    user_id: userId,
+                    status: "success",
+                    robot_type: "scrape",
+                    formats
+                });
+
                 await destroyRemoteBrowser(plainRun.browserId, userId);
 
                 return {
@@ -800,6 +808,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
                     );
                 }
 
+                capture("maxun-oss-run-created-api", {
+                    runId: plainRun.runId,
+                    user_id: userId,
+                    status: "failed",
+                    robot_type: "scrape",
+                    formats
+                });
+
                 await destroyRemoteBrowser(plainRun.browserId, userId);
 
                 throw error;
diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts
index 66e852b85..f5d719b46 100644
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -268,6 +268,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
             logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
           }
 
+          capture("maxun-oss-run-created-manual", {
+            runId: data.runId,
+            user_id: data.userId,
+            status: "success",
+            robot_type: "scrape",
+            formats,
+          });
+
           await destroyRemoteBrowser(browserId, data.userId);
 
           return { success: true };
@@ -296,6 +304,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
             logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
           }
 
+          capture("maxun-oss-run-created-manual", {
+            runId: data.runId,
+            user_id: data.userId,
+            status: "failed",
+            robot_type: "scrape",
+            formats,
+          });
+
           await destroyRemoteBrowser(browserId, data.userId);
 
           throw error;
diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts
index 44279e9cd..8451c7205 100644
--- a/server/src/routes/storage.ts
+++ b/server/src/routes/storage.ts
@@ -500,12 +500,12 @@ router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedReques
 
     logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
     capture(
-      'maxun-oss-markdown-robot-created',
+      'maxun-oss-robot-created',
       {
         robot_meta: newRobot.recording_meta,
-        url: url,
+        recording: newRobot.recording,
       }
-    );
+    )
 
     return res.status(201).json({
       message: 'Markdown robot created successfully.',
diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts
index d5ba76f40..470cdacb3 100644
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -321,6 +321,14 @@ async function executeRun(id: string, userId: string) {
           );
         }
 
+        capture("maxun-oss-run-created-scheduled", {
+          runId: plainRun.runId,
+          user_id: userId,
+          status: "success",
+          robot_type: "scrape",
+          formats
+        });
+
         await destroyRemoteBrowser(plainRun.browserId, userId);
 
         return true;
@@ -352,6 +360,14 @@ async function executeRun(id: string, userId: string) {
           );
         }
 
+        capture("maxun-oss-run-created-scheduled", {
+          runId: plainRun.runId,
+          user_id: userId,
+          status: "failed",
+          robot_type: "scrape",
+          formats
+        });
+
         await destroyRemoteBrowser(plainRun.browserId, userId);
 
         throw error;

From 467ffe39fa8575ee2821f5149a7065ed316da95a Mon Sep 17 00:00:00 2001
From: Rohit Rajan <rohit.rajan031101@gmail.com>
Date: Thu, 20 Nov 2025 20:38:03 +0530
Subject: [PATCH 77/79] feat: hide Google Sheets and Airtable integrations for scrape robots

---
 .../robot/pages/RobotIntegrationPage.tsx      | 102 +++++++++---------
 1 file changed, 54 insertions(+), 48 deletions(-)

diff --git a/src/components/robot/pages/RobotIntegrationPage.tsx b/src/components/robot/pages/RobotIntegrationPage.tsx
index 9bedf3a54..8905abe21 100644
--- a/src/components/robot/pages/RobotIntegrationPage.tsx
+++ b/src/components/robot/pages/RobotIntegrationPage.tsx
@@ -731,54 +731,60 @@ export const RobotIntegrationPage = ({
                 width: "100%",
               }}
             >
-              <Button
-                variant="outlined"
-                disabled={isScrapeRobot}
-                onClick={() => {
-                  if (isScrapeRobot) return;
-                  if (!recordingId) return;
-                  setSelectedIntegrationType("googleSheets");
-                  setSettings({ ...settings, integrationType: "googleSheets" });
-                  const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
-                  navigate(`${basePath}/${recordingId}/integrate/googleSheets`);
-                }}
-                style={{
-                  display: "flex",
-                  flexDirection: "column",
-                  alignItems: "center",
-                  background: 'white',
-                  color: isScrapeRobot ? "#aaa" : "#ff00c3",
-                  opacity: isScrapeRobot ? 0.5 : 1,
-                  cursor: isScrapeRobot ? "not-allowed" : "pointer",
-                }}
-              >
-                <img src="https://ik.imagekit.io/ys1blv5kv/gsheet.svg" alt="Google Sheets" style={{ margin: "6px" }} />
-                Google Sheets
-              </Button>
-              <Button
-                variant="outlined"
-                disabled={isScrapeRobot}
-                onClick={() => {
-                  if (isScrapeRobot) return;
-                  if (!recordingId) return;
-                  setSelectedIntegrationType("airtable");
-                  setSettings({ ...settings, integrationType: "airtable" });
-                  const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
-                  navigate(`${basePath}/${recordingId}/integrate/airtable`);
-                }}
-                style={{
-                  display: "flex",
-                  flexDirection: "column",
-                  alignItems: "center",
-                  background: 'white',
-                  color: isScrapeRobot ? "#aaa" : "#ff00c3",
-                  opacity: isScrapeRobot ? 0.5 : 1,
-                  cursor: isScrapeRobot ? "not-allowed" : "pointer",
-                }}
-              >
-                <img src="https://ik.imagekit.io/ys1blv5kv/airtable.svg" alt="Airtable" style={{ margin: "6px" }} />
-                Airtable
-              </Button>
+              {!isScrapeRobot && (
+                <Button
+                  variant="outlined"
+                  onClick={() => {
+                    if (!recordingId) return;
+                    setSelectedIntegrationType("googleSheets");
+                    setSettings({ ...settings, integrationType: "googleSheets" });
+                    const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
+                    navigate(`${basePath}/${recordingId}/integrate/googleSheets`);
+                  }}
+                  style={{
+                    display: "flex",
+                    flexDirection: "column",
+                    alignItems: "center",
+                    background: "white",
+                    color: "#ff00c3",
+                  }}
+                >
+                  <img
+                    src="https://ik.imagekit.io/ys1blv5kv/gsheet.svg"
+                    alt="Google Sheets"
+                    style={{ margin: "6px" }}
+                  />
+                  Google Sheets
+                </Button>
+              )}
+
+              {!isScrapeRobot && (
+                <Button
+                  variant="outlined"
+                  onClick={() => {
+                    if (!recordingId) return;
+                    setSelectedIntegrationType("airtable");
+                    setSettings({ ...settings, integrationType: "airtable" });
+                    const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
+                    navigate(`${basePath}/${recordingId}/integrate/airtable`);
+                  }}
+                  style={{
+                    display: "flex",
+                    flexDirection: "column",
+                    alignItems: "center",
+                    background: "white",
+                    color: "#ff00c3",
+                  }}
+                >
+                  <img
+                    src="https://ik.imagekit.io/ys1blv5kv/airtable.svg"
+                    alt="Airtable"
+                    style={{ margin: "6px" }}
+                  />
+                  Airtable
+                </Button>
+              )}
+
 
               <Button variant="outlined" onClick={() => {
                 if (!recordingId) return;

From 25fd74e1b22549bcd114bb7b28df5f3b1d98edd2 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 20:56:56 +0530
Subject: [PATCH 78/79] feat: center tabs

---
 src/components/robot/pages/RobotCreate.tsx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index 794ccd8a7..a440b5dd2 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -189,6 +189,7 @@ const RobotCreate: React.FC = () => {
         <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2 }}>
           <Tabs
             value={tabValue}
+            centered
             onChange={handleTabChange}
             aria-label="robot type tabs"
             sx={{

From a1b21178661a7eb71e4938ec1d06d24aa2bed0b3 Mon Sep 17 00:00:00 2001
From: amhsirak <karishmashuklaa@gmail.com>
Date: Thu, 20 Nov 2025 21:01:37 +0530
Subject: [PATCH 79/79] fix: less gap

---
 src/components/robot/pages/RobotCreate.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx
index a440b5dd2..486f3bff3 100644
--- a/src/components/robot/pages/RobotCreate.tsx
+++ b/src/components/robot/pages/RobotCreate.tsx
@@ -186,7 +186,7 @@ const RobotCreate: React.FC = () => {
           </Typography>
         </Box>
 
-        <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2 }}>
+        <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2, mt: "-30px" }}>
           <Tabs
             value={tabValue}
             centered