diff --git a/package.json b/package.json index c70b0fef5..405571436 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "idcac-playwright": "^0.1.3", "ioredis": "^5.4.1", "joi": "^17.6.0", + "joplin-turndown-plugin-gfm": "^1.0.12", "jsonwebtoken": "^9.0.2", "jwt-decode": "^4.0.0", "lodash": "^4.17.21", @@ -80,6 +81,7 @@ "styled-components": "^5.3.3", "swagger-jsdoc": "^6.2.8", "swagger-ui-express": "^5.0.1", + "turndown": "^7.2.2", "typedoc": "^0.23.8", "typescript": "^5.0.0", "uuid": "^8.3.2", @@ -126,6 +128,7 @@ "@types/styled-components": "^5.1.23", "@types/swagger-jsdoc": "^6.0.4", "@types/swagger-ui-express": "^4.1.6", + "@types/turndown": "^5.0.6", "@vitejs/plugin-react": "^4.3.3", "ajv": "^8.8.2", "concurrently": "^7.0.0", diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 29d1f2615..7c665001e 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; +import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape'; chromium.use(stealthPlugin()); @@ -344,7 +345,9 @@ function formatRunResponse(run: any) { runByAPI: run.runByAPI, data: { textData: {}, - listData: {} + listData: {}, + markdown: '', + html: '' }, screenshots: [] as any[], }; @@ -359,6 +362,14 @@ function formatRunResponse(run: any) { formattedRun.data.listData = output.scrapeList; } + if (output.markdown && Array.isArray(output.markdown)) { + formattedRun.data.markdown = output.markdown[0]?.content || ''; + } + + if (output.html && Array.isArray(output.html)) { + formattedRun.data.html = output.html[0]?.content || ''; + } + if (run.binaryOutput) { Object.keys(run.binaryOutput).forEach(key => { if (run.binaryOutput[key]) { @@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr } } -async function readyForRunHandler(browserId: string, id: string, userId: string){ +async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){ try { - const result = await executeRun(id, userId); + const result = await executeRun(id, userId, requestedFormats); if (result && result.success) { logger.log('info', `Interpretation of ${id} succeeded`); @@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) { return copy; }; -async function executeRun(id: string, userId: string) { +async function executeRun(id: string, userId: string, requestedFormats?: string[]) { let browser: any = null; try { @@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) { }; } + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for API run ${id}`); + + let formats = recording.recording_meta.formats || ['markdown']; + + // Override if API request defines formats + if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) { + formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f)); + } + + await run.update({ + status: 'running', + log: `Converting page to: ${formats.join(', ')}` + }); + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + let markdown = ''; + let 
html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for API run ${id}`); + + // Push success socket event + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo + .of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', completionData); + } catch (socketError: any) { + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); + } + + // Build webhook payload + const webhookPayload: any = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + }, + }; + + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot API run ${plainRun.runId}` + ); + } catch (webhookError: any) { + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); + } + + capture("maxun-oss-run-created-api", { + runId: plainRun.runId, + user_id: userId, + status: "success", + robot_type: "scrape", + formats + }); + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return { + success: true, + interpretationInfo: run.toJSON() + }; + } catch (error: any) { + logger.log( + 'error', + `${formats.join(', ')} conversion failed for API run ${id}: ${error.message}` + ); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ')} conversion failed: ${error.message}`, + }); + + // Send failure socket event + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo + .of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', failureData); + } catch (socketError: any) { + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); + } + + capture("maxun-oss-run-created-api", { + runId: plainRun.runId, + user_id: userId, + status: "failed", + robot_type: "scrape", + formats + }); + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; browser = browserPool.getRemoteBrowser(plainRun.browserId); @@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) { } } -export async function handleRunRecording(id: string, userId: string) { +export async function 
handleRunRecording(id: string, userId: string, requestedFormats?: string[]) { try { const result = await createWorkflowAndStoreMetadata(id, userId); const { browserId, runId: newRunId } = result; @@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) { rejectUnauthorized: false }); - socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId)); + socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats)); logger.log('info', `Running Robot: ${id}`); @@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { if (!run) throw new Error('Run not found'); if (run.status === 'success') { - return run.toJSON(); + return run; } else if (run.status === 'failed') { throw new Error('Run failed'); } - // Wait for the next polling interval await new Promise(resolve => setTimeout(resolve, interval)); } } @@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { * type: string * required: true * description: The ID of the robot to run. + * requestBody: + * required: false + * content: + * application/json: + * schema: + * type: object + * properties: + * formats: + * type: array + * items: + * type: string + * enum: [markdown, html] + * description: Optional override formats for this run. + * example: + * formats: ["html"] * responses: * 200: * description: Robot run started successfully. @@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, if (!req.user) { return res.status(401).json({ ok: false, error: 'Unauthorized' }); } - const runId = await handleRunRecording(req.params.id, req.user.id); + + const requestedFormats = req.body.formats; + + const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats); if (!runId) { throw new Error('Run ID is undefined'); diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts new file mode 100644 index 000000000..eb4567f3b --- /dev/null +++ b/server/src/markdownify/markdown.ts @@ -0,0 +1,160 @@ +export async function parseMarkdown( + html: string | null | undefined, + baseUrl?: string | null +): Promise { + const TurndownService = require("turndown"); + const { gfm } = require("joplin-turndown-plugin-gfm"); + const cheerio = require("cheerio"); + const { URL } = require("url"); + + if (!html) return ""; + + const tidiedHtml = tidyHtml(html); + + const t = new TurndownService({ + headingStyle: "atx", // ensures #### instead of ------ + codeBlockStyle: "fenced", + }); + + // --------------------------------------------- + // Proper ATX headings #### instead of underline-style + // --------------------------------------------- + t.addRule("forceAtxHeadings", { + filter: ["h1", "h2", "h3", "h4", "h5", "h6"], + replacement: (content: string, node: any) => { + const level = Number(node.nodeName.charAt(1)); + const clean = content.trim(); + return `\n${"#".repeat(level)} ${clean}\n`; + }, + }); + + // --------------------------------------------- + // Remove SVGs + // --------------------------------------------- + t.addRule("truncate-svg", { + filter: "svg", + replacement: () => "", + }); + + // --------------------------------------------- + // Improved paragraph cleanup + // --------------------------------------------- + t.addRule("improved-paragraph", { + filter: "p", + replacement: (innerText: string) => { + const trimmed = innerText.trim(); + if (!trimmed) return ""; + return 
`${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
+    },
+  });
+
+  // ---------------------------------------------
+  // Inline link with fallback text
+  // ---------------------------------------------
+  t.addRule("inlineLink", {
+    filter: (node: any, opts: any) =>
+      node.nodeName === "A" && node.getAttribute("href"),
+
+    replacement: (content: string, node: any) => {
+      let text = content.trim();
+
+      // Fallback: aria-label → title → domain
+      if (!text) {
+        text =
+          node.getAttribute("aria-label")?.trim() ||
+          node.getAttribute("title")?.trim() ||
+          getDomainFromUrl(node.getAttribute("href")) ||
+          "link";
+      }
+
+      let href = node.getAttribute("href").trim();
+
+      // relative → absolute
+      if (baseUrl && isRelativeUrl(href)) {
+        try {
+          const u = new URL(href, baseUrl);
+          href = u.toString();
+        } catch { }
+      }
+
+      href = cleanUrl(href);
+
+      return `[${text}](${href})`;
+    },
+  });
+
+  t.use(gfm);
+
+  // Convert HTML → Markdown
+  try {
+    let out = await t.turndown(tidiedHtml);
+    out = fixBrokenLinks(out);
+    out = stripSkipLinks(out);
+    return out.trim();
+  } catch (err) {
+    console.error("HTML→Markdown failed", { err });
+    return "";
+  }
+}
+
+// -----------------------------------------------------
+// Helpers
+// -----------------------------------------------------
+function isRelativeUrl(url: string): boolean {
+  return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
+}
+
+function getDomainFromUrl(url: string): string | null {
+  try {
+    const u = new URL(url);
+    return u.hostname.replace("www.", "");
+  } catch {
+    return null;
+  }
+}
+
+function cleanUrl(u: string): string {
+  return u;
+}
+
+function cleanAttribute(attr: string) {
+  return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
+}
+
+function tidyHtml(html: string): string {
+  const cheerio = require("cheerio");
+  const $ = cheerio.load(html);
+
+  const manuallyCleanedElements = [
+    "script",
+    "style",
+    "iframe",
+    "noscript",
+    "meta",
+    "link",
+    "object",
+    "embed",
+    "canvas",
+    "audio",
+    "video",
+  ];
+
+  manuallyCleanedElements.forEach((tag) => $(tag).remove());
+  return $("body").html();
+}
+
+function fixBrokenLinks(md: string): string {
+  let depth = 0;
+  let result = "";
+
+  for (const ch of md) {
+    if (ch === "[") depth++;
+    if (ch === "]") depth = Math.max(0, depth - 1);
+    result += depth > 0 && ch === "\n" ? "\\\n" : ch;
+  }
+  return result;
+}
+
+function stripSkipLinks(md: string): string {
+  return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
+}
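// Reviewer aside, not part of the patch: a minimal usage sketch of parseMarkdown() above,
// assuming it is run from the server/src/markdownify directory with the new turndown,
// joplin-turndown-plugin-gfm, and cheerio dependencies installed. The sample HTML, base
// URL, and expected output are illustrative assumptions only.
import { parseMarkdown } from "./markdown";

(async () => {
  const html = `<h2>Quotes</h2><p>"Be yourself."</p><a href="/author/Einstein">Einstein</a>`;
  // Passing a baseUrl lets the inlineLink rule resolve the relative href to an absolute URL.
  const md = await parseMarkdown(html, "https://quotes.toscrape.com/");
  console.log(md);
  // Expected shape: "## Quotes", the quote paragraph, and
  // "[Einstein](https://quotes.toscrape.com/author/Einstein)".
})();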
diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
new file mode 100644
index 000000000..6821bfdb7
--- /dev/null
+++ b/server/src/markdownify/scrape.ts
@@ -0,0 +1,111 @@
+import { chromium } from "playwright";
+import { parseMarkdown } from "./markdown";
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean Markdown using parser.
+ */
+export async function convertPageToMarkdown(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  const page = await browser.newPage();
+
+  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+
+  await page.evaluate(() => {
+    const selectors = [
+      "script",
+      "style",
+      "link[rel='stylesheet']",
+      "noscript",
+      "meta",
+      "svg",
+      "img",
+      "picture",
+      "source",
+      "video",
+      "audio",
+      "iframe",
+      "object",
+      "embed"
+    ];
+
+    selectors.forEach(sel => {
+      document.querySelectorAll(sel).forEach(e => e.remove());
+    });
+
+    // Remove inline event handlers (onclick, onload…)
+    const all = document.querySelectorAll("*");
+    all.forEach(el => {
+      [...el.attributes].forEach(attr => {
+        if (attr.name.startsWith("on")) {
+          el.removeAttribute(attr.name);
+        }
+      });
+    });
+  });
+
+  // Re-extract HTML after cleanup
+  const cleanedHtml = await page.evaluate(() => {
+    return document.documentElement.outerHTML;
+  });
+
+  await browser.close();
+
+  // Convert cleaned HTML → Markdown
+  const markdown = await parseMarkdown(cleanedHtml, url);
+  return markdown;
+}
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean HTML.
+ */
+export async function convertPageToHTML(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  const page = await browser.newPage();
+
+  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+
+  await page.evaluate(() => {
+    const selectors = [
+      "script",
+      "style",
+      "link[rel='stylesheet']",
+      "noscript",
+      "meta",
+      "svg",
+      "img",
+      "picture",
+      "source",
+      "video",
+      "audio",
+      "iframe",
+      "object",
+      "embed"
+    ];
+
+    selectors.forEach(sel => {
+      document.querySelectorAll(sel).forEach(e => e.remove());
+    });
+
+    // Remove inline event handlers (onclick, onload…)
+    const all = document.querySelectorAll("*");
+    all.forEach(el => {
+      [...el.attributes].forEach(attr => {
+        if (attr.name.startsWith("on")) {
+          el.removeAttribute(attr.name);
+        }
+      });
+    });
+  });
+
+  // Re-extract HTML after cleanup
+  const cleanedHtml = await page.evaluate(() => {
+    return document.documentElement.outerHTML;
+  });
+
+  await browser.close();
+
+  // Return cleaned HTML directly
+  return cleanedHtml;
+}
diff --git a/server/src/markdownify/test.ts b/server/src/markdownify/test.ts
new file mode 100644
index 000000000..48db37dc2
--- /dev/null
+++ b/server/src/markdownify/test.ts
@@ -0,0 +1,6 @@
+import { convertPageToMarkdown } from "./scrape";
+
+(async () => {
+  const md = await convertPageToMarkdown("https://quotes.toscrape.com/");
+  console.log(md);
+})();
diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts
index eae9438ec..39218de24 100644
--- a/server/src/models/Robot.ts
+++ b/server/src/models/Robot.ts
@@ -9,6 +9,9 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
+  type?: 'extract' | 'scrape';
+  url?: string;
+  formats?: ('markdown' | 'html')[];
 }
 
 interface RobotWorkflow {
diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts
index b9f411008..f5d719b46 100644
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
 import { io as serverIo } from "./server";
 import { sendWebhook } from './routes/webhook';
 import { BinaryOutputService } from './storage/mino';
+import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
 
 if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST ||
!process.env.DB_PORT || !process.env.DB_NAME) { throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); @@ -183,11 +184,140 @@ async function processRunExecution(job: Job) { try { // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); - + if (!recording) { throw new Error(`Recording for run ${data.runId} not found`); } - + + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for run ${data.runId}`); + + const formats = recording.recording_meta.formats || ['markdown']; + + await run.update({ + status: 'running', + log: `Converting page to ${formats.join(', ')}` + }); + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } + + // Success update + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ').toUpperCase()} conversion completed successfully`, + serializableOutput, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for run ${data.runId}`); + + // Notify sockets + try { + const completionData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + // Webhooks + try { + const webhookPayload: any = { + runId: data.runId, + robotId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString(), + }; + + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`); + } + + capture("maxun-oss-run-created-manual", { + runId: data.runId, + user_id: data.userId, + status: "success", + robot_type: "scrape", + formats, + }); + + await destroyRemoteBrowser(browserId, data.userId); + + return { success: true }; + + } catch (error: any) { + logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new 
Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + capture("maxun-oss-run-created-manual", { + runId: data.runId, + user_id: data.userId, + status: "failed", + robot_type: "scrape", + formats, + }); + + await destroyRemoteBrowser(browserId, data.userId); + + throw error; + } + } + const isRunAborted = async (): Promise => { try { const currentRun = await Run.findOne({ where: { runId: data.runId } }); diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 89872d6ae..8451c7205 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } if (targetUrl) { + robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl }); + const updatedWorkflow = [...robot.recording.workflow]; + let foundGoto = false; for (let i = updatedWorkflow.length - 1; i >= 0; i--) { const step = updatedWorkflow[i]; @@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r robot.set('recording', { ...robot.recording, workflow: updatedWorkflow }); robot.changed('recording', true); + foundGoto = true; i = -1; break; } @@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } }; - if (name) { + if (name || targetUrl) { updates.recording_meta = { ...robot.recording_meta, - name + ...(name && { name }), + ...(targetUrl && { url: targetUrl }) }; } @@ -432,6 +437,91 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate } }); +/** + * POST endpoint for creating a markdown robot + */ +router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => { + try { + const { url, name, formats } = req.body; + + if (!url) { + return res.status(400).json({ error: 'The "url" field is required.' }); + } + + if (!req.user) { + return res.status(401).send({ error: 'Unauthorized' }); + } + + // Validate URL format + try { + new URL(url); + } catch (err) { + return res.status(400).json({ error: 'Invalid URL format' }); + } + + // Validate format + const validFormats = ['markdown', 'html']; + + if (!Array.isArray(formats) || formats.length === 0) { + return res.status(400).json({ error: 'At least one output format must be selected.' 
}); + } + + const invalid = formats.filter(f => !validFormats.includes(f)); + if (invalid.length > 0) { + return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` }); + } + + const robotName = name || `Markdown Robot - ${new URL(url).hostname}`; + const currentTimestamp = new Date().toLocaleString(); + const robotId = uuid(); + + const newRobot = await Robot.create({ + id: uuid(), + userId: req.user.id, + recording_meta: { + name: robotName, + id: robotId, + createdAt: currentTimestamp, + updatedAt: currentTimestamp, + pairs: 0, + params: [], + type: 'scrape', + url: url, + formats: formats, + }, + recording: { workflow: [] }, + google_sheet_email: null, + google_sheet_name: null, + google_sheet_id: null, + google_access_token: null, + google_refresh_token: null, + schedule: null, + }); + + logger.log('info', `Markdown robot created with id: ${newRobot.id}`); + capture( + 'maxun-oss-robot-created', + { + robot_meta: newRobot.recording_meta, + recording: newRobot.recording, + } + ) + + return res.status(201).json({ + message: 'Markdown robot created successfully.', + robot: newRobot, + }); + } catch (error) { + if (error instanceof Error) { + logger.log('error', `Error creating markdown robot: ${error.message}`); + return res.status(500).json({ error: error.message }); + } else { + logger.log('error', 'Unknown error creating markdown robot'); + return res.status(500).json({ error: 'An unknown error occurred.' }); + } + } +}); + /** * DELETE endpoint for deleting a recording from the storage. */ diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 899cb7f61..470cdacb3 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core"; import { Page } from "playwright"; import { sendWebhook } from "../../routes/webhook"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; +import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape"; chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { @@ -207,6 +208,172 @@ async function executeRun(id: string, userId: string) { } } + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for scheduled run ${id}`); + + const formats = recording.recording_meta.formats || ['markdown']; + + await run.update({ + status: 'running', + log: `Converting page to: ${formats.join(', ')}` + }); + + try { + const runStartedData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'running', + startedAt: plainRun.startedAt + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); + logger.log( + 'info', + `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}` + ); + } catch (socketError: any) { + logger.log( + 'warn', + `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}` + ); + } + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ 
content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for scheduled run ${id}`); + + // Run-completed socket notifications + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); + } + + // Webhook payload + const webhookPayload: any = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + } + }; + + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}` + ); + } catch (webhookError: any) { + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); + } + + capture("maxun-oss-run-created-scheduled", { + runId: plainRun.runId, + user_id: userId, + status: "success", + robot_type: "scrape", + formats + }); + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return true; + + } catch (error: any) { + logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `${formats.join(', ')} conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); + } + + capture("maxun-oss-run-created-scheduled", { + runId: plainRun.runId, + user_id: userId, + status: "failed", + robot_type: "scrape", + formats + }); + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; try { @@ -217,7 +384,7 @@ async function executeRun(id: string, userId: string) { status: 'running', startedAt: plainRun.startedAt }; - + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`); } catch (socketError: any) { diff --git 
a/src/api/storage.ts b/src/api/storage.ts index b5dc32ded..d2b28d5e2 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -28,6 +28,36 @@ export const getStoredRecordings = async (): Promise => { } }; +export const createScrapeRobot = async ( + url: string, + name?: string, + formats: string[] = ['markdown'] +): Promise => { + try { + const response = await axios.post( + `${apiUrl}/storage/recordings/scrape`, + { + url, + name, + formats, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create markdown robot'); + } + } catch (error: any) { + console.error('Error creating markdown robot:', error); + return null; + } +}; + export const updateRecording = async (id: string, data: { name?: string; limits?: Array<{pairIndex: number, actionIndex: number, argIndex: number, limit: number}>; diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx index f06270ed3..aed9ea749 100644 --- a/src/components/robot/RecordingsTable.tsx +++ b/src/components/robot/RecordingsTable.tsx @@ -121,6 +121,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])} handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])} handleDelete={() => handlers.handleDelete(row.id)} + robotType={row.type} /> ); @@ -742,9 +743,10 @@ interface OptionsButtonProps { handleEdit: () => void; handleDelete: () => void; handleDuplicate: () => void; + robotType: string; } -const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => { +const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => { const [anchorEl, setAnchorEl] = React.useState(null); const handleClick = (event: React.MouseEvent) => { @@ -771,34 +773,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat open={Boolean(anchorEl)} onClose={handleClose} > - { handleRetrain(); handleClose(); }}> - - - - {t('recordingtable.retrain')} - + {robotType !== 'scrape' && ( + { handleRetrain(); handleClose(); }}> + + + + Retrain + + )} { handleEdit(); handleClose(); }}> - - - - {t('recordingtable.edit')} + + Edit { handleDelete(); handleClose(); }}> - - - - {t('recordingtable.delete')} + + Delete - { handleDuplicate(); handleClose(); }}> - - - - {t('recordingtable.duplicate')} - + {robotType !== 'scrape' && ( + { handleDuplicate(); handleClose(); }}> + + Duplicate + + )} + ); }; diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 70058642c..486f3bff3 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -13,29 +13,65 @@ import { Card, CircularProgress, Container, - CardContent + CardContent, + Tabs, + Tab, + RadioGroup, + Radio, + FormControl, + FormLabel } from '@mui/material'; -import { ArrowBack, PlayCircleOutline, Article } from '@mui/icons-material'; +import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; +import { createScrapeRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { 
GenericModal } from '../../ui/GenericModal'; +interface TabPanelProps { + children?: React.ReactNode; + index: number; + value: number; +} + +function TabPanel(props: TabPanelProps) { + const { children, value, index, ...other } = props; + + return ( + + ); +} + const RobotCreate: React.FC = () => { const { t } = useTranslation(); const navigate = useNavigate(); - const { setBrowserId, setRecordingUrl, notify, setRecordingId } = useGlobalInfoStore(); + const { setBrowserId, setRecordingUrl, notify, setRecordingId, setRerenderRobots } = useGlobalInfoStore(); + const [tabValue, setTabValue] = useState(0); const [url, setUrl] = useState(''); + const [scrapeRobotName, setScrapeRobotName] = useState(''); const [needsLogin, setNeedsLogin] = useState(false); const [isLoading, setIsLoading] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false); const [activeBrowserId, setActiveBrowserId] = useState(''); + const [outputFormats, setOutputFormats] = useState([]); const { state } = React.useContext(AuthContext); const { user } = state; + const handleTabChange = (event: React.SyntheticEvent, newValue: number) => { + setTabValue(newValue); + }; + const handleStartRecording = async () => { if (!url.trim()) { @@ -146,155 +182,307 @@ const RobotCreate: React.FC = () => { - New Data Extraction Robot + Create New Robot - - - {/* Logo (kept as original) */} - Maxun Logo - - {/* Origin URL Input */} - - setUrl(e.target.value)} - /> - - - {/* Checkbox */} - - setNeedsLogin(e.target.checked)} - color="primary" - /> - } - label="This website needs logging in." - /> - - - {/* Button */} - - - - - + + + + + + - - - First time creating a robot? - - - Get help and learn how to use Maxun effectively. - - + + + + {/* Logo (kept as original) */} + Maxun Logo - {/* YouTube Tutorials */} - - + Extract structured data from websites in a few clicks. + + + {/* Origin URL Input */} + + setUrl(e.target.value)} + /> + + + {/* Checkbox */} + + setNeedsLogin(e.target.checked)} + color="primary" + /> + } + label="This website needs logging in." 
+ /> + + + {/* Button */} + + + + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index b02cecdeb..ac602f8e1 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,13 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 80671c1fb..53424bb2f 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -24,13 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { @@ -795,11 +791,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { navigate(basePath); }; - const lastPair = - robot?.recording.workflow[robot?.recording.workflow.length - 1]; - const targetUrl = lastPair?.what.find((action) => action.action === "goto") - ?.args?.[0]; - return ( { handleTargetUrlChange(e.target.value)} style={{ marginBottom: "20px" }} /> diff --git a/src/components/robot/pages/RobotIntegrationPage.tsx b/src/components/robot/pages/RobotIntegrationPage.tsx index 3c8425901..8905abe21 100644 --- a/src/components/robot/pages/RobotIntegrationPage.tsx +++ b/src/components/robot/pages/RobotIntegrationPage.tsx @@ -128,6 +128,8 @@ export const RobotIntegrationPage = ({ "googleSheets" | "airtable" | "webhook" | null >(integrationType); + const isScrapeRobot = recording?.recording_meta?.type === "scrape"; + const authenticateWithGoogle = () => { if (!recordingId) { console.error("Cannot authenticate: recordingId is null"); @@ -729,26 +731,61 @@ export const RobotIntegrationPage = ({ width: "100%", }} > - - + {!isScrapeRobot && ( + + )} + + {!isScrapeRobot && ( + + )} + + + + + + )} + + {hasHTML && ( + + }> + HTML + + + + + {htmlContent} + + + + + + + + + )} + + ) : ( + // Extract robot output + <> {row.status === 'running' || row.status === 'queued' ? ( <> @@ -939,6 +1043,8 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} + + )} diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index 69969a09c..973714b79 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,6 +27,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'extract' | 'scrape'; + url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow {