Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions jest.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export default {
testMatch: [
'**/tests/unit/**/*.test.js',
'**/tests/mock/**/*.test.js',
'**/tests/integration/**/*.test.js',
'**/tests/e2e/**/*.test.js'
],
setupFiles: ['./tests/jest.setup.mjs']
Expand Down
21 changes: 16 additions & 5 deletions src/browser.js
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,21 @@ async function createPlaywrightBrowser(options = {}) {
// Playwright uses chromium by default
const browser = await playwright.chromium.launch({ ...defaultOptions, ...options });

// Create a browser context to allow setting user agent and headers
const context = await browser.newContext();

return {
async newPage() {
const page = await browser.newPage();
return createPlaywrightPageAdapter(page);
const page = await context.newPage();
return createPlaywrightPageAdapter(page, context);
},
async close() {
await context.close();
await browser.close();
},
type: 'playwright',
_browser: browser
_browser: browser,
_context: context
};
}

Expand Down Expand Up @@ -127,15 +132,20 @@ function createPuppeteerPageAdapter(page) {
/**
* Create a page adapter for Playwright
* @param {Object} page - Playwright page object
* @param {Object} context - Playwright browser context
* @returns {PageAdapter}
*/
function createPlaywrightPageAdapter(page) {
function createPlaywrightPageAdapter(page, context) {
return {
async setExtraHTTPHeaders(headers) {
await page.setExtraHTTPHeaders(headers);
},
async setUserAgent(userAgent) {
await page.setUserAgent(userAgent);
// Playwright doesn't support setUserAgent on page after creation
// We need to create a new context with the user agent
// For now, we'll silently ignore or warn
// In a real implementation, we'd need to recreate the context
console.warn('Playwright: setUserAgent should be called before page creation. Ignoring.');
},
async setViewport(viewport) {
// Playwright uses setViewportSize instead of setViewport
Expand All @@ -159,6 +169,7 @@ function createPlaywrightPageAdapter(page) {
await page.close();
},
_page: page,
_context: context,
_type: 'playwright'
};
}
Expand Down
185 changes: 185 additions & 0 deletions tests/integration/habr-article.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import { createBrowser } from '../../src/browser.js';
import { convertHtmlToMarkdown } from '../../src/lib.js';

const HABR_ARTICLE_URL = 'https://habr.com/ru/articles/895896';

describe('Habr Article Download Tests', () => {
describe('Puppeteer Engine', () => {
  // User agent string applied to every page opened by this suite.
  const DESKTOP_CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
  // Per-test Jest timeout, in milliseconds.
  const PER_TEST_TIMEOUT_MS = 90000;

  // Factories return fresh objects so no literal is shared between calls.
  const buildHeaders = () => ({
    'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
    'Accept-Charset': 'utf-8'
  });
  const buildGotoOptions = () => ({ waitUntil: 'domcontentloaded', timeout: 60000 });

  let browser;

  /**
   * Opens a new page on the current browser and applies the request
   * headers and user agent used by every test in this suite.
   * @returns {Promise<Object>} the page adapter returned by browser.newPage()
   */
  async function openConfiguredPage() {
    const page = await browser.newPage();
    await page.setExtraHTTPHeaders(buildHeaders());
    await page.setUserAgent(DESKTOP_CHROME_UA);
    return page;
  }

  // Each test gets its own freshly launched browser.
  beforeEach(async () => {
    browser = await createBrowser('puppeteer');
  });

  // Nothing to close if the launch in beforeEach never produced a browser.
  afterEach(async () => {
    if (!browser) {
      return;
    }
    await browser.close();
  });

  it('can download Habr article as markdown', async () => {
    const page = await openConfiguredPage();

    await page.goto(HABR_ARTICLE_URL, buildGotoOptions());
    const html = await page.content();

    // The page must have returned a non-trivial amount of HTML.
    expect(html).toBeTruthy();
    expect(html.length).toBeGreaterThan(1000);

    const markdown = convertHtmlToMarkdown(html, HABR_ARTICLE_URL);

    // The conversion must have produced non-trivial output.
    expect(markdown).toBeTruthy();
    expect(markdown.length).toBeGreaterThan(100);

    // Loose check only: at least one character commonly used by markdown
    // syntax (heading, list, emphasis or link brackets) must be present.
    expect(markdown).toMatch(/[#\-*[\]]/);

    await page.close();
  }, PER_TEST_TIMEOUT_MS);

  it('can download Habr article as image screenshot', async () => {
    const page = await openConfiguredPage();
    await page.setViewport({ width: 1280, height: 800 });

    await page.goto(HABR_ARTICLE_URL, buildGotoOptions());

    // Fixed pause after DOMContentLoaded to let the content render.
    await new Promise((done) => {
      setTimeout(done, 2000);
    });

    const screenshot = await page.screenshot({ type: 'png' });

    expect(screenshot).toBeInstanceOf(Buffer);
    expect(screenshot.length).toBeGreaterThan(1000);

    // The first eight bytes must be the PNG file signature.
    const pngSignature = Buffer.from([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
    const leadingBytes = screenshot.slice(0, 8);
    expect(leadingBytes).toEqual(pngSignature);

    await page.close();
  }, PER_TEST_TIMEOUT_MS);
});

describe('Playwright Engine', () => {
  // Per-test Jest timeout, in milliseconds.
  const PER_TEST_TIMEOUT_MS = 90000;

  // Factories return fresh objects so no literal is shared between calls.
  const buildHeaders = () => ({
    'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
    'Accept-Charset': 'utf-8'
  });
  const buildGotoOptions = () => ({ waitUntil: 'domcontentloaded', timeout: 60000 });

  let browser;

  /**
   * Opens a new page on the current browser and applies the request
   * headers used by every test in this suite. No user agent is set here.
   * @returns {Promise<Object>} the page adapter returned by browser.newPage()
   */
  async function openConfiguredPage() {
    const page = await browser.newPage();
    await page.setExtraHTTPHeaders(buildHeaders());
    return page;
  }

  // Each test gets its own freshly launched browser.
  beforeEach(async () => {
    browser = await createBrowser('playwright');
  });

  // Nothing to close if the launch in beforeEach never produced a browser.
  afterEach(async () => {
    if (!browser) {
      return;
    }
    await browser.close();
  });

  it('can download Habr article as markdown', async () => {
    const page = await openConfiguredPage();

    await page.goto(HABR_ARTICLE_URL, buildGotoOptions());
    const html = await page.content();

    // The page must have returned a non-trivial amount of HTML.
    expect(html).toBeTruthy();
    expect(html.length).toBeGreaterThan(1000);

    const markdown = convertHtmlToMarkdown(html, HABR_ARTICLE_URL);

    // The conversion must have produced non-trivial output.
    expect(markdown).toBeTruthy();
    expect(markdown.length).toBeGreaterThan(100);

    // Loose check only: at least one character commonly used by markdown
    // syntax (heading, list, emphasis or link brackets) must be present.
    expect(markdown).toMatch(/[#\-*[\]]/);

    await page.close();
  }, PER_TEST_TIMEOUT_MS);

  it('can download Habr article as image screenshot', async () => {
    const page = await openConfiguredPage();
    await page.setViewport({ width: 1280, height: 800 });

    await page.goto(HABR_ARTICLE_URL, buildGotoOptions());

    // Fixed pause after DOMContentLoaded to let the content render.
    await new Promise((done) => {
      setTimeout(done, 2000);
    });

    const screenshot = await page.screenshot({ type: 'png' });

    expect(screenshot).toBeInstanceOf(Buffer);
    expect(screenshot.length).toBeGreaterThan(1000);

    // The first eight bytes must be the PNG file signature.
    const pngSignature = Buffer.from([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
    const leadingBytes = screenshot.slice(0, 8);
    expect(leadingBytes).toEqual(pngSignature);

    await page.close();
  }, PER_TEST_TIMEOUT_MS);
});

describe('Engine Comparison for Habr Article', () => {
  /**
   * Launches a Puppeteer and a Playwright browser, loads the same article in
   * each, and checks that both yield non-trivial HTML and markdown.
   *
   * Every browser that was launched is closed in a `finally` block, so a
   * failed launch, navigation timeout or assertion does not leave a browser
   * open.
   */
  it('both engines can successfully download the same Habr article', async () => {
    // Browsers are recorded as soon as they launch; if the second launch
    // throws, the first one is still closed below.
    const launchedBrowsers = [];

    try {
      const puppeteerBrowser = await createBrowser('puppeteer');
      launchedBrowsers.push(puppeteerBrowser);
      const playwrightBrowser = await createBrowser('playwright');
      launchedBrowsers.push(playwrightBrowser);

      const puppeteerPage = await puppeteerBrowser.newPage();
      const playwrightPage = await playwrightBrowser.newPage();

      // Set headers for both
      const headers = {
        'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
        'Accept-Charset': 'utf-8'
      };

      await puppeteerPage.setExtraHTTPHeaders(headers);
      await playwrightPage.setExtraHTTPHeaders(headers);
      // The user agent is set on the Puppeteer page only.
      await puppeteerPage.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

      await puppeteerPage.goto(HABR_ARTICLE_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });
      await playwrightPage.goto(HABR_ARTICLE_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

      const puppeteerHtml = await puppeteerPage.content();
      const playwrightHtml = await playwrightPage.content();

      // Both should successfully fetch content
      expect(puppeteerHtml.length).toBeGreaterThan(1000);
      expect(playwrightHtml.length).toBeGreaterThan(1000);

      // Convert both to markdown
      const puppeteerMarkdown = convertHtmlToMarkdown(puppeteerHtml, HABR_ARTICLE_URL);
      const playwrightMarkdown = convertHtmlToMarkdown(playwrightHtml, HABR_ARTICLE_URL);

      // Both should produce valid markdown
      expect(puppeteerMarkdown.length).toBeGreaterThan(100);
      expect(playwrightMarkdown.length).toBeGreaterThan(100);
    } finally {
      // allSettled attempts every close even if one of them rejects.
      const closeResults = await Promise.allSettled(
        launchedBrowsers.map((launched) => launched.close())
      );
      // A failed close still fails the test, as it did before.
      const failedClose = closeResults.find((result) => result.status === 'rejected');
      if (failedClose) {
        throw failedClose.reason;
      }
    }
  }, 120000);
});
});