/
index.js
executable file
·284 lines (246 loc) · 6.32 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env node
const pup = require('puppeteer');
const got = require('got');
const ora = require('ora');
const { JSDOM } = require('jsdom');
const nunjucks = require('nunjucks');
const tmp = require('tmp');
const fs = require('fs');
const css = require('css');
const slugify = require('slugify');
const Readability = require('./vendor/readability');
const pkg = require('./package.json');
const uuid = require('uuid/v1');
const spinner = ora();
const {
ampToHtml,
fixLazyLoadedImages,
imagesAtFullSize,
wikipediaSpecific,
noUselessHref,
relativeToAbsoluteURIs,
singleImgToFigure,
expandDetailsElements
} = require('./src/enhancements');
const get_style_attribute_value = require('./src/get-style-attribute-value');
/*
	Resolve a module / file path with `require.resolve`, looking it up
	relative to the current working directory first and the directory
	of this file second.
 */
const resolve = function(path) {
	const paths = [process.cwd(), __dirname];
	return require.resolve(path, { paths });
};
/*
	Apply every enhancement, one after the other, to the document
	held by `dom` (an object exposing `window.document`).
 */
const enhancePage = dom => {
	// Note: the order of the enhancements matters!
	const pipeline = [
		ampToHtml,
		fixLazyLoadedImages,
		relativeToAbsoluteURIs,
		imagesAtFullSize,
		singleImgToFigure,
		noUselessHref,
		expandDetailsElements,
		wikipediaSpecific
	];
	for (const applyEnhancement of pipeline) {
		applyEnhancement(dom.window.document);
	}
};
/*
	Parse `content` into a JSDOM instance whose document URL is `url`,
	and return that instance.
 */
function createDom({ url, content }) {
	const jsdom = new JSDOM(content, { url });
	const { body } = jsdom.window.document;
	// Force relative URL resolution
	body.setAttribute(null, null);
	return jsdom;
}
/*
Some setup
----------
*/
/*
	Configure the nunjucks templating engine:
	autoescaping is turned off and template caching is disabled.
 */
function configure() {
	const nunjucks_options = {
		autoescape: false,
		noCache: true
	};
	nunjucks.configure(nunjucks_options);
}
/*
Fetch a web page and clean the HTML
-----------------------------------
*/
/*
	Download the page at `url`, run the enhancements on it and pass the
	result through Readability.

	When the page advertises an AMP version (`link[rel=amphtml]`) and
	`options.amp` is truthy, the AMP URL is processed instead.

	Resolves to the Readability result extended with a generated `id`
	and the `url`. Any error is reported on the spinner and rethrown.
 */
async function cleanup(url, options) {
	try {
		spinner.start(`Fetching: ${url}`);

		/*
			Must ensure that the URL is properly encoded.
			See: https://github.com/danburzo/percollate/pull/83
		 */
		const encoded_url = encodeURI(decodeURI(url));
		const headers = {
			'user-agent': `percollate/${pkg.version}`
		};
		const { body: content } = await got(encoded_url, { headers });
		spinner.succeed();

		spinner.start('Enhancing web page');
		const dom = createDom({ url, content });

		const amp_link = dom.window.document.querySelector(
			'link[rel=amphtml]'
		);
		if (amp_link && options.amp) {
			spinner.succeed('Found AMP version');
			// Deliberately not awaited here: a failure of the nested call
			// is reported by that call alone.
			return cleanup(amp_link.href, options);
		}

		/*
			Run enhancements
			----------------
		 */
		enhancePage(dom);

		// Run through readability and return
		const classesToPreserve = [
			'no-href',

			/*
				Placed on some <a> elements
				as in-page anchors
			 */
			'anchor'
		];
		const reader = new Readability(dom.window.document, {
			classesToPreserve
		});
		const parsed = reader.parse();
		spinner.succeed();

		const id = `percollate-page-${uuid()}`;
		return { ...parsed, id, url };
	} catch (error) {
		spinner.fail(error.message);
		throw error;
	}
}
/*
Bundle the HTML files into a PDF
--------------------------------
*/
/*
	Build the HTML string handed to Puppeteer as a header / footer template.

	Looks up `selector` in the rendered document `doc` and parses the inner
	HTML of the match (or an empty <span> when nothing matches) as a document
	of its own. When `get_style_attribute_value` yields a value for the same
	selector in `css_ast`, that value is placed in the `style` attribute of
	the element matched by `body :first-child`, ahead of whatever inline
	style the element already carries.

	Returns the <body> markup of the template document.
 */
function renderPageTemplate(doc, css_ast, selector) {
	const template = doc.querySelector(selector);
	const template_doc = new JSDOM(
		template ? template.innerHTML : '<span></span>'
	).window.document;

	const template_style = get_style_attribute_value(css_ast, selector);
	const root = template_doc.querySelector('body :first-child');
	if (root && template_style) {
		const inline_style = root.getAttribute('style') || '';
		root.setAttribute('style', `\n${template_style};\n${inline_style}\n`);
	}
	return template_doc.body.innerHTML;
}

/*
	Render `items` through the HTML template, write the result to a
	temporary file and print that file to PDF with Puppeteer.

	items:   array of page objects (as produced by `cleanup`); the `title`
	         of the first one names the PDF when a single page is bundled
	         and no output path is given.
	options: reads `style`, `css`, `template`, `toc`, `sandbox`, `debug`
	         and `output`.

	The browser is always closed, including when opening the page or
	printing the PDF fails; the error is then propagated to the caller.
 */
async function bundle(items, options) {
	spinner.start('Generating temporary HTML file');
	const temp_file = tmp.tmpNameSync({ postfix: '.html' });

	const stylesheet = resolve(options.style || './templates/default.css');
	const style = fs.readFileSync(stylesheet, 'utf8') + (options.css || '');

	// A table of contents only makes sense for more than one page
	const use_toc = options.toc && items.length > 1;

	const html = nunjucks.renderString(
		fs.readFileSync(
			resolve(options.template || './templates/default.html'),
			'utf8'
		),
		{
			items,
			style,
			stylesheet, // deprecated
			options: {
				use_toc
			}
		}
	);

	const doc = new JSDOM(html).window.document;
	const css_ast = css.parse(style);
	const headerTemplate = renderPageTemplate(doc, css_ast, '.header-template');
	const footerTemplate = renderPageTemplate(doc, css_ast, '.footer-template');

	fs.writeFileSync(temp_file, html);
	spinner.succeed(`Temporary HTML file: file://${temp_file}`);

	/*
		When no output path is present,
		produce the file name from the web page title
		(if a single page was sent as argument),
		or a timestamped file (for the moment)
		in case we're bundling many web pages.
	 */
	const output_path =
		options.output ||
		(items.length === 1
			? `${slugify(items[0].title || 'Untitled page')}.pdf`
			: `percollate-${Date.now()}.pdf`);

	const browser = await pup.launch({
		headless: true,
		/*
			Allow running with no sandbox
			See: https://github.com/danburzo/percollate/issues/26
		 */
		args: options.sandbox
			? undefined
			: ['--no-sandbox', '--disable-setuid-sandbox'],
		defaultViewport: {
			// Emulate retina display (@2x)...
			deviceScaleFactor: 2,
			// ...but then we need to provide the other
			// viewport parameters as well
			width: 1920,
			height: 1080
		}
	});

	try {
		const page = await browser.newPage();

		/*
			Increase the navigation timeout to 2 minutes
			See: https://github.com/danburzo/percollate/issues/80
		 */
		page.setDefaultNavigationTimeout(120 * 1000);

		if (options.debug) {
			page.on('response', response => {
				spinner.succeed(`Fetched: ${response.url()}`);
			});
		}

		await page.goto(`file://${temp_file}`, { waitUntil: 'load' });

		await page.pdf({
			path: output_path,
			preferCSSPageSize: true,
			displayHeaderFooter: true,
			headerTemplate,
			footerTemplate,
			printBackground: true
		});
	} finally {
		// Never leave the browser process behind, even on failure
		await browser.close();
	}

	spinner.succeed(`Saved PDF: ${output_path}`);
}
/*
Generate PDF
*/
/*
	Clean up every URL in turn and produce PDF output:
	one PDF per page when `options.individual` is truthy,
	otherwise a single PDF bundling all the pages.
	Does nothing when `urls` is empty.
 */
async function pdf(urls, options) {
	if (!urls.length) {
		return;
	}

	const pages = [];
	for (const page_url of urls) {
		const page = await cleanup(page_url, options);
		if (!options.individual) {
			pages.push(page);
			continue;
		}
		await bundle([page], options);
	}

	if (options.individual) {
		return;
	}
	await bundle(pages, options);
}
/*
Generate EPUB
*/
/*
	Placeholder: EPUB output is not implemented.
	Only logs 'TODO' followed by the arguments it received.
 */
async function epub(urls, options) {
	const placeholder = ['TODO', urls, options];
	console.log(...placeholder);
}
/*
Generate HTML
*/
/*
	Placeholder: HTML output is not implemented.
	Only logs 'TODO' followed by the arguments it received.
 */
async function html(urls, options) {
	const placeholder = ['TODO', urls, options];
	console.log(...placeholder);
}
/*
	Public API of this module: the setup function
	and one generator function per output format.
 */
module.exports = { configure, pdf, epub, html };