-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.js
94 lines (87 loc) · 2.57 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
const { PDFDocument, PDFName } = require("pdf-lib");
/**
 * Guesses the MIME type of binary data by inspecting its leading
 * "magic number" bytes.
 *
 * Recognised signatures: JPEG (FF D8), PNG (89 50 4E 47),
 * TIFF in little-endian ("II*\0") or big-endian ("MM\0*") byte order,
 * and GIF ("GIF").
 *
 * @param {ArrayLike<number>} imageBytes - Raw bytes (e.g. a Uint8Array).
 * @returns {string} The detected MIME type, or "application/octet-stream"
 *   when the input is missing, too short, or not recognised.
 */
function guessMimeType(imageBytes) {
  const UNKNOWN_TYPE = "application/octet-stream"; // Unknown or binary data

  // A missing buffer cannot match any signature; report it as unknown
  // instead of throwing on the index access below.
  if (imageBytes === null || imageBytes === undefined) {
    return UNKNOWN_TYPE;
  }

  // True when the data begins with exactly the given byte sequence.
  // Out-of-range indexes read as undefined, so short inputs never match.
  const startsWith = (...signature) =>
    signature.every((byte, index) => imageBytes[index] === byte);

  if (startsWith(0xff, 0xd8)) {
    return "image/jpeg"; // JPEG
  }
  if (startsWith(0x89, 0x50, 0x4e, 0x47)) {
    return "image/png"; // PNG
  }
  if (
    startsWith(0x49, 0x49, 0x2a, 0x00) || // TIFF, little-endian order ("II")
    startsWith(0x4d, 0x4d, 0x00, 0x2a) // TIFF, big-endian order ("MM")
  ) {
    return "image/tiff";
  }
  if (startsWith(0x47, 0x49, 0x46)) {
    return "image/gif"; // GIF
  }

  // Add more signatures above for other types.
  return UNKNOWN_TYPE;
}
/**
 * Collects the JPEG/PNG XObject streams referenced by the pages of a PDF.
 *
 * Every page's resource dictionary is scanned for XObjects. A stream is kept
 * only when its bytes start with a JPEG or PNG signature (as reported by
 * `guessMimeType`); anything else is skipped. Streams with identical bytes
 * are returned once, even when several pages reference them.
 *
 * Each returned entry carries an object URL created with
 * `URL.createObjectURL`; this function never revokes those URLs, so the
 * caller is responsible for calling `URL.revokeObjectURL` when done.
 *
 * @param {Object} options
 * @param {string|Blob} options.pdf - The URL to fetch when `fileType` is
 *   "url", or an object exposing `arrayBuffer()` when `fileType` is "blob".
 * @param {string} options.fileType - Either "url" or "blob".
 * @returns {Promise<Array<{blob: Blob, url: string, type: string, imageType: string}>|undefined>}
 *   The extracted images, or `undefined` when `fileType` is unsupported or
 *   when loading/parsing fails (the error is logged, not rethrown).
 */
const ExtractImages = async ({ pdf, fileType }) => {
  try {
    // Obtain the raw PDF bytes from the requested source.
    let arrayBuffer;
    if (fileType === "url") {
      const response = await fetch(pdf);
      arrayBuffer = await response.arrayBuffer();
    } else if (fileType === "blob") {
      arrayBuffer = await pdf.arrayBuffer();
    } else {
      return undefined;
    }

    // Load the PDF
    const pdfDoc = await PDFDocument.load(arrayBuffer, {
      ignoreEncryption: true,
    });

    // Stream objects already inspected. Pages frequently share the same
    // XObject, so this avoids re-reading and re-keying its bytes.
    const seenStreams = new Set();
    // Content keys of images already emitted; catches byte-identical
    // images that are stored as separate objects.
    const seenImages = new Set();
    const extractedImages = [];

    const pageCount = pdfDoc.getPageCount();
    for (let pageIndex = 0; pageIndex < pageCount; pageIndex++) {
      const resources = pdfDoc.getPage(pageIndex).node.Resources();
      if (!resources) {
        continue; // Page has no resource dictionary at all.
      }

      const xObjectEntry = resources.get(PDFName.of("XObject"));
      if (!xObjectEntry) {
        continue; // Page references no XObjects.
      }

      // The XObject entry may be an indirect reference rather than an inline
      // dictionary; lookup() resolves references and passes other values
      // through unchanged.
      const xObjects = pdfDoc.context.lookup(xObjectEntry);
      if (!xObjects || !xObjects.dict) {
        continue;
      }

      for (const ref of xObjects.dict.values()) {
        const pdfImage = pdfDoc.context.lookup(ref);

        // Skip dangling references and objects that cannot expose raw bytes.
        if (!pdfImage || typeof pdfImage.asUint8Array !== "function") {
          continue;
        }
        if (seenStreams.has(pdfImage)) {
          continue;
        }
        seenStreams.add(pdfImage);

        const imageBytes = pdfImage.asUint8Array();
        const mimeType = guessMimeType(imageBytes);

        // Only streams that already look like a JPEG or PNG file are usable
        // as-is. Checking this first avoids building a content key for data
        // that would be discarded anyway.
        // NOTE(review): presumably images stored in other encodings are
        // missed by this signature check - confirm against real documents.
        if (mimeType !== "image/png" && mimeType !== "image/jpeg") {
          continue;
        }

        const contentKey = imageBytes.join(",");
        if (seenImages.has(contentKey)) {
          continue;
        }
        seenImages.add(contentKey);

        const blob = new Blob([imageBytes], { type: mimeType });
        extractedImages.push({
          blob,
          url: URL.createObjectURL(blob),
          type: "image",
          imageType: mimeType,
        });
      }
    }

    return extractedImages;
  } catch (error) {
    // Best-effort API: report the failure and signal it with `undefined`.
    console.error("Error extracting images from PDF:", error);
    return undefined;
  }
};
// Public API of this module: only ExtractImages is exported;
// guessMimeType stays internal to this file.
module.exports = {
ExtractImages,
};