Skip to content

Commit

Permalink
[major] update pdfjs-dist dep from 3.9.179 to 4.2.67
Browse files Browse the repository at this point in the history
Switch from CommonJS to ESM to use the new pdfjs-dist builds.
Switch from ts-node to tsx for running tests because ts-node still doesn't work with ESM.
Remove test coverage testing since it doesn't work with tsx.
Update .nvmrc to 22 because pdfjs-dist doesn't work below that.
Remove BinaryData export because pdfjs-dist no longer exports it. Fix the data input to match pdfjs-dist's new type.
Delete globalThis.pdfjsWorker after reading a PDF as pdfjs-dist doesn't clean it up itself.

Update all other deps as well.
  • Loading branch information
electrovir committed May 7, 2024
1 parent d95e232 commit ff61939
Show file tree
Hide file tree
Showing 13 changed files with 1,719 additions and 2,000 deletions.
2 changes: 1 addition & 1 deletion .nvmrc
Original file line number Diff line number Diff line change
@@ -1 +1 @@
20
22
File renamed without changes.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# PDF Text Reader

Dead simple PDF text reader.
Dead simple PDF text reader for Node.js. Uses Mozilla's [`pdfjs-dist`](https://www.npmjs.com/package/pdfjs-dist) package.

Requires ESM and Node.js v22 or greater. (These are requirements from Mozilla's `pdfjs-dist` package itself.)

# Install

Expand Down Expand Up @@ -43,8 +45,6 @@ See [the types](https://github.com/electrovir/pdf-text-reader/tree/master/src/re

# Details

This uses Mozilla's [`pdf.js`](https://github.com/mozilla/pdf.js/) package through its [`pdfjs-dist`](https://www.npmjs.com/package/pdfjs-dist) distribution on npm.

This package simply reads the output of `pdfjs.getDocument` and sorts it into lines based on text position in the document. It also inserts spaces for text on the same line that is far apart horizontally and new lines in between lines that are far apart vertically.

Example:
Expand Down
1 change: 1 addition & 0 deletions configs/mocha.config.js → configs/mocha.config.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ const baseOptions = require('virmator/base-configs/base-mocharc.js');
/** @type {import('mocha').MochaOptions} */
const mochaConfig = {
...baseOptions,
require: ['tsx'],
};

module.exports = mochaConfig;
5 changes: 0 additions & 5 deletions configs/ncu.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,6 @@ export const ncuConfig: RunOptions = {
// exclude these
reject: [
...baseNcuConfig.reject,
/**
* Different versions of this have global pollution issues we're currently on a version that
* doesn't.
*/
'pdfjs-dist',
],
// include only these
filter: [],
Expand Down
7 changes: 0 additions & 7 deletions configs/nyc.config.js

This file was deleted.

File renamed without changes.
3,608 changes: 1,668 additions & 1,940 deletions package-lock.json

Large diffs are not rendered by default.

64 changes: 31 additions & 33 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf-text-reader",
"version": "4.0.1",
"version": "5.0.0",
"description": "Dead simple pdf text reader",
"keywords": [
"pdf",
Expand All @@ -22,54 +22,52 @@
"name": "electrovir",
"url": "https://github.com/electrovir"
},
"type": "module",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"compile": "virmator compile",
"compile": "rm -rf dist && tsc --pretty",
"docs": "virmator docs",
"format": "virmator format",
"format": "prettier --color --cache --cache-strategy content \"./**/*.+(cjs|css|graphql|html|js|json|jsx|less|md|mjs|scss|toml|ts|tsx|yaml|yml)\"",
"publish": "virmator publish \"npm run compile && npm run test:all\"",
"spellcheck": "virmator spellcheck",
"test": "virmator test",
"test:all": "concurrently --colors --kill-others-on-fail -c auto --names types,tests,spelling,format,docs,deps \"npm run test:types\" \"npm run test:coverage\" \"npm run test:spelling\" \"npm run test:format\" \"npm run test:docs\" \"npm run test:deps\"",
"test:coverage": "npm run test coverage",
"test": "mocha --colors --config 'configs/mocha.config.cjs'",
"test:all": "concurrently --colors --kill-others-on-fail -c auto --names types,tests,spelling,format,docs,deps \"npm run test:types\" \"npm run test\" \"npm run test:spelling\" \"npm run test:format\" \"npm run test:docs\" \"npm run test:deps\"",
"test:deps": "virmator deps check",
"test:docs": "virmator docs check",
"test:format": "virmator format check",
"test:format": "npm run format -- --check",
"test:spelling": "virmator spellcheck",
"test:types": "tsc --noEmit",
"test:web": "virmator test-web"
"test:types": "tsc --noEmit"
},
"dependencies": {
"pdfjs-dist": "3.9.179"
"pdfjs-dist": "4.2.67"
},
"devDependencies": {
"@electrovir/nyc": "^15.1.0-fix0",
"@istanbuljs/nyc-config-typescript": "^1.0.2",
"@types/chai": "^4.3.10",
"@types/mocha": "^10.0.4",
"@types/node": "20.9.2",
"chai": "^4.3.10",
"cspell": "^8.0.0",
"dependency-cruiser": "^15.3.0",
"esbuild": "^0.19.6",
"istanbul-smart-text-reporter": "^1.1.2",
"markdown-code-example-inserter": "^0.3.2",
"mocha": "^10.2.0",
"@types/chai": "^4.3.16",
"@types/mocha": "^10.0.6",
"@types/node": "20.12.10",
"chai": "^5.1.0",
"cspell": "^8.8.0",
"dependency-cruiser": "^16.3.1",
"esbuild": "^0.21.0",
"istanbul-smart-text-reporter": "^1.1.4",
"markdown-code-example-inserter": "^1.0.0",
"mocha": "^10.4.0",
"mocha-spec-reporter-with-file-names": "^0.0.3",
"npm-check-updates": "~16.12.3",
"prettier": "^3.1.0",
"prettier-plugin-interpolated-html-tags": "^1.0.2",
"prettier-plugin-jsdoc": "^1.1.1",
"prettier-plugin-multiline-arrays": "^3.0.0",
"prettier": "^3.2.5",
"prettier-plugin-interpolated-html-tags": "^1.0.5",
"prettier-plugin-jsdoc": "^1.3.0",
"prettier-plugin-multiline-arrays": "^3.0.4",
"prettier-plugin-organize-imports": "^3.2.4",
"prettier-plugin-packagejson": "^2.4.6",
"prettier-plugin-sort-json": "^3.1.0",
"prettier-plugin-toml": "^1.0.0",
"ts-node": "^10.9.1",
"type-fest": "^4.8.1",
"typedoc": "^0.25.3",
"typescript": "^5.2.2",
"virmator": "^11.1.1"
"prettier-plugin-packagejson": "^2.5.0",
"prettier-plugin-sort-json": "^4.0.0",
"prettier-plugin-toml": "^2.0.1",
"tsx": "^4.9.3",
"type-fest": "^4.18.2",
"typedoc": "^0.25.13",
"typescript": "^5.4.5",
"virmator": "^11.5.2"
}
}
6 changes: 3 additions & 3 deletions src/read-pdf.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {assert} from 'chai';
import {existsSync} from 'fs';
import {join} from 'path';
import {ReadonlyDeep} from 'type-fest';
import {existsSync} from 'node:fs';
import {join} from 'node:path';
import type {ReadonlyDeep} from 'type-fest';
import {PdfProgressData, readPdfPages, readPdfText} from './read-pdf';
import {nodeModulesDir, sampleFilesDir} from './repo-paths.test-helper';

Expand Down
12 changes: 7 additions & 5 deletions src/read-pdf.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import {join} from 'path';
import {join} from 'node:path';
import {getDocument} from 'pdfjs-dist';
import type {
BinaryData,
DocumentInitParameters,
PDFPageProxy,
TextItem,
} from 'pdfjs-dist/types/src/display/api';
import {RequireExactlyOne} from 'type-fest';
import type {RequireExactlyOne} from 'type-fest';

export type {BinaryData, DocumentInitParameters} from 'pdfjs-dist/types/src/display/api';
export type {DocumentInitParameters} from 'pdfjs-dist/types/src/display/api';

/** A single page within a PDF file. */
export type PdfPage = {
Expand Down Expand Up @@ -55,7 +54,7 @@ export type ReadPdfTextParams = PartialWithUndefined<{
/** URL to the PDF. */
url: string;
/** PDF file data that has already been read from a PDF file. */
data: BinaryData;
data: DocumentInitParameters['data'];
/** All other options that the Mozilla `pdfjs-dist` package supports. */
allOptions: DocumentInitParameters;
}>;
Expand Down Expand Up @@ -98,6 +97,9 @@ export async function readPdfPages({
pages.push(await parsePage(page));
}

/** This is populated by the pdfjs-dist package. We're deleting it here to prevent memory leaks. */
delete (globalThis as any).pdfjsWorker;

return pages;
}

Expand Down
7 changes: 4 additions & 3 deletions src/repo-paths.test-helper.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import {dirname, join, resolve} from 'path';
import {dirname, join, resolve} from 'node:path';
import {fileURLToPath} from 'node:url';

/**
* Path to the repo's root. Does not use the package name because the source code could
* theoretically be cloned into any folder. "src" is used for the ts source code files (so they CAN
* be run directly without transpiling it into JS) and "dist" is used for the transpiled JS output
* directory.
*/
const repoRootDir = dirname(__dirname);
const repoRootDir = dirname(dirname(fileURLToPath(import.meta.url)));

export const sampleFilesDir = join(repoRootDir, 'test-files');
export const nodeModulesDir = resolve(__dirname, '..', 'node_modules', 'pdfjs-dist');
export const nodeModulesDir = resolve(repoRootDir, 'node_modules', 'pdfjs-dist');
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"compilerOptions": {
"module": "ES2022",
"outDir": "./dist",
"rootDir": "./src"
},
Expand Down

0 comments on commit ff61939

Please sign in to comment.