diff --git a/.changeset/fast-tips-act.md b/.changeset/fast-tips-act.md new file mode 100644 index 0000000..368ba25 --- /dev/null +++ b/.changeset/fast-tips-act.md @@ -0,0 +1,9 @@ +--- +"unicode-segmenter": minor +--- + +- grapheme: Use `String.prototype.codePointAt` +- grapheme: Optimize character length checking, which also reduces the bundle size a bit +- utils: Add `isBMP` and `isSMP` utils to check whether a code point is within the BMP (Basic Multilingual Plane) range or above it +- utils: Deprecate `takeCodePoint` and `takeChar` in favor of ES6 `String.prototype.codePointAt` and `String.fromCodePoint` +- utils: `takeChar` no longer depends on `String.fromCodePoint` internally diff --git a/README.md b/README.md index 8074312..2e349a1 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,12 @@ A lightweight and fast, pure JavaScript library for Unicode segmentation. ## Features -The library includes utilities to deal with: +`unicode-segmenter` includes utilities to deal with: - Emojis and pictographic [⤵](#export-unicode-segmenteremoji) - Extended grapheme clusters [⤵](#export-unicode-segmentergrapheme) - Non-Latin alphabets and numbers [⤵](#export-unicode-segmentergeneral) - UTF-8 characters and UTF-16 surrogates [⤵](#export-unicode-segmenterutils) -- Polyfill [`Intl.Segmenter`] API [⤵](#export-unicode-segmenterintl-adapter) +- [`Intl.Segmenter`] Polyfill [⤵](#export-unicode-segmenterintl-adapter) With no dependencies, so you can use it even in places where built-in Unicode libraries aren't available, such as old browsers, edge runtimes, and embedded environments. 
@@ -21,11 +21,17 @@ With no dependencies, so you can use it even in places where built-in Unicode li Unicode® 15.1.0 Standard [Annex \#29 Revision 43](https://www.unicode.org/reports/tr29/tr29-43.html) (2023-08-16) +## Compatibility + +`unicode-segmenter` relies on basic ES6+ features like [generators](https://caniuse.com/es6-generators), [modules](https://caniuse.com/es6-module) and [`String.prototype.codePointAt()`](https://caniuse.com/mdn-javascript_builtins_string_codepointat). + +Those are available in (not very) modern browsers as well as lightweight runtimes like [QuickJS](https://bellard.org/quickjs/). You can still use the library even in IE11 after transpiling/polyfilling them. + ## Usage ### Using TypeScript -No worry. Library is fully typed, and provides `*.d.ts` for you 😉 +No worries. The project is fully type-checked and provides `*.d.ts` for you 😉 ### Export `unicode-segmenter/emoji` [![](https://edge.bundlejs.com/badge?q=unicode-segmenter/emoji&treeshake=[*])](https://bundlejs.com/?q=unicode-segmenter%2Femoji&treeshake=%5B*%5D) @@ -188,7 +194,7 @@ ch = takeChar(str, cursor += ch.length); // => '😍' ## Benchmarks -This library aims to be lighter and faster than alternatives in the ecosystem. +`unicode-segmenter` aims to be lighter and faster than alternatives in the ecosystem while staying fully spec compliant. So the benchmark is tracking the performance, bundle size, and Unicode version compliance of several libraries. Look [benchmark](benchmark) to see how it works. @@ -315,7 +321,7 @@ It's \~2.5x worse than `RegExp` w/ `u` for match-all performance, but that's use | Name | Unicode® | ESM? 
| Size | Size (min) | Size (min+gzip) | Size (min+br) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:| -| `unicode-segmenter/grapheme` | 15.1.0 | ✔️ | 33,594 | 29,915 | 9,454 | 5,776 | +| `unicode-segmenter/grapheme` | 15.1.0 | ✔️ | 33,045 | 29,667 | 9,343 | 5,658 | | `graphemer` | 15.0.0 | ✖️ ️| 410,424 | 95,104 | 15,752 | 10,660 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,241 | 23,680 | 7,852 | 4,841 | | `unicode-segmentation`* | 15.0.0 | ✔️ | 51,251 | 51,251 | 22,545 | 16,614 | @@ -342,78 +348,78 @@ The gap may increase depending on the environment. Bindings for browsers general ----------------------------------------------------------------------------------- ----------------------------- • Lorem ipsum (ascii) ----------------------------------------------------------------------------------- ----------------------------- - unicode-segmenter 5'529 ns/iter (5'166 ns … 188 µs) 5'334 ns 5'917 ns 50'584 ns - Intl.Segmenter 50'420 ns/iter (46'584 ns … 564 µs) 49'916 ns 57'667 ns 317 µs - graphemer 49'188 ns/iter (46'833 ns … 395 µs) 47'959 ns 99'583 ns 221 µs - grapheme-splitter 74'743 ns/iter (73'292 ns … 229 µs) 73'917 ns 80'833 ns 192 µs - unicode-rs/unicode-segmentation (wasm-pack) 16'130 ns/iter (15'625 ns … 292 µs) 16'000 ns 17'083 ns 86'042 ns + unicode-segmenter 5'307 ns/iter (4'708 ns … 252 µs) 5'125 ns 6'250 ns 68'625 ns + Intl.Segmenter 51'373 ns/iter (47'000 ns … 467 µs) 50'709 ns 58'583 ns 397 µs + graphemer 49'735 ns/iter (46'416 ns … 1'739 µs) 47'042 ns 123 µs 342 µs + grapheme-splitter 74'459 ns/iter (73'292 ns … 211 µs) 73'834 ns 81'334 ns 169 µs + unicode-rs/unicode-segmentation (wasm-pack) 16'422 ns/iter (15'625 ns … 325 µs) 16'375 ns 19'416 ns 89'125 ns summary for Lorem ipsum (ascii) unicode-segmenter - 2.92x faster than unicode-rs/unicode-segmentation (wasm-pack) - 8.9x faster than graphemer - 9.12x faster than Intl.Segmenter - 13.52x faster than grapheme-splitter + 3.09x faster 
than unicode-rs/unicode-segmentation (wasm-pack) + 9.37x faster than graphemer + 9.68x faster than Intl.Segmenter + 14.03x faster than grapheme-splitter • Emojis ----------------------------------------------------------------------------------- ----------------------------- - unicode-segmenter 1'693 ns/iter (1'662 ns … 1'804 ns) 1'711 ns 1'787 ns 1'804 ns - Intl.Segmenter 14'306 ns/iter (12'209 ns … 1'277 µs) 13'500 ns 17'542 ns 776 µs - graphemer 13'101 ns/iter (12'542 ns … 596 µs) 12'875 ns 15'417 ns 109 µs - grapheme-splitter 27'259 ns/iter (26'583 ns … 508 µs) 26'875 ns 31'500 ns 59'917 ns - unicode-rs/unicode-segmentation (wasm-pack) 5'590 ns/iter (5'471 ns … 6'420 ns) 5'609 ns 6'361 ns 6'420 ns + unicode-segmenter 1'820 ns/iter (1'730 ns … 2'428 ns) 1'853 ns 2'228 ns 2'428 ns + Intl.Segmenter 14'743 ns/iter (12'166 ns … 2'454 µs) 13'875 ns 18'000 ns 39'834 ns + graphemer 13'406 ns/iter (12'625 ns … 1'243 µs) 13'292 ns 15'208 ns 117 µs + grapheme-splitter 27'827 ns/iter (26'625 ns … 513 µs) 27'709 ns 32'208 ns 82'958 ns + unicode-rs/unicode-segmentation (wasm-pack) 5'591 ns/iter (5'462 ns … 5'916 ns) 5'655 ns 5'845 ns 5'916 ns summary for Emojis unicode-segmenter - 3.3x faster than unicode-rs/unicode-segmentation (wasm-pack) - 7.74x faster than graphemer - 8.45x faster than Intl.Segmenter - 16.1x faster than grapheme-splitter + 3.07x faster than unicode-rs/unicode-segmentation (wasm-pack) + 7.37x faster than graphemer + 8.1x faster than Intl.Segmenter + 15.29x faster than grapheme-splitter • Demonic characters ----------------------------------------------------------------------------------- ----------------------------- - unicode-segmenter 1'742 ns/iter (1'686 ns … 1'874 ns) 1'767 ns 1'867 ns 1'874 ns - Intl.Segmenter 5'070 ns/iter (3'498 ns … 9'027 ns) 8'009 ns 8'877 ns 9'027 ns - graphemer 27'235 ns/iter (26'250 ns … 1'753 µs) 26'750 ns 29'416 ns 168 µs - grapheme-splitter 19'988 ns/iter (19'000 ns … 417 µs) 19'500 ns 24'000 ns 269 µs - 
unicode-rs/unicode-segmentation (wasm-pack) 2'518 ns/iter (2'440 ns … 2'734 ns) 2'550 ns 2'641 ns 2'734 ns + unicode-segmenter 1'789 ns/iter (1'728 ns … 1'945 ns) 1'812 ns 1'905 ns 1'945 ns + Intl.Segmenter 5'083 ns/iter (3'505 ns … 9'451 ns) 7'867 ns 9'238 ns 9'451 ns + graphemer 27'906 ns/iter (26'375 ns … 284 µs) 27'750 ns 30'917 ns 168 µs + grapheme-splitter 20'428 ns/iter (19'042 ns … 373 µs) 20'125 ns 23'833 ns 287 µs + unicode-rs/unicode-segmentation (wasm-pack) 2'513 ns/iter (2'426 ns … 2'728 ns) 2'542 ns 2'693 ns 2'728 ns summary for Demonic characters unicode-segmenter - 1.45x faster than unicode-rs/unicode-segmentation (wasm-pack) - 2.91x faster than Intl.Segmenter - 11.48x faster than grapheme-splitter - 15.64x faster than graphemer + 1.4x faster than unicode-rs/unicode-segmentation (wasm-pack) + 2.84x faster than Intl.Segmenter + 11.42x faster than grapheme-splitter + 15.6x faster than graphemer • Tweet text (combined) ----------------------------------------------------------------------------------- ----------------------------- - unicode-segmenter 8'791 ns/iter (8'125 ns … 268 µs) 8'750 ns 9'792 ns 122 µs - Intl.Segmenter 72'326 ns/iter (63'583 ns … 680 µs) 71'875 ns 111 µs 557 µs - graphemer 72'033 ns/iter (66'791 ns … 357 µs) 71'875 ns 110 µs 300 µs - grapheme-splitter 154 µs/iter (147 µs … 501 µs) 157 µs 194 µs 480 µs - unicode-rs/unicode-segmentation (wasm-pack) 24'444 ns/iter (23'666 ns … 355 µs) 24'083 ns 27'542 ns 179 µs + unicode-segmenter 8'170 ns/iter (7'666 ns … 370 µs) 8'042 ns 9'125 ns 110 µs + Intl.Segmenter 67'951 ns/iter (63'542 ns … 664 µs) 67'875 ns 73'667 ns 359 µs + graphemer 68'831 ns/iter (66'542 ns … 349 µs) 69'083 ns 78'500 ns 197 µs + grapheme-splitter 151 µs/iter (145 µs … 624 µs) 150 µs 183 µs 445 µs + unicode-rs/unicode-segmentation (wasm-pack) 24'231 ns/iter (23'625 ns … 252 µs) 24'000 ns 26'250 ns 133 µs summary for Tweet text (combined) unicode-segmenter - 2.78x faster than unicode-rs/unicode-segmentation (wasm-pack) - 
8.19x faster than graphemer - 8.23x faster than Intl.Segmenter - 17.55x faster than grapheme-splitter + 2.97x faster than unicode-rs/unicode-segmentation (wasm-pack) + 8.32x faster than Intl.Segmenter + 8.42x faster than graphemer + 18.43x faster than grapheme-splitter • Code snippet (combined) ----------------------------------------------------------------------------------- ----------------------------- - unicode-segmenter 20'311 ns/iter (19'541 ns … 220 µs) 19'958 ns 23'375 ns 129 µs - Intl.Segmenter 170 µs/iter (151 µs … 1'633 µs) 161 µs 513 µs 1'282 µs - graphemer 163 µs/iter (159 µs … 410 µs) 161 µs 286 µs 367 µs - grapheme-splitter 352 µs/iter (346 µs … 710 µs) 350 µs 463 µs 672 µs - unicode-rs/unicode-segmentation (wasm-pack) 57'798 ns/iter (56'083 ns … 326 µs) 57'208 ns 63'625 ns 203 µs + unicode-segmenter 19'604 ns/iter (18'291 ns … 239 µs) 19'375 ns 27'709 ns 139 µs + Intl.Segmenter 160 µs/iter (148 µs … 406 µs) 159 µs 309 µs 385 µs + graphemer 165 µs/iter (159 µs … 377 µs) 165 µs 267 µs 351 µs + grapheme-splitter 353 µs/iter (340 µs … 1'264 µs) 354 µs 541 µs 1'136 µs + unicode-rs/unicode-segmentation (wasm-pack) 58'236 ns/iter (55'958 ns … 905 µs) 58'333 ns 65'125 ns 199 µs summary for Code snippet (combined) unicode-segmenter - 2.85x faster than unicode-rs/unicode-segmentation (wasm-pack) - 8.04x faster than graphemer - 8.37x faster than Intl.Segmenter - 17.34x faster than grapheme-splitter + 2.97x faster than unicode-rs/unicode-segmentation (wasm-pack) + 8.16x faster than Intl.Segmenter + 8.39x faster than graphemer + 17.98x faster than grapheme-splitter ``` diff --git a/benchmark/performance-grapheme.js b/benchmark/performance-grapheme.js index 178b00b..a710dfd 100644 --- a/benchmark/performance-grapheme.js +++ b/benchmark/performance-grapheme.js @@ -6,7 +6,7 @@ import * as unicodeSegmentation from 'unicode-segmentation-wasm'; import { graphemeSegments } from '../src/grapheme.js'; -if (typeof self === 'object') { +if (globalThis.origin) { await 
unicodeSegmentation.default(); } diff --git a/src/grapheme.js b/src/grapheme.js index d5ae26b..5a72b29 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -14,7 +14,7 @@ // @ts-check import { bsearchRange } from './core.js'; -import { takeCodePoint } from './utils.js'; +import { isSMP } from './utils.js'; import { searchGraphemeCategory, GraphemeCategory, @@ -78,15 +78,18 @@ export function* graphemeSegments(input) { /** InCB=Consonant InCB=Linker x InCB=Consonant */ let incb = false; - let cp = takeCodePoint(input, cursor, len); - let ch = String.fromCodePoint(cp); + /** @type number */ + // @ts-ignore + let cp = input.codePointAt(cursor); let index = 0; let segment = ''; while (true) { - cursor += ch.length; - segment += ch; + segment += input[cursor++]; + if (isSMP(cp)) { + segment += input[cursor++]; + } catBefore = catAfter; if (catBefore === null) { @@ -100,8 +103,8 @@ export function* graphemeSegments(input) { } if (cursor < len) { - cp = takeCodePoint(input, cursor, len); - ch = String.fromCodePoint(cp); + // @ts-ignore + cp = input.codePointAt(cursor); catAfter = cat(cp, cache); } else { yield { segment, index, input, _cat: catBefore }; @@ -171,7 +174,7 @@ function cat(cp, cache) { // If this char isn't within the cached range, update the cache to the // range that includes it. 
if (cp < cache[0] || cp > cache[1]) { - let result = searchGraphemeCategory(cp); + let result = searchGraphemeCategory(cp); cache[0] = result[0]; cache[1] = result[1]; cache[2] = result[2]; diff --git a/src/utils.js b/src/utils.js index c0e3923..3f6dac3 100644 --- a/src/utils.js +++ b/src/utils.js @@ -3,6 +3,9 @@ /** * Take a Unicode code point from the given input by cursor * + * @deprecated + * Use this only if `String.prototype.codePointAt()` isn't available on the host environment + * * @param {string} input * @param {number} cursor * @param {number} [length] length of input @@ -24,25 +27,38 @@ export function takeCodePoint(input, cursor, length = input.length) { /** * Take a UTF-8 char from the given input by cursor * + * @deprecated + * Use this only if `String.fromCodePoint()` isn't available on the host environment + * * @param {string} input * @param {number} cursor * @param {number} [length] length of input * @return {string} a UTF-8 character (its `.length` will be 1 or 2) */ export function takeChar(input, cursor, length = input.length) { - let cp = takeCodePoint(input, cursor, length); - return String.fromCodePoint(cp); + let hi = input.charCodeAt(cursor); + if (isHighSurrogate(hi)) { + if (cursor + 1 < length) { + let lo = input.charCodeAt(cursor + 1); + if (isLowSurrogate(lo)) { + // The single-call form below seems to be much slower in V8: + // return String.fromCharCode(hi, lo); + return String.fromCharCode(hi) + String.fromCharCode(lo); + } + } + } + return String.fromCharCode(hi); } /** - * @param {number} c UTF-16 code + * @param {number} c UTF-16 code unit */ export function isHighSurrogate(c) { return 0xd800 <= c && c <= 0xdbff; } /** - * @param {number} c UTF-16 code + * @param {number} c UTF-16 code unit */ export function isLowSurrogate(c) { return 0xdc00 <= c && c <= 0xdfff; @@ -55,3 +71,23 @@ export function isLowSurrogate(c) { export function surrogatePairToCodePoint(hi, lo) { return ((hi - 0xd800) << 10) + (lo - 0xdc00) + 0x10000; } + +/** + * Check if 
given code point is within the BMP (Basic Multilingual Plane) + * + * @param {number} c Unicode code point + * @return {boolean} + */ +export function isBMP(c) { + return c <= 0xffff; +} + +/** + * Check if given code point is above the BMP, i.e. in any supplementary plane (not only the SMP, Supplementary Multilingual Plane) + * + * @param {number} c Unicode code point + * @return {boolean} + */ +export function isSMP(c) { + return 0xffff < c; +} diff --git a/test/utils.js b/test/utils.js index 4f2fe26..1f86abc 100644 --- a/test/utils.js +++ b/test/utils.js @@ -4,7 +4,12 @@ import { test } from 'node:test'; import * as assert from 'node:assert/strict'; import fc from 'fast-check'; -import { takeChar } from 'unicode-segmenter/utils'; +import { + takeChar, + takeCodePoint, + isBMP, + isSMP, +} from 'unicode-segmenter/utils'; fc.configureGlobal({ // Fix seed here for stable coverage report @@ -44,3 +49,54 @@ test('takeChar', async t => { }); }); +test('takeChar', async t => { + await t.test('ascii', () => { + fc.assert( + // @ts-ignore + fc.property(fc.ascii(), fc.fullUnicodeString(), (data, extra) => { + assert.equal(takeCodePoint(data + extra, 0), (data + extra).codePointAt(0)); + }), + ); + }); + + await t.test('char16bits', () => { + fc.assert( + // @ts-ignore + fc.property(fc.char16bits(), fc.fullUnicodeString(), (data, extra) => { + assert.equal(takeCodePoint(data + extra, 0), (data + extra).codePointAt(0)); + }), + ); + }); + + await t.test('utf-8 (3-bytes)', () => { + fc.assert( + fc.property( + // @ts-ignore + fc.integer({ min: 0xffff + 1, max: 0x10ffff }), fc.fullUnicodeString(), (data, extra) => { + let leading = String.fromCodePoint(data); + assert.equal(takeCodePoint(leading + extra, 0), (leading + extra).codePointAt(0)); + }, + ), + ); + }); +}); + +test('isBMP', () => { + fc.assert( + // @ts-ignore + fc.property(fc.fullUnicode(), (data) => { + // @ts-ignore + assert.equal(isBMP(data.codePointAt(0)) ? 
1 : 2, data.length); + }), + ); +}); + +test('isSMP', () => { + fc.assert( + // @ts-ignore + fc.property(fc.fullUnicode(), (data) => { + // @ts-ignore + assert.equal(isSMP(data.codePointAt(0)) ? 2 : 1, data.length); + }), + ); +});