-
Notifications
You must be signed in to change notification settings - Fork 569
/
entities.ts
105 lines (87 loc) · 2.83 KB
/
entities.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.
/**
 * Object structure for a list of HTML entities.
 *
 * Each key is the entity text to look for (for example `"&amp;"`) and each
 * value is the text that entity is replaced with.
 */
export type EntityList = Record<string, string>;

/**
 * The characters `escape` rewrites, each paired with the entity it becomes.
 * Declared `as const` so the pairs stay read-only tuples.
 */
const rawToEntityEntries = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  ["'", "&#39;"],
] as const;

/**
 * Entity list used by `unescape` when the caller supplies none: the inverse
 * of `rawToEntityEntries` plus `&apos;` and `&nbsp;`.
 */
const defaultEntityList: EntityList = Object.fromEntries([
  ...rawToEntityEntries.map(([raw, entity]) => [entity, raw]),
  ["&apos;", "'"],
  ["&nbsp;", "\xa0"],
]);

/** Lookup from a raw character to the entity that replaces it. */
const rawToEntity = new Map<string, string>(rawToEntityEntries);

/** Global character-class pattern matching any one character that has an entry in `rawToEntity`. */
const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");
/**
 * Replaces the characters `&`, `<`, `>`, `"` and `'` with HTML entities so the
 * text can be interpolated into HTML text content or a quoted attribute value.
 *
 * @example
 * ```ts
 * import { escape } from "@std/html/entities";
 *
 * escape("<>'&AA"); // "&lt;&gt;&#39;&amp;AA"
 *
 * // Any other character is passed through unchanged,
 * // even when a named HTML entity exists for it.
 * escape("þð"); // "þð"
 * ```
 *
 * @param str The text to escape.
 * @returns `str` with every character matched by `rawRe` replaced by its entity.
 */
export function escape(str: string): string {
  // Every match of `rawRe` is a key of `rawToEntity`, so the lookup cannot miss.
  const toEntity = (rawChar: string): string => rawToEntity.get(rawChar)!;
  // `rawRe` is a global pattern, so `replace` rewrites all occurrences.
  return str.replace(rawRe, toEntity);
}
/** Options for {@linkcode unescape}. */
export type UnescapeOptions = {
  /** Entities to decode: entity text as the key, replacement text as the value. */
  entityList: EntityList;
};

/** Option values `unescape` uses for anything the caller leaves out. */
const defaultUnescapeOptions: UnescapeOptions = { entityList: defaultEntityList };
/** Largest Unicode code point (U+10FFFF). */
const MAX_CODE_POINT = 0x10ffff;

/** Decimal numeric character reference such as `&#65;`; group 1 captures the digits. */
const RX_DEC_ENTITY = /&#([0-9]+);/g;

/**
 * Hexadecimal numeric character reference such as `&#x41;`; group 1 captures
 * the hex digits. HTML allows the marker to be written as `x` or `X`, so both
 * are accepted.
 */
const RX_HEX_ENTITY = /&#[xX](\p{AHex}+);/gu;
/**
 * Compiled matcher for each entity list already seen by `unescape`, so the
 * pattern is built once per list object. Held in a `WeakMap` so caching does
 * not keep a list alive.
 */
const entityListRegexCache = new WeakMap<EntityList, RegExp>();

/**
 * Unescapes HTML entities in text.
 *
 * Named entities from the entity list and decimal/hexadecimal numeric
 * character references are decoded in one pass over the input, so text
 * produced by decoding one entity is never decoded a second time
 * (`"&amp;#39;"` becomes `"&#39;"`, not `"'"`).
 *
 * @example
 * ```ts
 * import { unescape } from "@std/html/entities";
 *
 * // Default options (only handles &<>'" and numeric entities)
 * unescape("&lt;&gt;&#39;&amp;AA"); // "<>'&AA"
 * unescape("&thorn;&eth;"); // "&thorn;&eth;"
 *
 * // Using the full named entity list from the HTML spec (~47K un-minified)
 * import entityList from "@std/html/named-entity-list.json" with { type: "json" };
 *
 * unescape("&thorn;&eth;", { entityList }); // "þð"
 * ```
 *
 * @param str The text to unescape.
 * @param options Optional settings; `entityList` replaces the default list of
 * named entities. Numeric references are decoded regardless of the list.
 * @returns `str` with every recognised entity replaced by the text it denotes.
 */
export function unescape(
  str: string,
  options: Partial<UnescapeOptions> = {},
): string {
  // `??` instead of object spread: an explicit `entityList: undefined` must
  // fall back to the default rather than override it.
  const entityList = options.entityList ?? defaultUnescapeOptions.entityList;

  let entityRe = entityListRegexCache.get(entityList);
  if (!entityRe) {
    const namedPatterns = Object.keys(entityList)
      // An empty key would match between every pair of characters.
      .filter((key) => key !== "")
      // Longest first, so a key that is a prefix of another cannot shadow it.
      .sort((a, b) => b.length - a.length)
      // Keys are literal text; neutralise any regex syntax characters.
      .map((key) => key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));

    // Named alternatives come first so an explicit list entry wins over the
    // generic numeric patterns. The named part has no capture groups, so
    // group 1 is always the decimal digits and group 2 the hex digits.
    entityRe = new RegExp(
      [...namedPatterns, RX_DEC_ENTITY.source, RX_HEX_ENTITY.source].join("|"),
      "gu",
    );
    entityListRegexCache.set(entityList, entityRe);
  }

  return str.replaceAll(
    entityRe,
    (match: string, dec: string | undefined, hex: string | undefined) => {
      if (dec !== undefined) return codePointStrToChar(dec, 10);
      if (hex !== undefined) return codePointStrToChar(hex, 16);
      // Neither numeric group took part, so `match` is one of the list's keys.
      return entityList[match]!;
    },
  );
}
/**
 * Converts the digits of a numeric character reference into a character.
 *
 * @param codePointStr The code point written as digits in base `radix`.
 * @param radix The base used to parse `codePointStr`.
 * @returns The character for the parsed code point, or the replacement
 * character `"�"` when the value is greater than `MAX_CODE_POINT`.
 */
function codePointStrToChar(codePointStr: string, radix: number) {
  const value = parseInt(codePointStr, radix);
  if (value > MAX_CODE_POINT) {
    return "�";
  }
  return String.fromCodePoint(value);
}