-
Notifications
You must be signed in to change notification settings - Fork 3.6k
/
html2markdown.ts
172 lines (136 loc) · 4.17 KB
/
html2markdown.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/**
* @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved.
* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
*/
/**
* @module markdown-gfm/html2markdown/html2markdown
*/
import Turndown from 'turndown';
// There are no types available for the 'turndown-plugin-gfm' module and it's not worth generating them on our own.
/* eslint-disable @typescript-eslint/ban-ts-comment */
// @ts-ignore
import { gfm } from 'turndown-plugin-gfm';
const autolinkRegex = /* #__PURE__ */ new RegExp(
// Prefix.
/\b(?:(?:https?|ftp):\/\/|www\.)/.source +
// Domain name.
/(?![-_])(?:[-_a-z0-9\u00a1-\uffff]{1,63}\.)+(?:[a-z\u00a1-\uffff]{2,63})/.source +
// The rest.
/(?:[^\s<>]*)/.source,
'gi'
);
class UpdatedTurndown extends Turndown {
public override escape( string: string ): string {
const originalEscape = super.escape;
function escape( string: string ): string {
string = originalEscape( string );
// Escape "<".
string = string.replace( /</g, '\\<' );
return string;
}
// Urls should not be escaped. Our strategy is using a regex to find them and escape everything
// which is out of the matches parts.
let escaped = '';
let lastLinkEnd = 0;
for ( const match of this._matchAutolink( string ) ) {
const index = match.index!;
// Append the substring between the last match and the current one (if anything).
if ( index > lastLinkEnd ) {
escaped += escape( string.substring( lastLinkEnd, index ) );
}
const matchedURL = match[ 0 ];
escaped += matchedURL;
lastLinkEnd = index + matchedURL.length;
}
// Add text after the last link or at the string start if no matches.
if ( lastLinkEnd < string.length ) {
escaped += escape( string.substring( lastLinkEnd, string.length ) );
}
return escaped;
}
/**
* Trimming end of link.
* https://github.github.com/gfm/#autolinks-extension-
*/
private* _matchAutolink( string: string ) {
for ( const match of string.matchAll( autolinkRegex ) ) {
const matched = match[ 0 ];
const length = this._autolinkFindEnd( matched );
yield Object.assign(
[ matched.substring( 0, length ) ],
{ index: match.index }
);
// We could adjust regex.lastIndex but it's not needed because what we skipped is for sure not a valid URL.
}
}
/**
* Returns the new length of the link (after it would trim trailing characters).
*/
private _autolinkFindEnd( string: string ) {
let length = string.length;
while ( length > 0 ) {
const char = string[ length - 1 ];
if ( '?!.,:*_~\'"'.includes( char ) ) {
length--;
} else if ( char == ')' ) {
let openBrackets = 0;
for ( let i = 0; i < length; i++ ) {
if ( string[ i ] == '(' ) {
openBrackets++;
} else if ( string[ i ] == ')' ) {
openBrackets--;
}
}
// If there is fewer opening brackets then closing ones we should remove a closing bracket.
if ( openBrackets < 0 ) {
length--;
} else {
break;
}
} else {
break;
}
}
return length;
}
}
/**
 * This is a helper class used by the {@link module:markdown-gfm/markdown Markdown feature} to convert HTML to Markdown.
 */
export class HtmlToMarkdown {
	/**
	 * The configured converter instance that performs the actual work.
	 */
	private _parser: UpdatedTurndown;

	constructor() {
		this._parser = this._createParser();
	}

	/**
	 * Converts the given HTML string to Markdown.
	 *
	 * @param html HTML markup to convert.
	 * @returns The converter output for `html`.
	 */
	public parse( html: string ): string {
		const markdown = this._parser.turndown( html );

		return markdown;
	}

	/**
	 * Forwards the given filter to the converter's `keep()` method.
	 *
	 * @param elements Filter passed through unchanged.
	 */
	public keep( elements: Turndown.Filter ): void {
		this._parser.keep( elements );
	}

	/**
	 * Builds the converter: fenced code blocks, `---` horizontal rules, ATX headings,
	 * plus the GFM plugin and the customized task list rule.
	 */
	private _createParser(): UpdatedTurndown {
		const turndownService = new UpdatedTurndown( {
			codeBlockStyle: 'fenced',
			hr: '---',
			headingStyle: 'atx'
		} );

		const plugins = [ gfm, this._todoList ];

		turndownService.use( plugins );

		return turndownService;
	}

	/**
	 * Plugin registering the `taskListItems` rule.
	 *
	 * This is a copy of the original taskListItems rule from turndown-plugin-gfm, with minor changes.
	 */
	private _todoList( turndown: UpdatedTurndown ): void {
		turndown.addRule( 'taskListItems', {
			filter( node: any ) {
				if ( node.type !== 'checkbox' ) {
					return false;
				}

				// Unlike the original rule, also accept a checkbox nested one level deeper inside the list item,
				// as CKEditor outputs a deeper structure.
				const parent = node.parentNode;

				return parent.nodeName === 'LI' || parent.parentNode.nodeName === 'LI';
			},

			replacement( content: any, node: any ) {
				const marker = node.checked ? '[x]' : '[ ]';

				return marker + ' ';
			}
		} );
	}
}