-
-
Notifications
You must be signed in to change notification settings - Fork 3.7k
/
html2markdown.js
153 lines (121 loc) · 3.56 KB
/
html2markdown.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/**
* @license Copyright (c) 2003-2023, CKSource Holding sp. z o.o. All rights reserved.
* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
*/
/**
* @module markdown-gfm/html2markdown
*/
import TurndownService from 'turndown';
import { gfm } from 'turndown-plugin-gfm';
// Override the original escape method by not escaping links.
// Keep a reference to the stock Turndown escaping routine before it gets replaced on the prototype.
const originalEscape = TurndownService.prototype.escape;

/**
 * Escapes a piece of plain text: runs the stock Turndown escaping first and then
 * additionally prefixes every "<" character with a backslash.
 *
 * @param {String} string Text to escape.
 * @returns {String} The escaped text.
 */
function escape( string ) {
	return originalEscape( string ).replace( /</g, '\\<' );
}
/**
 * Replacement for the Turndown escaping routine that leaves URLs untouched.
 *
 * The text is scanned for autolinks; every autolink is copied verbatim while the text
 * before, between and after the autolinks goes through the regular escaping.
 *
 * @param {String} string Text to escape.
 * @returns {String} The text with everything except the detected links escaped.
 */
TurndownService.prototype.escape = function( string ) {
	const chunks = [];
	// Position right after the most recently copied link (0 when no link was seen yet).
	let cursor = 0;

	for ( const { 0: url, index: start } of matchAutolink( string ) ) {
		// Escape the plain text sitting between the previous link and this one (if any).
		if ( start > cursor ) {
			chunks.push( escape( string.substring( cursor, start ) ) );
		}

		// The link itself is emitted as is.
		chunks.push( url );
		cursor = start + url.length;
	}

	// Escape whatever follows the last link, or the entire text when no link was found.
	if ( cursor < string.length ) {
		chunks.push( escape( string.substring( cursor, string.length ) ) );
	}

	return chunks.join( '' );
};
// Options passed to the Turndown constructor.
const turndownOptions = {
	codeBlockStyle: 'fenced',
	hr: '---',
	headingStyle: 'atx'
};

// Plugins registered on the service: the GFM plugin and the local task list rule.
const turndownPlugins = [ gfm, todoList ];

// The Turndown instance used by this module for every conversion.
const turndownService = new TurndownService( turndownOptions );

turndownService.use( turndownPlugins );
/**
 * Converts HTML to Markdown using this module's Turndown service.
 *
 * @param {String} html The HTML to convert.
 * @returns {String} The result of the Turndown conversion.
 */
export default function html2markdown( html ) {
	const markdown = turndownService.turndown( html );

	return markdown;
}
export { turndownService };
// This is a copy of the original taskListItems rule from turndown-plugin-gfm, with minor changes.
/**
 * Turndown plugin registering the `taskListItems` rule, which turns checkboxes
 * placed in list items into `[x] ` / `[ ] ` markers.
 *
 * @param {Object} turndownService The service to register the rule on (its `addRule()` is called).
 */
function todoList( turndownService ) {
	// Tells whether the given (possibly missing) node is an <li> element.
	const isListItem = candidate => Boolean( candidate ) && candidate.nodeName === 'LI';

	turndownService.addRule( 'taskListItems', {
		filter( node ) {
			if ( node.type !== 'checkbox' ) {
				return false;
			}

			const parent = node.parentNode;

			// The checkbox may sit directly in the <li> or one level deeper, as CKEditor outputs
			// a deeper structure. Ancestors are checked defensively: the parent of the checkbox
			// may itself have no parent (e.g. when it is a detached root), in which case the
			// checkbox is simply not a task list item.
			return isListItem( parent ) || isListItem( parent && parent.parentNode );
		},

		replacement( content, node ) {
			// The marker is always followed by a single space.
			return node.checked ? '[x] ' : '[ ] ';
		}
	} );
}
// Autolink matcher. Assembled from three consecutive parts; matches globally and case-insensitively.
const regex = new RegExp(
	[
		// Prefix: a word boundary followed by "http://", "https://", "ftp://" or "www.".
		/\b(?:(?:https?|ftp):\/\/|www\.)/,

		// Domain name: must not begin with "-" or "_"; one or more dot-terminated labels
		// followed by a final label of 2-63 letters or non-ASCII characters.
		/(?![-_])(?:[-_a-z0-9\u00a1-\uffff]{1,63}\.)+(?:[a-z\u00a1-\uffff]{2,63})/,

		// The rest: everything up to the next whitespace, "<" or ">".
		/(?:[^\s<>]*)/
	].map( part => part.source ).join( '' ),
	'gi'
);
/**
 * Yields the autolinks found in the given text, with the end of each link trimmed.
 * See https://github.github.com/gfm/#autolinks-extension-
 *
 * Every yielded value is a one-element array holding the trimmed link text, with an
 * additional `index` property set to the position of the link in `string`.
 *
 * @param {String} string Text to scan.
 */
function* matchAutolink( string ) {
	for ( const rawMatch of string.matchAll( regex ) ) {
		const [ candidate ] = rawMatch;
		const trimmed = [ candidate.substring( 0, autolinkFindEnd( candidate ) ) ];

		trimmed.index = rawMatch.index;

		// The scan continues after the untrimmed match. Going back to the trimmed-off characters
		// is not needed because what was skipped is for sure not a valid URL.
		yield trimmed;
	}
}
/**
 * Returns the new length of the link (after it would trim trailing characters).
 *
 * Trailing `?`, `!`, `.`, `,`, `:`, `*`, `_`, `~`, `'` and `"` characters are always trimmed.
 * A trailing `)` is trimmed only while the remaining text contains more closing than opening brackets.
 *
 * @param {String} string The candidate link text.
 * @returns {Number} The number of leading characters of `string` that belong to the link.
 */
function autolinkFindEnd( string ) {
	// How many more ")" than "(" the text contains. It is counted once and kept up to date while
	// trimming, so a long run of trailing brackets does not cause the text to be rescanned each time.
	let surplusClosing = 0;

	for ( let i = 0; i < string.length; i++ ) {
		if ( string[ i ] === '(' ) {
			surplusClosing--;
		} else if ( string[ i ] === ')' ) {
			surplusClosing++;
		}
	}

	let length = string.length;

	while ( length > 0 ) {
		const char = string[ length - 1 ];

		if ( '?!.,:*_~\'"'.includes( char ) ) {
			// Trailing punctuation never contains brackets, so the surplus stays unchanged.
			length--;
		} else if ( char === ')' && surplusClosing > 0 ) {
			// If there are fewer opening brackets than closing ones we should remove a closing bracket.
			surplusClosing--;
			length--;
		} else {
			break;
		}
	}

	return length;
}