/
parse.ts
152 lines (122 loc) · 4.64 KB
/
parse.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/**
* @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved.
* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
*/
/**
* @module paste-from-office/filters/parse
*/
/* globals DOMParser */
import {
DomConverter,
ViewDocument,
type StylesProcessor,
type ViewDocumentFragment
} from 'ckeditor5/src/engine.js';
import { normalizeSpacing, normalizeSpacerunSpans } from './space.js';
/**
 * Parses an HTML string (e.g. clipboard data coming from an office application) and returns the contents
 * of its `<body>` together with the stylesheets found in its `<style>` elements.
 *
 * @param htmlString HTML string to be parsed.
 * @param stylesProcessor Styles processor forwarded to the body-to-view conversion.
 * @returns The body as a view document fragment and as a string, plus the extracted stylesheets.
 */
export function parseHtml( htmlString: string, stylesProcessor: StylesProcessor ): ParseHtmlResult {
	const sanitizedHtml = htmlString
		// Strip the opening marker of Word-specific "if comments" so that the content they wrap
		// is not omitted by the parser.
		.replace( /<!--\[if gte vml 1]>/g, '' )
		// Strip MS Windows specific `<o:SmartTagType>` tags (with optional attributes, with or without values).
		// See https://github.com/ckeditor/ckeditor5/issues/15333.
		.replace( /<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, '' );

	// Drop whatever trails the closing `</body>` tag, run the spacing normalization and only then
	// hand the markup to the native parser.
	const htmlDocument = new DOMParser().parseFromString(
		normalizeSpacing( cleanContentAfterBody( sanitizedHtml ) ),
		'text/html'
	);

	normalizeSpacerunSpans( htmlDocument );

	// The string snapshot has to be taken first: the conversion to view below moves all nodes out of `<body>`.
	const bodyString = htmlDocument.body.innerHTML;
	const body = documentToView( htmlDocument, stylesProcessor );
	const { styles, stylesString } = extractStyles( htmlDocument );

	return { body, bodyString, styles, stylesString };
}
/**
* The result of {@link ~parseHtml}.
*/
export interface ParseHtmlResult {
/**
* Parsed `<body>` content as a traversable view document fragment. Comments are skipped during the conversion.
*/
body: ViewDocumentFragment;
/**
* Entire `<body>` content as an HTML string (its `innerHTML`), captured before the body is transformed to the view.
*/
bodyString: string;
/**
* Array of native `CSSStyleSheet` objects, each representing a separate `style` tag from the source HTML.
* Only `style` tags whose stylesheet contains at least one CSS rule are included.
*/
styles: Array<CSSStyleSheet>;
/**
* Contents of all the `style` tags listed in `styles`, combined in the order of occurrence into one string
* (separated by a single space).
*/
stylesString: string;
}
/**
 * Converts the `<body>` content of a native `Document` into a
 * {@link module:engine/view/documentfragment~DocumentFragment view document fragment}. Comments are skipped.
 *
 * Note that this empties the `<body>` of the passed document: all of its child nodes are moved
 * into a temporary native document fragment before the conversion.
 *
 * @param htmlDocument Native `Document` object to be transformed.
 * @param stylesProcessor Styles processor used to create the view document for the conversion.
 */
function documentToView( htmlDocument: Document, stylesProcessor: StylesProcessor ) {
	const converter = new DomConverter( new ViewDocument( stylesProcessor ), { renderingMode: 'data' } );
	const container = htmlDocument.createDocumentFragment();
	const { body } = htmlDocument;

	// Appending a node to the fragment detaches it from `<body>`, so the next node becomes
	// the first child and the loop ends once the body is empty.
	for ( let child = body.firstChild; child; child = body.firstChild ) {
		container.appendChild( child );
	}

	return converter.domToView( container, { skipComments: true } ) as ViewDocumentFragment;
}
/**
 * Collects the stylesheets of all `style` elements found in the provided `htmlDocument`, both as native
 * `CSSStyleSheet` objects and as one combined string.
 *
 * Elements without a stylesheet, or whose stylesheet has no CSS rules, are ignored.
 *
 * @param htmlDocument Native `Document` object from which styles will be extracted.
 * @returns The collected stylesheets and their `innerHTML` contents joined with a single space.
 */
function extractStyles( htmlDocument: Document ): { styles: Array<CSSStyleSheet>; stylesString: string } {
	const sheets: Array<CSSStyleSheet> = [];
	const cssTexts: Array<string> = [];

	Array.from( htmlDocument.getElementsByTagName( 'style' ) ).forEach( styleElement => {
		const sheet = styleElement.sheet;

		// Nothing to collect when there is no stylesheet or it does not contain any rules.
		if ( !sheet || !sheet.cssRules || !sheet.cssRules.length ) {
			return;
		}

		sheets.push( sheet );
		cssTexts.push( styleElement.innerHTML );
	} );

	return {
		styles: sheets,
		stylesString: cssTexts.join( ' ' )
	};
}
/**
 * Strips the leftover content located between the closing `</body>` tag and the closing `</html>` tag:
 *
 * ```html
 * <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
 * ```
 *
 * This is needed because specific browsers (Edge) add some random content after the `body` tag
 * when pasting from Word.
 *
 * The string is returned untouched when it contains no `</body>` tag. When no `</html>` tag follows
 * the first `</body>`, everything after that `</body>` is dropped.
 *
 * @param htmlString The HTML string to be cleaned.
 * @returns The HTML string with leftover content removed.
 */
function cleanContentAfterBody( htmlString: string ) {
	const BODY_END = '</body>';
	const HTML_END = '</html>';

	const bodyEndStart = htmlString.indexOf( BODY_END );

	if ( bodyEndStart === -1 ) {
		return htmlString;
	}

	// Everything up to and including the first `</body>` is always preserved.
	const keepUntil = bodyEndStart + BODY_END.length;
	const kept = htmlString.slice( 0, keepUntil );

	const htmlEndStart = htmlString.indexOf( HTML_END, keepUntil );

	if ( htmlEndStart === -1 ) {
		return kept;
	}

	// Re-attach the closing `</html>` tag together with whatever follows it.
	return kept + htmlString.slice( htmlEndStart );
}