This repository has been archived by the owner on Jun 26, 2020. It is now read-only.
/
parse.js
116 lines (94 loc) · 4.17 KB
/
parse.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/**
* @license Copyright (c) 2003-2018, CKSource - Frederico Knabben. All rights reserved.
* For licensing, see LICENSE.md.
*/
/**
* @module paste-from-office/filters/parse
*/
/* globals DOMParser */
import DomConverter from '@ckeditor/ckeditor5-engine/src/view/domconverter';
import { NBSP_FILLER } from '@ckeditor/ckeditor5-engine/src/view/filler';
import { normalizeSpacing, normalizeSpacerunSpans } from './space';
/**
 * Parses the given HTML string and returns the contents of its `<body>` and `<style>` tags,
 * each both as a structure and as a plain string.
 *
 * @param {String} htmlString HTML string to be parsed.
 * @returns {Object} result
 * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Parsed body
 * content as a traversable structure.
 * @returns {String} result.bodyString Entire body content as a string.
 * @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, one per
 * `style` tag found in the source HTML.
 * @returns {String} result.stylesString Contents of all `style` tags joined, in order of occurrence, into one string.
 */
export function parseHtml( htmlString ) {
	const parser = new DOMParser();

	// Drop the opening marker of Word's `[if gte vml 1]` conditional comments; otherwise the parser
	// treats whatever they wrap as a comment and the content is lost.
	const withoutVmlConditionals = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );

	// Trim anything trailing `</body>` and normalize spacing before handing the markup to the native parser.
	const preparedHtml = normalizeSpacing( cleanContentAfterBody( withoutVmlConditionals ) );
	const parsedDocument = parser.parseFromString( preparedHtml, 'text/html' );

	normalizeSpacerunSpans( parsedDocument );

	// The string form has to be captured before the view conversion, because the conversion
	// moves the nodes out of `document.body`.
	const bodyString = parsedDocument.body.innerHTML;
	const body = documentToView( parsedDocument );
	const { styles, stylesString } = extractStyles( parsedDocument );

	return { body, bodyString, styles, stylesString };
}
// Converts a native `Document` into {@link module:engine/view/documentfragment~DocumentFragment}.
//
// All child nodes of `htmlDocument.body` are moved into a native document fragment first, so the body
// of the passed document is left empty afterwards.
//
// @param {Document} htmlDocument Native `Document` object to be transformed.
// @returns {module:engine/view/documentfragment~DocumentFragment}
function documentToView( htmlDocument ) {
	const converter = new DomConverter( { blockFiller: NBSP_FILLER } );
	const container = htmlDocument.createDocumentFragment();
	const body = htmlDocument.body;

	// `appendChild()` detaches the node from the body, so `firstChild` advances on its own
	// until there is nothing left to move.
	while ( body.firstChild ) {
		container.appendChild( body.firstChild );
	}

	return converter.domToView( container );
}
// Collects the `CSSStyleSheet` objects and the textual contents of the `style` elements found in `htmlDocument`.
// A `style` element is taken into account only if it has a sheet with at least one CSS rule.
//
// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
// @returns {Object} result
// @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, one per
// qualifying `style` tag of the source document.
// @returns {String} result.stylesString Contents of the qualifying `style` tags joined with a single space,
// in the order of occurrence.
function extractStyles( htmlDocument ) {
	const styleElements = Array.from( htmlDocument.getElementsByTagName( 'style' ) );

	// Skip elements without a sheet or whose sheet holds no rules.
	const usableElements = styleElements.filter( element => {
		return element.sheet && element.sheet.cssRules && element.sheet.cssRules.length;
	} );

	return {
		styles: usableElements.map( element => element.sheet ),
		stylesString: usableElements.map( element => element.innerHTML ).join( ' ' )
	};
}
// Removes leftover content from between closing </body> and closing </html> tag:
//
//		<html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
//
// This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
//
// Everything between the first `</body>` and the first `</html>` that follows it is dropped, including
// line breaks. When there is no `</html>` after `</body>`, everything after `</body>` is dropped.
// A string without `</body>` is returned unchanged.
//
// @param {String} htmlString The HTML string to be cleaned.
// @returns {String} The HTML string with leftover content removed.
function cleanContentAfterBody( htmlString ) {
	const bodyCloseTag = '</body>';
	const htmlCloseTag = '</html>';

	const bodyCloseIndex = htmlString.indexOf( bodyCloseTag );

	if ( bodyCloseIndex < 0 ) {
		return htmlString;
	}

	// Cut by position instead of replacing the leftover text by value: a leftover such as "b" also
	// occurs inside "</body>" itself, so a textual replace would damage the closing tag.
	const afterBodyIndex = bodyCloseIndex + bodyCloseTag.length;
	const htmlCloseIndex = htmlString.indexOf( htmlCloseTag, afterBodyIndex );
	const tail = htmlCloseIndex < 0 ? '' : htmlString.slice( htmlCloseIndex );

	return htmlString.slice( 0, afterBodyIndex ) + tail;
}