Skip to content

Commit

Permalink
add html utf8 bom signature (#268)
Browse files Browse the repository at this point in the history
* detect HTML with utf8 BOM signature; closes #267
* make BOM have priority over HTML meta in charset detection: #268 (comment) 

Co-authored-by: Gabriel Vasile <gabriel.vasile@email.com>
  • Loading branch information
napalu and gabriel-vasile committed Apr 17, 2022
1 parent 7322b53 commit 6e3aeb1
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 80 deletions.
9 changes: 7 additions & 2 deletions internal/charset/charset.go
Expand Up @@ -157,9 +157,14 @@ func fromXML(content []byte) string {
return strings.ToLower(xmlEncoding(string(t.Inst)))
}

// FromHTML returns the charset of an HTML document. It relies on the meta tag
// <meta charset="UTF-8"> and falls back on the plain text content.
// FromHTML returns the charset of an HTML document. It first checks whether a
// BOM is present and, if so, uses it to determine the charset. If no BOM is
// present, it relies on the meta tag <meta charset="UTF-8"> and falls back on
// the plain text content.
func FromHTML(content []byte) string {
if cset := FromBOM(content); cset != "" {
return cset
}
if cset := fromHTML(content); cset != "" {
return cset
}
Expand Down
30 changes: 30 additions & 0 deletions internal/charset/charset_test.go
Expand Up @@ -24,6 +24,29 @@ const htmlDoc = `<!DOCTYPE html>
<div class="container footer">さ</div>
</body>
</html>`
// htmlDocWithIncorrectCharset is an HTML document whose meta tag declares the
// ISO-8859-16 charset. TestFromHTMLWithBOM prepends a UTF-8 BOM to it and
// expects "utf-8", i.e. the BOM must take priority over the meta tag.
const htmlDocWithIncorrectCharset = `<!DOCTYPE html>
<!--
Some comment
-->
<html dir="ltr" mozdisallowselectionprint>
<head>
<meta charset="ISO-8859-16">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="some name" content="notranslate">
<title>test</title>
<link rel="stylesheet" href="html.utf8bom.css">
</head>
<body tabindex="1">
<div id="printContainer"></div>
</body>
</html>`

func TestFromXML(t *testing.T) {
charset := FromXML([]byte(xmlDoc))
Expand All @@ -39,6 +62,13 @@ func TestFromHTML(t *testing.T) {
}
}

func TestFromHTMLWithBOM(t *testing.T) {
charset := FromHTML(append([]byte{0xEF, 0xBB, 0xBF}, []byte(htmlDocWithIncorrectCharset)...))
if charset != "utf-8" {
t.Errorf("expected: utf-8; got: %s", charset)
}
}

func TestFromPlain(t *testing.T) {
tcases := []struct {
raw []byte
Expand Down
9 changes: 8 additions & 1 deletion internal/magic/magic.go
Expand Up @@ -104,7 +104,14 @@ func xmlCheck(sig xmlSig, raw []byte) bool {
// matches the raw input.
func markup(sigs ...[]byte) Detector {
return func(raw []byte, limit uint32) bool {
raw = trimLWS(raw)
if bytes.HasPrefix(raw, []byte{0xEF, 0xBB, 0xBF}) {
// We skip the UTF-8 BOM if present to ensure we correctly
// process any leading whitespace. The presence of the BOM
// is taken into account during charset detection in charset.go.
raw = trimLWS(raw[3:])
} else {
raw = trimLWS(raw)
}
if len(raw) == 0 {
return false
}
Expand Down
157 changes: 80 additions & 77 deletions mimetype_test.go
Expand Up @@ -18,83 +18,86 @@ const testDataDir = "testdata"

// test files sorted by the file name in alphabetical order.
var files = map[string]string{
"3g2.3g2": "video/3gpp2",
"3gp.3gp": "video/3gpp",
"3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml",
"7z.7z": "application/x-7z-compressed",
"a.a": "application/x-archive",
"aac.aac": "audio/aac",
"aaf.aaf": "application/octet-stream",
"accdb.accdb": "application/x-msaccess",
"aiff.aiff": "audio/aiff",
"amf.amf": "application/x-amf",
"amr.amr": "audio/amr",
"ape.ape": "audio/ape",
"apng.png": "image/vnd.mozilla.apng",
"asf.asf": "video/x-ms-asf",
"atom.atom": "application/atom+xml",
"au.au": "audio/basic",
"avi.avi": "video/x-msvideo",
"avif.avif": "image/avif",
"avifsequence.avif": "image/avif",
"bmp.bmp": "image/bmp",
"bpg.bpg": "image/bpg",
"bz2.bz2": "application/x-bzip2",
"cab.cab": "application/vnd.ms-cab-compressed",
"class.class": "application/x-java-applet",
"crx.crx": "application/x-chrome-extension",
"csv.csv": "text/csv",
"cpio.cpio": "application/x-cpio",
"dae.dae": "model/vnd.collada+xml",
"dbf.dbf": "application/x-dbf",
"dcm.dcm": "application/dicom",
"deb.deb": "application/vnd.debian.binary-package",
"djvu.djvu": "image/vnd.djvu",
"doc.doc": "application/msword",
"docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"drpm.rpm": "application/x-rpm",
"dwg.1.dwg": "image/vnd.dwg",
"dwg.dwg": "image/vnd.dwg",
"eot.eot": "application/vnd.ms-fontobject",
"epub.epub": "application/epub+zip",
"exe.exe": "application/vnd.microsoft.portable-executable",
"fdf.fdf": "application/vnd.fdf",
"fits.fits": "application/fits",
"flac.flac": "audio/flac",
"flv.flv": "video/x-flv",
"gbr.gbr": "image/x-gimp-gbr",
"geojson.1.geojson": "application/geo+json",
"geojson.geojson": "application/geo+json",
"gif.gif": "image/gif",
"glb.glb": "model/gltf-binary",
"gml.gml": "application/gml+xml",
"gpx.gpx": "application/gpx+xml",
"gz.gz": "application/gzip",
"har.har": "application/json",
"hdr.hdr": "image/vnd.radiance",
"heic.single.heic": "image/heic",
"heif.heif": "image/heif",
"html.html": "text/html; charset=utf-8",
"html.iso88591.html": "text/html; charset=iso-8859-1",
"html.svg.html": "text/html; charset=utf-8",
"html.usascii.html": "text/html; charset=us-ascii",
"html.utf8.html": "text/html; charset=utf-8",
"html.withbr.html": "text/html; charset=utf-8",
"ico.ico": "image/x-icon",
"ics.dos.ics": "text/calendar",
"ics.ics": "text/calendar",
"iso88591.txt": "text/plain; charset=iso-8859-1",
"jar.jar": "application/jar",
"jp2.jp2": "image/jp2",
"jpf.jpf": "image/jpx",
"jpg.jpg": "image/jpeg",
"jpm.jpm": "image/jpm",
"jxl.jxl": "image/jxl",
"xpm.xpm": "image/x-xpixmap",
"js.js": "application/javascript",
"json.json": "application/json",
"json.lowascii.json": "application/json",
"3g2.3g2": "video/3gpp2",
"3gp.3gp": "video/3gpp",
"3mf.3mf": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml",
"7z.7z": "application/x-7z-compressed",
"a.a": "application/x-archive",
"aac.aac": "audio/aac",
"aaf.aaf": "application/octet-stream",
"accdb.accdb": "application/x-msaccess",
"aiff.aiff": "audio/aiff",
"amf.amf": "application/x-amf",
"amr.amr": "audio/amr",
"ape.ape": "audio/ape",
"apng.png": "image/vnd.mozilla.apng",
"asf.asf": "video/x-ms-asf",
"atom.atom": "application/atom+xml",
"au.au": "audio/basic",
"avi.avi": "video/x-msvideo",
"avif.avif": "image/avif",
"avifsequence.avif": "image/avif",
"bmp.bmp": "image/bmp",
"bpg.bpg": "image/bpg",
"bz2.bz2": "application/x-bzip2",
"cab.cab": "application/vnd.ms-cab-compressed",
"class.class": "application/x-java-applet",
"crx.crx": "application/x-chrome-extension",
"csv.csv": "text/csv",
"cpio.cpio": "application/x-cpio",
"dae.dae": "model/vnd.collada+xml",
"dbf.dbf": "application/x-dbf",
"dcm.dcm": "application/dicom",
"deb.deb": "application/vnd.debian.binary-package",
"djvu.djvu": "image/vnd.djvu",
"doc.doc": "application/msword",
"docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"drpm.rpm": "application/x-rpm",
"dwg.1.dwg": "image/vnd.dwg",
"dwg.dwg": "image/vnd.dwg",
"eot.eot": "application/vnd.ms-fontobject",
"epub.epub": "application/epub+zip",
"exe.exe": "application/vnd.microsoft.portable-executable",
"fdf.fdf": "application/vnd.fdf",
"fits.fits": "application/fits",
"flac.flac": "audio/flac",
"flv.flv": "video/x-flv",
"gbr.gbr": "image/x-gimp-gbr",
"geojson.1.geojson": "application/geo+json",
"geojson.geojson": "application/geo+json",
"gif.gif": "image/gif",
"glb.glb": "model/gltf-binary",
"gml.gml": "application/gml+xml",
"gpx.gpx": "application/gpx+xml",
"gz.gz": "application/gzip",
"har.har": "application/json",
"hdr.hdr": "image/vnd.radiance",
"heic.single.heic": "image/heic",
"heif.heif": "image/heif",
"html.html": "text/html; charset=utf-8",
"html.iso88591.html": "text/html; charset=iso-8859-1",
"html.svg.html": "text/html; charset=utf-8",
"html.usascii.html": "text/html; charset=us-ascii",
"html.utf8.html": "text/html; charset=utf-8",
"html.utf8bom.html": "text/html; charset=utf-8",
"html.utf8bomws.html": "text/html; charset=utf-8",
"html.utf8bomdetect.html": "text/html; charset=utf-8",
"html.withbr.html": "text/html; charset=utf-8",
"ico.ico": "image/x-icon",
"ics.dos.ics": "text/calendar",
"ics.ics": "text/calendar",
"iso88591.txt": "text/plain; charset=iso-8859-1",
"jar.jar": "application/jar",
"jp2.jp2": "image/jp2",
"jpf.jpf": "image/jpx",
"jpg.jpg": "image/jpeg",
"jpm.jpm": "image/jpm",
"jxl.jxl": "image/jxl",
"xpm.xpm": "image/x-xpixmap",
"js.js": "application/javascript",
"json.json": "application/json",
"json.lowascii.json": "application/json",
// json.{int,float,string}.txt contain a single JSON value. They are valid JSON
// documents, but they should not be detected as application/json. This mimics
// the behaviour of the file utility and seems the correct thing to do.
Expand Down
23 changes: 23 additions & 0 deletions testdata/html.utf8bom.html
@@ -0,0 +1,23 @@
<!DOCTYPE html>
<!--
Some comment
-->
<html dir="ltr" mozdisallowselectionprint>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="some name" content="notranslate">
<title>test</title>


<link rel="stylesheet" href="html.utf8bom.css">



</head>

<body tabindex="1">
<div id="printContainer"></div>
</body>
</html>
24 changes: 24 additions & 0 deletions testdata/html.utf8bomdetect.html
@@ -0,0 +1,24 @@

<!DOCTYPE html>
<!--
Some comment
-->
<html dir="ltr" mozdisallowselectionprint>
<head>
<meta charset="ISO-8859-16">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="some name" content="notranslate">
<title>test</title>


<link rel="stylesheet" href="html.utf8bom.css">



</head>

<body tabindex="1">
<div id="printContainer"></div>
</body>
</html>
24 changes: 24 additions & 0 deletions testdata/html.utf8bomws.html
@@ -0,0 +1,24 @@

<!DOCTYPE html>
<!--
Some comment
-->
<html dir="ltr" mozdisallowselectionprint>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="some name" content="notranslate">
<title>test</title>


<link rel="stylesheet" href="html.utf8bom.css">



</head>

<body tabindex="1">
<div id="printContainer"></div>
</body>
</html>

0 comments on commit 6e3aeb1

Please sign in to comment.