Skip to content

Commit

Permalink
Improve binary file check (fixes #205) (#206)
Browse files Browse the repository at this point in the history
* fix(files): Improve binary file check (fixes #205)

* Fix URL in download-test-files.sh

* Fix TestDecodeBytesText test

Also changed download script to Makefile

* Remove dead code

* Fix that DetectBest is non-deterministic.

chardet's DetectBest() returns multiple charsets with the same Confidence level.
Unfortunately, it sometimes returns a different charset. This commit
fixes this by sorting the results returned by DetectAll() and returning
the first alphabetically. This could certainly be improved.

* Remove commented out code

* Add additional types to defaultAllowedContentTypes

Specifically:

application/ecmascript
application/json
application/xml
+xml

* Add application/x-ndjson & +json, update readme

* Update readme with improved grammar
  • Loading branch information
rasa committed Jul 16, 2022
1 parent bed39b8 commit 9d4703a
Show file tree
Hide file tree
Showing 68 changed files with 21,946 additions and 58 deletions.
5 changes: 4 additions & 1 deletion .ecrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
"IgnoreDefaults": false,
"SpacesAftertabs": false,
"NoColor": false,
"exclude": ["testfiles"],
"exclude": [
"testfiles",
"testdata"
],
"AllowedContentTypes": [],
"PassedFiles": [],
"Disable": {
Expand Down
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/pkg/encoding/testdata/* -text
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,18 @@ You could also specify command line arguments and they will get merged with the

You can create a configuration with the `init`-flag. If you specify an `config`-path it will be created there.

By default the allowed_content_types are `text/` and `application/octet-stream`(needed as a fallback when no content type could be determined). You can add additional accepted content types with the `allowed_content_types` key. But the default ones doesn't get removed.
By default the allowed_content_types are:

1. `text/` (matches `text/plain`, `text/html`, etc.)
1. `application/ecmascript`
1. `application/json`
1. `application/x-ndjson`
1. `application/xml`
1. `+json` (matches `application/geo+json`, etc.)
1. `+xml` (matches `application/rss+xml`, etc.)
1. `application/octet-stream`

`application/octet-stream` is needed as a fallback when no content type could be determined. You can add additional accepted content types with the `allowed_content_types` key. But the default ones don't get removed.

## Excluding

Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ module github.com/editorconfig-checker/editorconfig-checker
go 1.16

require (
github.com/baulk/chardet v0.1.0 // indirect
github.com/editorconfig/editorconfig-core-go/v2 v2.4.2
github.com/gabriel-vasile/mimetype v1.4.0 // indirect
github.com/gopherjs/gopherjs v0.0.0-20190812055157-5d271430af9f // indirect
github.com/smartystreets/assertions v1.0.1 // indirect
golang.org/x/text v0.3.7 // indirect
)
13 changes: 13 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
github.com/baulk/chardet v0.1.0 h1:6/r5nPMikB9OG1Njs10VfVHZTDMFH6BdybHPISpfUVA=
github.com/baulk/chardet v0.1.0/go.mod h1:0ibN6068qswel5Hv54U7GNJUU57njfzPJrLIq7Y8xas=
github.com/editorconfig/editorconfig-core-go/v2 v2.4.2 h1:1lkDpSoAaFLrgYTVJ/eNCV+lkDSv/j9Wm0jcvDfVVEo=
github.com/editorconfig/editorconfig-core-go/v2 v2.4.2/go.mod h1:IXeWRVO4LZRoNunhHh/oP6BQvTs94nB2pNvbw32l8tQ=
github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro=
github.com/gabriel-vasile/mimetype v1.4.0/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8=
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
Expand All @@ -19,10 +23,19 @@ golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210505024714-0287a6fb4125 h1:Ugb8sMTWuWRC3+sz5WeN/4kejDx9BvIwnPUiJBjJE+8=
golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
6 changes: 6 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ var defaultExcludes = []string{
var defaultAllowedContentTypes = []string{
"text/",
"application/octet-stream",
"application/ecmascript",
"application/json",
"application/x-ndjson",
"application/xml",
"+json",
"+xml",
}

// Config struct, contains everything a config can contain
Expand Down
10 changes: 5 additions & 5 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func TestGetExcludesAsRegularExpression(t *testing.T) {
}

actual = c.GetExcludesAsRegularExpression()
expected = `testfiles|^\.yarn/|^yarn\.lock$|^package-lock\.json$|^composer\.lock$|^Cargo\.lock$|^\.pnp\.cjs$|^\.pnp\.js$|^\.pnp\.loader\.mjs$|\.snap$|\.otf$|\.woff$|\.woff2$|\.eot$|\.ttf$|\.gif$|\.png$|\.jpg$|\.jpeg$|\.webp$|\.avif$|\.mp4$|\.wmv$|\.svg$|\.ico$|\.bak$|\.bin$|\.pdf$|\.zip$|\.gz$|\.tar$|\.7z$|\.bz2$|\.log$|\.patch$|\.css\.map$|\.js\.map$|min\.css$|min\.js$`
expected = `testfiles|testdata|^\.yarn/|^yarn\.lock$|^package-lock\.json$|^composer\.lock$|^Cargo\.lock$|^\.pnp\.cjs$|^\.pnp\.js$|^\.pnp\.loader\.mjs$|\.snap$|\.otf$|\.woff$|\.woff2$|\.eot$|\.ttf$|\.gif$|\.png$|\.jpg$|\.jpeg$|\.webp$|\.avif$|\.mp4$|\.wmv$|\.svg$|\.ico$|\.bak$|\.bin$|\.pdf$|\.zip$|\.gz$|\.tar$|\.7z$|\.bz2$|\.log$|\.patch$|\.css\.map$|\.js\.map$|min\.css$|min\.js$`

if actual != expected {
t.Errorf("expected %s, got %s", expected, actual)
Expand Down Expand Up @@ -104,8 +104,8 @@ func TestMerge(t *testing.T) {

c1.Merge(mergeConfig)

mergeConfig.AllowedContentTypes = []string{"text/", "application/octet-stream", "xml/"}
mergeConfig.Exclude = []string{"testfiles", "some-other"}
mergeConfig.AllowedContentTypes = []string{"text/", "application/octet-stream", "application/ecmascript", "application/json", "application/x-ndjson", "application/xml", "+json", "+xml", "xml/"}
mergeConfig.Exclude = []string{"testfiles", "testdata", "some-other"}

expected := mergeConfig
expected.Logger.Verbosee = true
Expand Down Expand Up @@ -141,7 +141,7 @@ func TestParse(t *testing.T) {
c.Debug != true ||
c.IgnoreDefaults != true ||
!reflect.DeepEqual(c.Exclude, []string{"testfiles"}) ||
!reflect.DeepEqual(c.AllowedContentTypes, []string{"text/", "application/octet-stream", "hey"}) ||
!reflect.DeepEqual(c.AllowedContentTypes, []string{"text/", "application/octet-stream", "application/ecmascript", "application/json", "application/x-ndjson", "application/xml", "+json", "+xml", "hey"}) ||
c.SpacesAftertabs != true ||
c.Disable.EndOfLine != false ||
c.Disable.TrimTrailingWhitespace != false ||
Expand Down Expand Up @@ -171,7 +171,7 @@ func TestGetAsString(t *testing.T) {
_ = c.Parse()

actual := c.GetAsString()
expected := "Config: {ShowVersion:false Help:false DryRun:false Path:../../.ecrc Version:2.5.0 Verbose:false Debug:false IgnoreDefaults:false SpacesAftertabs:false NoColor:false Exclude:[testfiles] AllowedContentTypes:[text/ application/octet-stream] PassedFiles:[] Disable:{EndOfLine:false Indentation:false InsertFinalNewline:false TrimTrailingWhitespace:false IndentSize:false MaxLineLength:false} Logger:{Verbosee:false Debugg:false NoColor:false}}"
expected := "Config: {ShowVersion:false Help:false DryRun:false Path:../../.ecrc Version:2.5.0 Verbose:false Debug:false IgnoreDefaults:false SpacesAftertabs:false NoColor:false Exclude:[testfiles testdata] AllowedContentTypes:[text/ application/octet-stream application/ecmascript application/json application/x-ndjson application/xml +json +xml] PassedFiles:[] Disable:{EndOfLine:false Indentation:false InsertFinalNewline:false TrimTrailingWhitespace:false IndentSize:false MaxLineLength:false} Logger:{Verbosee:false Debugg:false NoColor:false}}"

if actual != expected {
t.Errorf("Expected: %v, got: %v ", expected, actual)
Expand Down
177 changes: 177 additions & 0 deletions pkg/encoding/encoding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
// Package encoding contains all the encoding functions.
package encoding

import (
"fmt"
"sort"
"strings"
"unicode/utf8"

"github.com/baulk/chardet"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/encoding/unicode/utf32"
)

// BinaryData is the charset name DecodeBytes reports when charset detection
// or decoding fails and IsBinaryFile classifies the content as binary.
const BinaryData = "binary"

// encodings maps a normalized charset name to the decoder used for it.
// Keys are the lookup form produced by decodeText: the detected charset name
// lower-cased with every "-" and "_" removed (e.g. "ISO-8859-2" -> "iso88592").
var encodings = map[string]encoding.Encoding{
// Listed in both
// https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 and
// https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
"utf8": unicode.UTF8,
"ibm866": charmap.CodePage866,
"iso88592": charmap.ISO8859_2,
"iso88593": charmap.ISO8859_3,
"iso88594": charmap.ISO8859_4,
"iso88595": charmap.ISO8859_5,
"iso88596": charmap.ISO8859_6,
"iso88597": charmap.ISO8859_7,
"iso88598": charmap.ISO8859_8,
"iso885910": charmap.ISO8859_10,
"iso885913": charmap.ISO8859_13,
"iso885914": charmap.ISO8859_14,
"iso885915": charmap.ISO8859_15,
"iso885916": charmap.ISO8859_16,
"koi8r": charmap.KOI8R,
"koi8u": charmap.KOI8U,
"macintosh": charmap.Macintosh,
"windows874": charmap.Windows874,
"windows1250": charmap.Windows1250,
"windows1251": charmap.Windows1251,
"windows1252": charmap.Windows1252,
"windows1253": charmap.Windows1253,
"windows1254": charmap.Windows1254,
"windows1255": charmap.Windows1255,
"windows1256": charmap.Windows1256,
"windows1257": charmap.Windows1257,
"windows1258": charmap.Windows1258,
"gbk": simplifiedchinese.GBK,
"gb18030": simplifiedchinese.GB18030,
"big5": traditionalchinese.Big5,
"eucjp": japanese.EUCJP,
"iso2022jp": japanese.ISO2022JP,
"shiftjis": japanese.ShiftJIS,
"euckr": korean.EUCKR,
"utf16be": unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
"utf16le": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
// Not listed in https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 :
"iso88591": charmap.ISO8859_1,
"ibm037": charmap.CodePage037,
"ibm437": charmap.CodePage437,
"ibm850": charmap.CodePage850,
"ibm852": charmap.CodePage852,
"ibm855": charmap.CodePage855,
"ibm858": charmap.CodePage858,
"ibm860": charmap.CodePage860,
"ibm862": charmap.CodePage862,
"ibm863": charmap.CodePage863,
"ibm865": charmap.CodePage865,
"ibm1047": charmap.CodePage1047,
"ibm1140": charmap.CodePage1140,
"iso88596e": charmap.ISO8859_6E,
"iso88596i": charmap.ISO8859_6I,
"iso88598e": charmap.ISO8859_8E,
"iso88598i": charmap.ISO8859_8I,
"iso88599": charmap.ISO8859_9,
"hzgb2312": simplifiedchinese.HZGB2312,
// Not listed in https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
"macintoshcyrillic": charmap.MacintoshCyrillic,
// Listed in neither
// https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 nor
// https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
// NOTE(review): "utf8bom" uses plain unicode.UTF8, so a leading byte order
// mark is presumably kept in the decoded text; x/text also offers
// unicode.UTF8BOM, whose decoder strips it — confirm which is intended.
"utf8bom": unicode.UTF8,
"utf32be": utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM),
"utf32le": utf32.UTF32(utf32.LittleEndian, utf32.IgnoreBOM),
}

// In https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156
// but not included above:
// enc3: asciiEnc,
// enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM)

// DecodeBytes detects the charset of contentBytes and decodes the bytes into
// a string.
//
// It returns the text, the charset name, and an error:
//   - on success: the decoded text, the detected charset, and a nil error;
//   - when detection or decoding fails but IsBinaryFile reports binary
//     content: the raw bytes as a string, BinaryData, and a nil error;
//   - otherwise: the raw bytes as a string, whatever charset was detected
//     (empty if detection itself failed), and the underlying error.
func DecodeBytes(contentBytes []byte) (string, string, error) {
	raw := string(contentBytes)

	charset, err := detectText(contentBytes)
	if err == nil {
		var decoded string
		decoded, err = decodeText(contentBytes, charset)
		if err == nil {
			return decoded, charset, nil
		}
	}

	// Detection or decoding failed: binary content is reported as such
	// rather than as an error.
	if IsBinaryFile(contentBytes) {
		return raw, BinaryData, nil
	}
	return raw, charset, err
}

// detectText guesses the charset of contentBytes using chardet.
//
// Several candidates can share the top confidence value, so the leading run
// of results with non-decreasing confidence is collected and the
// alphabetically first charset name is returned; this keeps the answer
// deterministic. An error is returned when detection fails or yields no
// candidates.
func detectText(contentBytes []byte) (string, error) {
	candidates, err := chardet.NewTextDetector().DetectAll(contentBytes)
	if err != nil {
		return "", err
	}
	if len(candidates) == 0 {
		return "", fmt.Errorf("Failed to determine charset")
	}

	best := -1
	names := make([]string, 0, len(candidates))
	for i := range candidates {
		// Stop at the first candidate that scores below the ones seen so far.
		if candidates[i].Confidence < best {
			break
		}
		best = candidates[i].Confidence
		names = append(names, candidates[i].Charset)
	}

	sort.Strings(names)
	return names[0], nil
}

// decodeText converts contentBytes from the named charset to a UTF-8 string.
//
// The charset name is normalized (all "-" and "_" removed, then lower-cased)
// before it is looked up in encodings. An error is returned when the charset
// is unknown, when the decoder fails, or when the result is not valid UTF-8.
func decodeText(contentBytes []byte, charset string) (string, error) {
	normalized := strings.NewReplacer("-", "", "_", "").Replace(charset)
	enc, known := encodings[strings.ToLower(normalized)]
	if !known {
		return "", fmt.Errorf("unrecognized charset %s", charset)
	}

	decoded := contentBytes
	if enc != nil {
		out, err := enc.NewDecoder().Bytes(contentBytes)
		if err != nil {
			return "", err
		}
		decoded = out
	}

	// Validate the final bytes so invalid text is never returned as a string.
	if !utf8.Valid(decoded) {
		return "", fmt.Errorf("the file is not a valid UTF-8 encoded file")
	}
	return string(decoded), nil
}

// binaryChars is a lookup table indexed by byte value; an entry is true when
// that byte is treated as a sign of binary content.
var binaryChars = [256]bool{}

// init marks every control character below 0x20 as binary, except for
// tab (9), lf (10), ff (12), and cr (13), which are allowed in text.
func init() {
	for c := 0; c < 0x20; c++ {
		switch c {
		case '\t', '\n', '\f', '\r':
			// Whitespace control characters: leave the entry false.
		default:
			binaryChars[c] = true
		}
	}
}

// IsBinaryFile returns true if the bytes contain \x00-\x08,\x0b,\x0e-\x1f
func IsBinaryFile(rawFileContent []byte) bool {
	for i := 0; i < len(rawFileContent); i++ {
		if binaryChars[rawFileContent[i]] {
			return true
		}
	}
	return false
}

0 comments on commit 9d4703a

Please sign in to comment.