diff --git a/internal/magic/magic_test.go b/internal/magic/magic_test.go index 04490a6..5152e72 100644 --- a/internal/magic/magic_test.go +++ b/internal/magic/magic_test.go @@ -1,7 +1,8 @@ package magic import ( - "io" + "bufio" + "strings" "testing" ) @@ -86,6 +87,88 @@ func TestMagic(t *testing.T) { } } +func TestScanLine(t *testing.T) { + tcases := []struct { + name string + input string + expected []string + }{{ + name: "empty input", + input: "", + expected: nil, + }, { + name: "single line, no terminal nl", + input: "1", + expected: []string{"1"}, + }, { + name: "single line, terminal nl", + input: "1\n", + expected: []string{"1"}, + }, { + name: "two lines, no terminal nl", + input: "1\n2", + expected: []string{"1", "2"}, + }, { + name: "two lines, with terminal nl", + input: "1\n2\n", + expected: []string{"1", "2"}, + }, { + name: "drops final cr", + input: "1\n2\r", + expected: []string{"1", "2"}, + }, { + name: "final empty line", + input: "1\n2\n\n", + expected: []string{"1", "2", ""}, + }, { + name: "empty line with cr", + input: "1\n2\n\r", + expected: []string{"1", "2", ""}, + }, { + name: "nd-json numbers and object", + input: "1\n2\n3\n{}", + expected: []string{"1", "2", "3", "{}"}, + }, + } + + for _, tt := range tcases { + t.Run(tt.name, func(t *testing.T) { + testScanLine(t, tt.input, tt.expected) + testScanLineLikeBufioScanner(t, tt.input) + }) + } +} + +func testScanLine(t *testing.T, text string, expectedLines []string) { + var l []byte + i, raw := 0, []byte(text) + for i = 0; len(raw) != 0; i++ { + l, raw = scanLine(raw) + if string(l) != expectedLines[i] { + t.Errorf("expected %q, got %q", expectedLines[i], l) + } + } + if i != len(expectedLines) { + t.Errorf("expected %d lines, got %d lines", len(expectedLines), i) + } +} + +// Test that scanLine behaves exactly like bufio.Scanner. +func testScanLineLikeBufioScanner(t *testing.T, text string) { + var l []byte + raw := []byte(text) + s := bufio.NewScanner(strings.NewReader(text)) + for lineNum := 0; s.Scan(); lineNum++ { + l, raw = scanLine(raw) + if string(l) != s.Text() { + t.Errorf("expected: %q, got: %q", s.Text(), string(l)) + } + } + if err := s.Err(); err != nil { + t.Error(err) + } +} + func TestDropLastLine(t *testing.T) { dropTests := []struct { raw string @@ -105,8 +188,7 @@ func TestDropLastLine(t *testing.T) { {"\nå\n", 5, "\nå\n"}, } for i, tt := range dropTests { - gotR := dropLastLine([]byte(tt.raw), tt.cutAt) - got, _ := io.ReadAll(gotR) + got := dropLastLine([]byte(tt.raw), tt.cutAt) if got := string(got); got != tt.res { t.Errorf("dropLastLine %d error: expected %q; got %q", i, tt.res, got) } diff --git a/internal/magic/text.go b/internal/magic/text.go index fedb5c2..9f1a637 100644 --- a/internal/magic/text.go +++ b/internal/magic/text.go @@ -1,7 +1,6 @@ package magic import ( - "bufio" "bytes" "strings" "time" @@ -234,9 +233,10 @@ func GeoJSON(raw []byte, limit uint32) bool { // types. func NdJSON(raw []byte, limit uint32) bool { lCount, hasObjOrArr := 0, false - sc := bufio.NewScanner(dropLastLine(raw, limit)) - for sc.Scan() { - l := sc.Bytes() + raw = dropLastLine(raw, limit) + var l []byte + for len(raw) != 0 { + l, raw = scanLine(raw) // Empty lines are allowed in NDJSON. if l = trimRWS(trimLWS(l)); len(l) == 0 { continue @@ -301,21 +301,14 @@ func Svg(raw []byte, limit uint32) bool { } // Srt matches a SubRip file. -func Srt(in []byte, _ uint32) bool { - line, in, found := scanLine(in) - if !found { - return false - } +func Srt(raw []byte, _ uint32) bool { + line, raw := scanLine(raw) // First line must be 1. if string(line) != "1" { return false } - line, in, found = scanLine(in) - if !found { - return false - } - + line, raw = scanLine(raw) secondLine := string(line) // Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine // length to exactly 29 characters. @@ -345,9 +338,9 @@ func Srt(in []byte, _ uint32) bool { return false } - line, _, found = scanLine(in) + line, _ = scanLine(raw) // A third line must exist and not be empty. This is the actual subtitle text. - return found && len(line) != 0 + return len(line) != 0 } // Vtt matches a Web Video Text Tracks (WebVTT) file. See @@ -375,15 +368,14 @@ func Vtt(raw []byte, limit uint32) bool { bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT" } -func scanLine(in []byte) (line, remainder []byte, found bool) { - line, remainder, found = bytes.Cut(in, []byte("\n")) - if !found { - return - } - - // Drop off any \r before \n. - if lenLine := len(line); lenLine > 0 && line[lenLine-1] == '\r' { - line = line[:lenLine-1] +// dropCR drops a terminal \r from the data. +func dropCR(data []byte) []byte { + if len(data) > 0 && data[len(data)-1] == '\r' { + return data[0 : len(data)-1] } - return + return data +} +func scanLine(b []byte) (line, remainder []byte) { + line, remainder, _ = bytes.Cut(b, []byte("\n")) + return dropCR(line), remainder } diff --git a/internal/magic/text_csv.go b/internal/magic/text_csv.go index 84ed649..af25643 100644 --- a/internal/magic/text_csv.go +++ b/internal/magic/text_csv.go @@ -18,7 +18,7 @@ func Tsv(raw []byte, limit uint32) bool { } func sv(in []byte, comma rune, limit uint32) bool { - r := csv.NewReader(dropLastLine(in, limit)) + r := csv.NewReader(bytes.NewReader(dropLastLine(in, limit))) r.Comma = comma r.ReuseRecord = true r.LazyQuotes = true @@ -44,20 +44,14 @@ func sv(in []byte, comma rune, limit uint32) bool { // mimetype limits itself to ReadLimit bytes when performing a detection. // This means, for file formats like CSV for NDJSON, the last line of the input // can be an incomplete line. -func dropLastLine(b []byte, cutAt uint32) io.Reader { - if cutAt == 0 { - return bytes.NewReader(b) +func dropLastLine(b []byte, readLimit uint32) []byte { + if readLimit == 0 || uint32(len(b)) < readLimit { + return b } - if uint32(len(b)) >= cutAt { - for i := cutAt - 1; i > 0; i-- { - if b[i] == '\n' { - return bytes.NewReader(b[:i]) - } + for i := len(b) - 1; i > 0; i-- { + if b[i] == '\n' { + return b[:i] } - - // No newline was found between the 0 index and cutAt. - return bytes.NewReader(b[:cutAt]) } - - return bytes.NewReader(b) + return b } diff --git a/mimetype_test.go b/mimetype_test.go index 445b1ed..b76235b 100644 --- a/mimetype_test.go +++ b/mimetype_test.go @@ -416,6 +416,7 @@ func TestConcurrent(t *testing.T) { wg := sync.WaitGroup{} wg.Add(4) + Extend(func([]byte, uint32) bool { return false }, "e", ".e") go func() { for i := 0; i < 1000; i++ { Detect([]byte("text content")) @@ -436,8 +437,7 @@ func TestConcurrent(t *testing.T) { }() go func() { for i := 0; i < 1000; i++ { - Extend(func([]byte, uint32) bool { return false }, "e", ".e") - Lookup("text/plain").Extend(func([]byte, uint32) bool { return false }, "e", ".e") + Lookup("e").Extend(func([]byte, uint32) bool { return false }, "e", ".e") } wg.Done() }() @@ -495,6 +495,24 @@ func BenchmarkSliceRand(b *testing.B) { }) } +func BenchmarkText(b *testing.B) { + r := rand.New(rand.NewSource(0)) + data := make([]byte, defaultLimit) + if _, err := io.ReadFull(r, data); err != io.ErrUnexpectedEOF && err != nil { + b.Fatal(err) + } + + for _, m := range text.children { + b.Run(m.String(), func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for n := 0; n < b.N; n++ { + m.detector(data, uint32(len(data))) + } + }) + } +} + func BenchmarkAll(b *testing.B) { r := rand.New(rand.NewSource(0)) data := make([]byte, defaultLimit)