improve performance for text detection (#532)

This commit reduces memory allocations in NDJSON detection by using a scanLine function that works directly on the already-allocated input buffer. The previous version of the code used bufio.Scanner, which allocates its own copy of the buffer.
before: BenchmarkText/application/x-ndjson-8 663314 2027 ns/op 4306 B/op 6 allocs/op
after: BenchmarkText/application/x-ndjson-8 1930292 678.6 ns/op 160 B/op 4 allocs/op
gabriel-vasile · May 23, 2024 · ff4d3d0 · ff4d3d0
1 parent bc511b8
commit ff4d3d0
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 45 deletions.
diff --git a/internal/magic/magic_test.go b/internal/magic/magic_test.go
@@ -1,7 +1,8 @@
 package magic
 
 import (
-	"io"
+	"bufio"
+	"strings"
 	"testing"
 )
 
@@ -86,6 +87,88 @@ func TestMagic(t *testing.T) {
 	}
 }
 
+func TestScanLine(t *testing.T) {
+	tcases := []struct {
+		name     string
+		input    string
+		expected []string
+	}{{
+		name:     "empty input",
+		input:    "",
+		expected: nil,
+	}, {
+		name:     "single line, no terminal nl",
+		input:    "1",
+		expected: []string{"1"},
+	}, {
+		name:     "single line, terminal nl",
+		input:    "1\n",
+		expected: []string{"1"},
+	}, {
+		name:     "two lines, no terminal nl",
+		input:    "1\n2",
+		expected: []string{"1", "2"},
+	}, {
+		name:     "two lines, with terminal nl",
+		input:    "1\n2\n",
+		expected: []string{"1", "2"},
+	}, {
+		name:     "drops final cr",
+		input:    "1\n2\r",
+		expected: []string{"1", "2"},
+	}, {
+		name:     "final empty line",
+		input:    "1\n2\n\n",
+		expected: []string{"1", "2", ""},
+	}, {
+		name:     "empty line with cr",
+		input:    "1\n2\n\r",
+		expected: []string{"1", "2", ""},
+	}, {
+		name:     "nd-json numbers and object",
+		input:    "1\n2\n3\n{}",
+		expected: []string{"1", "2", "3", "{}"},
+	},
+	}
+
+	for _, tt := range tcases {
+		t.Run(tt.name, func(t *testing.T) {
+			testScanLine(t, tt.input, tt.expected)
+			testScanLineLikeBufioScanner(t, tt.input)
+		})
+	}
+}
+
+func testScanLine(t *testing.T, text string, expectedLines []string) {
+	var l []byte
+	i, raw := 0, []byte(text)
+	for i = 0; len(raw) != 0; i++ {
+		l, raw = scanLine(raw)
+		if string(l) != expectedLines[i] {
+			t.Errorf("expected %q, got %q", expectedLines[i], l)
+		}
+	}
+	if i != len(expectedLines) {
+		t.Errorf("expected %d lines, got %d lines", len(expectedLines), i)
+	}
+}
+
+// Test that scanLine behaves exactly like bufio.Scanner.
+func testScanLineLikeBufioScanner(t *testing.T, text string) {
+	var l []byte
+	raw := []byte(text)
+	s := bufio.NewScanner(strings.NewReader(text))
+	for lineNum := 0; s.Scan(); lineNum++ {
+		l, raw = scanLine(raw)
+		if string(l) != s.Text() {
+			t.Errorf("expected: %q, got: %q", s.Text(), string(l))
+		}
+	}
+	if err := s.Err(); err != nil {
+		t.Error(err)
+	}
+}
+
 func TestDropLastLine(t *testing.T) {
 	dropTests := []struct {
 		raw   string
@@ -105,8 +188,7 @@ func TestDropLastLine(t *testing.T) {
 		{"\nå\n", 5, "\nå\n"},
 	}
 	for i, tt := range dropTests {
-		gotR := dropLastLine([]byte(tt.raw), tt.cutAt)
-		got, _ := io.ReadAll(gotR)
+		got := dropLastLine([]byte(tt.raw), tt.cutAt)
 		if got := string(got); got != tt.res {
 			t.Errorf("dropLastLine %d error: expected %q; got %q", i, tt.res, got)
 		}

diff --git a/internal/magic/text.go b/internal/magic/text.go
@@ -1,7 +1,6 @@
 package magic
 
 import (
-	"bufio"
 	"bytes"
 	"strings"
 	"time"
@@ -234,9 +233,10 @@ func GeoJSON(raw []byte, limit uint32) bool {
 // types.
 func NdJSON(raw []byte, limit uint32) bool {
 	lCount, hasObjOrArr := 0, false
-	sc := bufio.NewScanner(dropLastLine(raw, limit))
-	for sc.Scan() {
-		l := sc.Bytes()
+	raw = dropLastLine(raw, limit)
+	var l []byte
+	for len(raw) != 0 {
+		l, raw = scanLine(raw)
 		// Empty lines are allowed in NDJSON.
 		if l = trimRWS(trimLWS(l)); len(l) == 0 {
 			continue
@@ -301,21 +301,14 @@ func Svg(raw []byte, limit uint32) bool {
 }
 
 // Srt matches a SubRip file.
-func Srt(in []byte, _ uint32) bool {
-	line, in, found := scanLine(in)
-	if !found {
-		return false
-	}
+func Srt(raw []byte, _ uint32) bool {
+	line, raw := scanLine(raw)
 
 	// First line must be 1.
 	if string(line) != "1" {
 		return false
 	}
-	line, in, found = scanLine(in)
-	if !found {
-		return false
-	}
-
+	line, raw = scanLine(raw)
 	secondLine := string(line)
 	// Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine
 	// length to exactly 29 characters.
@@ -345,9 +338,9 @@ func Srt(in []byte, _ uint32) bool {
 		return false
 	}
 
-	line, _, found = scanLine(in)
+	line, _ = scanLine(raw)
 	// A third line must exist and not be empty. This is the actual subtitle text.
-	return found && len(line) != 0
+	return len(line) != 0
 }
 
 // Vtt matches a Web Video Text Tracks (WebVTT) file. See
@@ -375,15 +368,14 @@ func Vtt(raw []byte, limit uint32) bool {
 		bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT"
 }
 
-func scanLine(in []byte) (line, remainder []byte, found bool) {
-	line, remainder, found = bytes.Cut(in, []byte("\n"))
-	if !found {
-		return
-	}
-
-	// Drop off any \r before \n.
-	if lenLine := len(line); lenLine > 0 && line[lenLine-1] == '\r' {
-		line = line[:lenLine-1]
+// dropCR drops a terminal \r from the data.
+func dropCR(data []byte) []byte {
+	if len(data) > 0 && data[len(data)-1] == '\r' {
+		return data[0 : len(data)-1]
 	}
-	return
+	return data
+}
+func scanLine(b []byte) (line, remainder []byte) {
+	line, remainder, _ = bytes.Cut(b, []byte("\n"))
+	return dropCR(line), remainder
 }
diff --git a/internal/magic/text_csv.go b/internal/magic/text_csv.go
@@ -18,7 +18,7 @@ func Tsv(raw []byte, limit uint32) bool {
 }
 
 func sv(in []byte, comma rune, limit uint32) bool {
-	r := csv.NewReader(dropLastLine(in, limit))
+	r := csv.NewReader(bytes.NewReader(dropLastLine(in, limit)))
 	r.Comma = comma
 	r.ReuseRecord = true
 	r.LazyQuotes = true
@@ -44,20 +44,14 @@ func sv(in []byte, comma rune, limit uint32) bool {
 // mimetype limits itself to ReadLimit bytes when performing a detection.
 // This means, for file formats like CSV or NDJSON, the last line of the input
 // can be an incomplete line.
-func dropLastLine(b []byte, cutAt uint32) io.Reader {
-	if cutAt == 0 {
-		return bytes.NewReader(b)
+func dropLastLine(b []byte, readLimit uint32) []byte {
+	if readLimit == 0 || uint32(len(b)) < readLimit {
+		return b
 	}
-	if uint32(len(b)) >= cutAt {
-		for i := cutAt - 1; i > 0; i-- {
-			if b[i] == '\n' {
-				return bytes.NewReader(b[:i])
-			}
+	for i := len(b) - 1; i > 0; i-- {
+		if b[i] == '\n' {
+			return b[:i]
 		}
-
-		// No newline was found between the 0 index and cutAt.
-		return bytes.NewReader(b[:cutAt])
 	}
-
-	return bytes.NewReader(b)
+	return b
 }
diff --git a/mimetype_test.go b/mimetype_test.go
@@ -416,6 +416,7 @@ func TestConcurrent(t *testing.T) {
 	wg := sync.WaitGroup{}
 	wg.Add(4)
 
+	Extend(func([]byte, uint32) bool { return false }, "e", ".e")
 	go func() {
 		for i := 0; i < 1000; i++ {
 			Detect([]byte("text content"))
@@ -436,8 +437,7 @@ func TestConcurrent(t *testing.T) {
 	}()
 	go func() {
 		for i := 0; i < 1000; i++ {
-			Extend(func([]byte, uint32) bool { return false }, "e", ".e")
-			Lookup("text/plain").Extend(func([]byte, uint32) bool { return false }, "e", ".e")
+			Lookup("e").Extend(func([]byte, uint32) bool { return false }, "e", ".e")
 		}
 		wg.Done()
 	}()
@@ -495,6 +495,24 @@ func BenchmarkSliceRand(b *testing.B) {
 	})
 }
 
+func BenchmarkText(b *testing.B) {
+	r := rand.New(rand.NewSource(0))
+	data := make([]byte, defaultLimit)
+	if _, err := io.ReadFull(r, data); err != io.ErrUnexpectedEOF && err != nil {
+		b.Fatal(err)
+	}
+
+	for _, m := range text.children {
+		b.Run(m.String(), func(b *testing.B) {
+			b.ReportAllocs()
+			b.ResetTimer()
+			for n := 0; n < b.N; n++ {
+				m.detector(data, uint32(len(data)))
+			}
+		})
+	}
+}
+
 func BenchmarkAll(b *testing.B) {
 	r := rand.New(rand.NewSource(0))
 	data := make([]byte, defaultLimit)