Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not recognize text files as audio (#23355) #23368

Merged
merged 1 commit into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions modules/typesniffer/typesniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,16 @@ func DetectContentType(data []byte) SniffedType {
}
}

if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
ct2 := http.DetectContentType(data[3:])
if strings.HasPrefix(ct2, "text/") {
ct = ct2
}
}

return SniffedType{ct}
}

Expand Down
4 changes: 4 additions & 0 deletions modules/typesniffer/typesniffer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ func TestIsAudio(t *testing.T) {
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
assert.True(t, DetectContentType(mp3).IsAudio())
assert.False(t, DetectContentType([]byte("plain text")).IsAudio())

assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
}

func TestDetectContentTypeFromReader(t *testing.T) {
Expand Down