Skip to content

Commit

Permalink
Merge branch 'master' into apng
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-vasile committed Feb 3, 2022
2 parents 99ae532 + bda98fd commit bd68efd
Show file tree
Hide file tree
Showing 22 changed files with 367 additions and 74 deletions.
9 changes: 3 additions & 6 deletions .github/workflows/codeql.yml
Expand Up @@ -5,23 +5,20 @@ on:
branches: [master]
pull_request:
branches: [master]
schedule:
- cron: "22 11 * * *"

jobs:
CodeQL-Build:
runs-on: ubuntu-latest

steps:
- name: Check out code
uses: actions/checkout@v2
uses: actions/checkout@v2.4.0

- name: Initialize CodeQL
uses: github/codeql-action/init@v1
uses: github/codeql-action/init@v1.0.31
with:
languages: go
queries: security-and-quality

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

uses: github/codeql-action/analyze@v1.0.31
12 changes: 7 additions & 5 deletions .github/workflows/go.yml
@@ -1,5 +1,7 @@
on:
push:
branches:
- master
pull_request:

name: run tests
Expand All @@ -8,26 +10,26 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Install Go
uses: actions/setup-go@v2.1.4
uses: actions/setup-go@v2.1.5
with:
go-version: 1.16
- name: Checkout code
uses: actions/checkout@v2.4.0
- name: Run linters
uses: golangci/golangci-lint-action@v2.5.2
with:
version: v1.29
version: "v1.37.1"

test:
strategy:
matrix:
go-version: [1.16]
go-version: ["1.12.0", "^1.17.6"]
platform: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.platform }}
steps:
- name: Install Go
if: success()
uses: actions/setup-go@v2.1.4
uses: actions/setup-go@v2.1.5
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
Expand All @@ -40,7 +42,7 @@ jobs:
steps:
- name: Install Go
if: success()
uses: actions/setup-go@v2.1.4
uses: actions/setup-go@v2.1.5
with:
go-version: 1.16
- name: Checkout code
Expand Down
16 changes: 16 additions & 0 deletions README.md
Expand Up @@ -57,6 +57,22 @@ using magic numbers is slow, inaccurate, and non-standard. Most of the times
protocols have methods for specifying such metadata; e.g., `Content-Type` header
in HTTP and SMTP.

## FAQ
Q: My file is in the list of [supported MIME types](supported_mimes.md) but
it is not correctly detected. What should I do?

A: Some file formats (often Microsoft Office documents) keep their signatures
towards the end of the file. Try increasing the number of bytes used for detection
with:
```go
mimetype.SetLimit(1024*1024) // Set limit to 1MB.
// or
mimetype.SetLimit(0) // No limit, whole file content used.
mimetype.DetectFile("file.doc")
```
If increasing the limit does not help, please
[open an issue](https://github.com/gabriel-vasile/mimetype/issues/new?assignees=&labels=&template=mismatched-mime-type-detected.md&title=).

## Structure
**mimetype** uses a hierarchical structure to keep the MIME type detection logic.
This reduces the number of calls needed for detecting the file type. The reason
Expand Down
8 changes: 8 additions & 0 deletions internal/charset/charset.go
Expand Up @@ -52,6 +52,7 @@ var (
}
)

// FromBOM returns the charset declared in the BOM of content.
func FromBOM(content []byte) string {
for _, b := range boms {
if bytes.HasPrefix(content, b.bom) {
Expand All @@ -61,6 +62,8 @@ func FromBOM(content []byte) string {
return ""
}

// FromPlain returns the charset of a plain text. It relies on BOM presence
// and it falls back on checking each byte in content.
func FromPlain(content []byte) string {
if len(content) == 0 {
return ""
Expand Down Expand Up @@ -129,6 +132,9 @@ func ascii(content []byte) bool {
return true
}

// FromXML returns the charset of an XML document. It relies on the XML
// header <?xml version="1.0" encoding="UTF-8"?> and falls back on the plain
// text content.
func FromXML(content []byte) string {
if cset := fromXML(content); cset != "" {
return cset
Expand All @@ -151,6 +157,8 @@ func fromXML(content []byte) string {
return strings.ToLower(xmlEncoding(string(t.Inst)))
}

// FromHTML returns the charset of an HTML document. It relies on the meta tag
// <meta charset="UTF-8"> and falls back on the plain text content.
func FromHTML(content []byte) string {
if cset := fromHTML(content); cset != "" {
return cset
Expand Down
49 changes: 44 additions & 5 deletions internal/magic/binary.go
Expand Up @@ -17,8 +17,6 @@ var (
Elf = prefix([]byte{0x7F, 0x45, 0x4C, 0x46})
// Nes matches a Nintendo Entertainment system ROM file.
Nes = prefix([]byte{0x4E, 0x45, 0x53, 0x1A})
// TzIf matches a Time Zone Information Format (TZif) file.
TzIf = prefix([]byte("TZif"))
// SWF matches an Adobe Flash swf file.
SWF = prefix([]byte("CWS"), []byte("FWS"), []byte("ZWS"))
// Torrent has bencoded text in the beginning.
Expand Down Expand Up @@ -64,16 +62,27 @@ func MachO(raw []byte, limit uint32) bool {
// Dbf matches a dBase file.
// https://www.dbase.com/Knowledgebase/INT/db7_file_fmt.htm
func Dbf(raw []byte, limit uint32) bool {
if len(raw) < 4 {
if len(raw) < 68 {
return false
}

// 3rd and 4th bytes contain the last update month and day of month
// 3rd and 4th bytes contain the last update month and day of month.
if !(0 < raw[2] && raw[2] < 13 && 0 < raw[3] && raw[3] < 32) {
return false
}

// dbf type is dictated by the first byte
// 12, 13, 30, 31 are reserved bytes and always filled with 0x00.
if raw[12] != 0x00 || raw[13] != 0x00 || raw[30] != 0x00 || raw[31] != 0x00 {
return false
}
// Production MDX flag;
// 0x01 if a production .MDX file exists for this table;
// 0x00 if no .MDX file exists.
if raw[28] > 0x01 {
return false
}

// dbf type is dictated by the first byte.
dbfTypes := []byte{
0x02, 0x03, 0x04, 0x05, 0x30, 0x31, 0x32, 0x42, 0x62, 0x7B, 0x82,
0x83, 0x87, 0x8A, 0x8B, 0x8E, 0xB3, 0xCB, 0xE5, 0xF5, 0xF4, 0xFB,
Expand Down Expand Up @@ -155,3 +164,33 @@ func Marc(raw []byte, limit uint32) bool {
// | g l T F | 1 | ... |
var Glb = prefix([]byte("\x67\x6C\x54\x46\x02\x00\x00\x00"),
[]byte("\x67\x6C\x54\x46\x01\x00\x00\x00"))

// TzIf matches a Time Zone Information Format (TZif) file.
// See more: https://www.rfc-editor.org/rfc/rfc9636#section-3.1
// Its header structure is shown below:
//  +---------------+---+
//  |  magic    (4) | <-+-- version (1)
//  +---------------+---+---------------------------------------+
//  |           [unused - reserved for future use] (15)         |
//  +---------------+---------------+---------------+-----------+
//  |  isutcnt  (4) |  isstdcnt (4) |  leapcnt  (4) |
//  +---------------+---------------+---------------+
//  |  timecnt  (4) |  typecnt  (4) |  charcnt  (4) |
//
// raw is the beginning of the file; limit is not used. TzIf reports true only
// when the magic, the version byte and a non-zero typecnt are all present.
func TzIf(raw []byte, limit uint32) bool {
	// The fixed-size header is 44 bytes long; anything shorter cannot be TZif.
	const headerLen = 44
	if len(raw) < headerLen {
		return false
	}

	if !bytes.HasPrefix(raw, []byte("TZif")) {
		return false
	}

	// Field "typecnt" (big-endian, bytes 36..39) MUST NOT be zero.
	if binary.BigEndian.Uint32(raw[36:40]) == 0 {
		return false
	}

	// Version has to be NUL (0x00), '2' (0x32), '3' (0x33) or '4' (0x34).
	switch raw[4] {
	case 0x00, '2', '3', '4':
		return true
	}
	return false
}
File renamed without changes.
16 changes: 14 additions & 2 deletions internal/magic/magic_test.go
@@ -1,7 +1,7 @@
package magic

import (
"io"
"io/ioutil"
"testing"
)

Expand Down Expand Up @@ -106,9 +106,21 @@ func TestDropLastLine(t *testing.T) {
}
for i, tt := range dropTests {
gotR := dropLastLine([]byte(tt.raw), tt.cutAt)
got, _ := io.ReadAll(gotR)
got, _ := ioutil.ReadAll(gotR)
if got := string(got); got != tt.res {
t.Errorf("dropLastLine %d error: expected %q; got %q", i, tt.res, got)
}
}
}

// BenchmarkSrt measures the cost of running Srt on a small SubRip sample
// consisting of a single cue.
func BenchmarkSrt(b *testing.B) {
	const subtitle = `1
00:02:16,612 --> 00:02:19,376
Senator, we're making
our final approach into Coruscant.
`
	// Convert once, outside the timed loop, so the string-to-slice copy is not
	// charged to Srt on every iteration.
	input := []byte(subtitle)

	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		Srt(input, 0)
	}
}
41 changes: 29 additions & 12 deletions internal/magic/ms_office.go
Expand Up @@ -78,14 +78,24 @@ func Aaf(raw []byte, limit uint32) bool {
}

// Doc matches a Microsoft Word 97-2003 file.
//
// BUG(gabriel-vasile): Doc should look for subheaders like Ppt and Xls do.
//
// Ole is a container for Doc, Ppt, Pub and Xls.
// Right now, when an Ole file is detected, it is considered to be a Doc file
// if the checks for Ppt, Pub and Xls failed.
func Doc(raw []byte, limit uint32) bool {
return true
// See: https://github.com/decalage2/oletools/blob/412ee36ae45e70f42123e835871bac956d958461/oletools/common/clsid.py
func Doc(raw []byte, _ uint32) bool {
	// CLSIDs checked against the OLE container, one per Word flavour. They
	// differ only in their first byte.

	// Microsoft Word 97-2003 Document (Word.Document.8)
	document8 := []byte{0x06, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}
	// Microsoft Word 6.0-7.0 Document (Word.Document.6)
	document6 := []byte{0x00, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}
	// Microsoft Word Picture (Word.Picture.8)
	picture8 := []byte{0x07, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}

	// Evaluated left to right; the first matching CLSID decides.
	return matchOleClsid(raw, document8) ||
		matchOleClsid(raw, document6) ||
		matchOleClsid(raw, picture8)
}

// Ppt matches a Microsoft PowerPoint 97-2003 file or a PowerPoint 95 presentation.
Expand Down Expand Up @@ -190,15 +200,22 @@ func Msi(raw []byte, limit uint32) bool {
//
// http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File
func matchOleClsid(in []byte, clsid []byte) bool {
if len(in) <= 512 {
// Microsoft Compound files v3 have a sector length of 512, while v4 has 4096.
// Change sector offset depending on file version.
// https://www.loc.gov/preservation/digital/formats/fdd/fdd000392.shtml
sectorLength := 512
if len(in) < sectorLength {
return false
}
if in[26] == 0x04 && in[27] == 0x00 {
sectorLength = 4096
}

// SecID of first sector of the directory stream
// SecID of first sector of the directory stream.
firstSecID := int(binary.LittleEndian.Uint32(in[48:52]))

// Expected offset of CLSID for root storage object
clsidOffset := 512*(1+firstSecID) + 80
// Expected offset of CLSID for root storage object.
clsidOffset := sectorLength*(1+firstSecID) + 80

if len(in) <= clsidOffset+16 {
return false
Expand Down
76 changes: 76 additions & 0 deletions internal/magic/text.go
Expand Up @@ -3,6 +3,8 @@ package magic
import (
"bufio"
"bytes"
"strings"
"time"

"github.com/gabriel-vasile/mimetype/internal/charset"
"github.com/gabriel-vasile/mimetype/internal/json"
Expand Down Expand Up @@ -297,3 +299,77 @@ func HAR(raw []byte, limit uint32) bool {
func Svg(raw []byte, limit uint32) bool {
return bytes.Contains(raw, []byte("<svg"))
}

// Srt matches a SubRip file. Only the first cue is inspected: its counter
// line, its timing line and the first line of its text.
func Srt(in []byte, _ uint32) bool {
	lines := bufio.NewScanner(bytes.NewReader(in))

	// Line 1: the counter of the first cue, which has to be exactly "1".
	if !lines.Scan() || lines.Text() != "1" {
		return false
	}

	// Line 2: the time range, e.g. "00:02:16,612 --> 00:02:19,376".
	if !lines.Scan() {
		return false
	}
	timing := lines.Text()
	// That format is always 29 characters long, and the fractional seconds
	// must be separated by a comma, never by a period.
	if len(timing) != 29 || strings.Contains(timing, ".") {
		return false
	}
	// `time.Parse` only understands a comma as decimal separator starting
	// with Go 1.17, so the commas are swapped for periods before parsing.
	bounds := strings.Split(strings.ReplaceAll(timing, ",", "."), " --> ")
	if len(bounds) != 2 {
		return false
	}
	const layout = "15:04:05.000"
	start, err := time.Parse(layout, bounds[0])
	if err != nil {
		return false
	}
	end, err := time.Parse(layout, bounds[1])
	if err != nil || start.After(end) {
		return false
	}

	// Line 3: the subtitle text itself; it must be present and non-empty.
	return lines.Scan() && len(lines.Bytes()) > 0
}

// Vtt matches a Web Video Text Tracks (WebVTT) file. See
// https://www.iana.org/assignments/media-types/text/vtt.
//
// The input matches when, after an optional UTF-8 BOM, it starts with the
// string "WEBVTT" and that string is either the whole remaining content or is
// followed by a line feed, a carriage return, a space or a horizontal tab.
func Vtt(raw []byte, limit uint32) bool {
	// Drop the optional UTF-8 byte order mark (at most one).
	body := bytes.TrimPrefix(raw, []byte{0xEF, 0xBB, 0xBF})

	const signature = "WEBVTT"
	if !bytes.HasPrefix(body, []byte(signature)) {
		return false
	}

	// Exact match: nothing follows the signature.
	if len(body) == len(signature) {
		return true
	}

	// Prefix match: the signature must be followed by one of these bytes.
	switch body[len(signature)] {
	case 0x0A, 0x0D, 0x20, 0x09: // line feed, carriage return, space, tab
		return true
	}
	return false
}

0 comments on commit bd68efd

Please sign in to comment.