Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support ms ooxml #48

Merged
merged 4 commits into from
Oct 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
language: go

go:
- "1.11"
- "1.10"
- "1.9"
- "1.8"
- "1.7"
- "tip"

before_install:
- go get -u -v github.com/golang/lint/golint
- go get -u -v golang.org/x/lint/golint

script:
- diff -u <(echo -n) <(gofmt -s -d ./)
Expand Down
4 changes: 2 additions & 2 deletions filetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ package filetype
import (
"errors"

"gopkg.in/h2non/filetype.v1/matchers"
"gopkg.in/h2non/filetype.v1/types"
"github.com/h2non/filetype/matchers"
"github.com/h2non/filetype/types"
)

// Types stores a map of supported types
Expand Down
2 changes: 1 addition & 1 deletion filetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package filetype
import (
"testing"

"gopkg.in/h2non/filetype.v1/types"
"github.com/h2non/filetype/types"
)

func TestIs(t *testing.T) {
Expand Down
Binary file added fixtures/sample.docx
Binary file not shown.
Binary file added fixtures/sample.pptx
Binary file not shown.
Binary file added fixtures/sample.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions kind.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package filetype

import (
"gopkg.in/h2non/filetype.v1/matchers"
"gopkg.in/h2non/filetype.v1/types"
"github.com/h2non/filetype/matchers"
"github.com/h2non/filetype/types"
)

// Image tries to match a file as image type
Expand Down
12 changes: 8 additions & 4 deletions match.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@ import (
"io"
"os"

"gopkg.in/h2non/filetype.v1/matchers"
"gopkg.in/h2non/filetype.v1/types"
"github.com/h2non/filetype/matchers"
"github.com/h2non/filetype/types"
)

// Matchers is an alias to matchers.Matchers
var Matchers = matchers.Matchers

// MatcherKeys is an alias to matchers.MatcherKeys.
// NOTE(review): this copies the slice value once, when the package is
// initialised; matchers.NewMatcher reassigns matchers.MatcherKeys via append,
// so types registered afterwards may not be visible through this variable —
// confirm whether callers are expected to read matchers.MatcherKeys instead.
var MatcherKeys = matchers.MatcherKeys

// NewMatcher is an alias to matchers.NewMatcher
var NewMatcher = matchers.NewMatcher

Expand All @@ -21,7 +24,8 @@ func Match(buf []byte) (types.Type, error) {
return types.Unknown, ErrEmptyBuffer
}

for _, checker := range Matchers {
for _, kind := range MatcherKeys {
checker := Matchers[kind]
match := checker(buf)
if match != types.Unknown && match.Extension != "" {
return match, nil
Expand Down Expand Up @@ -49,7 +53,7 @@ func MatchFile(filepath string) (types.Type, error) {

// MatchReader is convenient wrapper to Match() any Reader
func MatchReader(reader io.Reader) (types.Type, error) {
buffer := make([]byte, 512)
buffer := make([]byte, 4096) // just make msooxml test happy, but 4096 bytes maybe not enough to determine the real type

_, err := reader.Read(buffer)
if err != nil && err != io.EOF {
Expand Down
30 changes: 27 additions & 3 deletions match_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import (
"io/ioutil"
"testing"

"gopkg.in/h2non/filetype.v1/matchers"
"gopkg.in/h2non/filetype.v1/types"
"github.com/h2non/filetype/matchers"
"github.com/h2non/filetype/types"
)

func TestMatch(t *testing.T) {
Expand Down Expand Up @@ -43,12 +43,15 @@ func TestMatchFile(t *testing.T) {
{"tar"},
{"tif"},
{"mp4"},
{"docx"},
{"pptx"},
{"xlsx"},
}

for _, test := range cases {
kind, _ := MatchFile("./fixtures/sample." + test.ext)
if kind.Extension != test.ext {
t.Fatalf("Invalid image type: %s != %s", kind.Extension, test.ext)
t.Fatalf("Invalid type: %s != %s", kind.Extension, test.ext)
}
}
}
Expand Down Expand Up @@ -155,6 +158,9 @@ var zipBuffer, _ = ioutil.ReadFile("./fixtures/sample.zip")
var jpgBuffer, _ = ioutil.ReadFile("./fixtures/sample.jpg")
var gifBuffer, _ = ioutil.ReadFile("./fixtures/sample.gif")
var pngBuffer, _ = ioutil.ReadFile("./fixtures/sample.png")
var xlsxBuffer, _ = ioutil.ReadFile("./fixtures/sample.xlsx")
var pptxBuffer, _ = ioutil.ReadFile("./fixtures/sample.pptx")
var docxBuffer, _ = ioutil.ReadFile("./fixtures/sample.docx")

func BenchmarkMatchTar(b *testing.B) {
for n := 0; n < b.N; n++ {
Expand Down Expand Up @@ -185,3 +191,21 @@ func BenchmarkMatchPng(b *testing.B) {
Match(pngBuffer)
}
}

// BenchmarkMatchXlsx measures Match against the contents of xlsxBuffer.
func BenchmarkMatchXlsx(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_, _ = Match(xlsxBuffer) // result is irrelevant; only the cost is measured
	}
}

// BenchmarkMatchPptx measures Match against the contents of pptxBuffer.
func BenchmarkMatchPptx(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_, _ = Match(pptxBuffer) // result is irrelevant; only the cost is measured
	}
}

// BenchmarkMatchDocx measures Match against the contents of docxBuffer.
func BenchmarkMatchDocx(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_, _ = Match(docxBuffer) // result is irrelevant; only the cost is measured
	}
}
142 changes: 129 additions & 13 deletions matchers/document.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package matchers

import "bytes"
import (
"bytes"
"encoding/binary"
)

var (
TypeDoc = newType("doc", "application/msword")
Expand All @@ -20,6 +23,18 @@ var Document = Map{
TypePptx: Pptx,
}

// docType identifies the document format reported by the msooxml helper.
type docType int

// Document kinds, numbered consecutively from zero via iota. The order is
// significant: the zero value of docType is TYPE_DOC.
const (
// TYPE_DOC is the zero value; presumably the legacy Word format — not
// produced by msooxml in the code visible here.
TYPE_DOC docType = iota
// TYPE_DOCX is reported when a "word/" entry is found.
TYPE_DOCX
// TYPE_XLS is presumably the legacy Excel format — TODO confirm usage.
TYPE_XLS
// TYPE_XLSX is reported when an "xl/" entry is found.
TYPE_XLSX
// TYPE_PPT is presumably the legacy PowerPoint format — TODO confirm usage.
TYPE_PPT
// TYPE_PPTX is reported when a "ppt/" entry is found.
TYPE_PPTX
// TYPE_OOXML is the fallback msooxml returns when the archive looks like an
// OOXML package but none of the known directory prefixes was found.
TYPE_OOXML
)

func Doc(buf []byte) bool {
return len(buf) > 7 &&
buf[0] == 0xD0 && buf[1] == 0xCF &&
Expand All @@ -29,10 +44,8 @@ func Doc(buf []byte) bool {
}

func Docx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x03 && buf[3] == 0x04 &&
bytes.Contains(buf[:256], []byte(TypeDocx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_DOCX
}

func Xls(buf []byte) bool {
Expand All @@ -44,10 +57,8 @@ func Xls(buf []byte) bool {
}

func Xlsx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x03 && buf[3] == 0x04 &&
bytes.Contains(buf[:256], []byte(TypeXlsx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_XLSX
}

func Ppt(buf []byte) bool {
Expand All @@ -59,8 +70,113 @@ func Ppt(buf []byte) bool {
}

func Pptx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x07 && buf[3] == 0x08 &&
bytes.Contains(buf[:256], []byte(TypePptx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_PPTX
}

// msooxml inspects buf, the leading bytes of a file, and reports which kind
// of Microsoft Office Open XML document it appears to be.
//
// found is false when buf does not start with a ZIP local file header, when
// the first entry is not one of the expected names, or when the second or
// third local file header cannot be located within the scanned window. When
// found is true, typ is TYPE_DOCX, TYPE_PPTX or TYPE_XLSX if a matching
// directory prefix was seen, and TYPE_OOXML otherwise.
func msooxml(buf []byte) (typ docType, found bool) {
	signature := []byte{'P', 'K', 0x03, 0x04}

	// start by checking for ZIP local file header signature
	if !compareBytes(buf, signature, 0) {
		return 0, false
	}

	// make sure the first file is correct; 0x1E (30) is where the first
	// entry's file name begins, right after the fixed-size local file header
	if kind, ok := checkMSOoml(buf, 0x1E); ok {
		return kind, true
	}

	if !compareBytes(buf, []byte("[Content_Types].xml"), 0x1E) && !compareBytes(buf, []byte("_rels/.rels"), 0x1E) {
		return 0, false
	}

	// skip to the second local file header
	// since some documents include a 520-byte extra field following the file
	// header, we need to scan for the next header.
	// buf[18:22] is safe here: the name comparison above only succeeds when
	// buf holds at least 41 bytes. The sum is computed in uint32 (as before),
	// so it is the conversion to int that can go negative where int is 32 bits.
	startOffset := int(binary.LittleEndian.Uint32(buf[18:22]) + 49)
	if startOffset < 0 {
		// a negative offset would make the helpers slice buf with a negative
		// index and panic; such a header cannot describe data inside buf
		return 0, false
	}
	idx := search(buf, startOffset, 6000)
	if idx == -1 {
		return 0, false
	}

	// now skip to the *third* local file header; again, we need to scan due to a
	// 520-byte extra field following the file header
	startOffset += idx + 4 + 26
	idx = search(buf, startOffset, 6000)
	if idx == -1 {
		return 0, false
	}

	// and check the subdirectory name to determine which type of OOXML
	// file we have. Correct the mimetype with the registered ones:
	// http://technet.microsoft.com/en-us/library/cc179224.aspx
	startOffset += idx + 4 + 26
	if kind, ok := checkMSOoml(buf, startOffset); ok {
		return kind, true
	}

	// OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
	startOffset += 26
	idx = search(buf, startOffset, 6000)
	if idx == -1 {
		return TYPE_OOXML, true
	}

	startOffset += idx + 4 + 26
	if kind, ok := checkMSOoml(buf, startOffset); ok {
		return kind, true
	}
	return TYPE_OOXML, true
}

// compareBytes reports whether subSlice occurs in slice starting exactly at
// startOffset.
//
// It returns false, rather than panicking, when startOffset is negative or
// when subSlice would extend past the end of slice. An empty subSlice matches
// at any offset from 0 to len(slice) inclusive.
func compareBytes(slice, subSlice []byte, startOffset int) bool {
	sl := len(subSlice)

	// Written as a subtraction so that a very large startOffset cannot
	// overflow the comparison.
	if startOffset < 0 || startOffset > len(slice)-sl {
		return false
	}

	return bytes.Equal(slice[startOffset:startOffset+sl], subSlice)
}

// checkMSOoml looks at the bytes of buf beginning at offset and maps the
// directory prefix found there to a document kind: "word/" yields TYPE_DOCX,
// "ppt/" yields TYPE_PPTX and "xl/" yields TYPE_XLSX. ok is false, and typ is
// left at its zero value, when none of the prefixes is present.
func checkMSOoml(buf []byte, offset int) (typ docType, ok bool) {
	if compareBytes(buf, []byte("word/"), offset) {
		return TYPE_DOCX, true
	}
	if compareBytes(buf, []byte("ppt/"), offset) {
		return TYPE_PPTX, true
	}
	if compareBytes(buf, []byte("xl/"), offset) {
		return TYPE_XLSX, true
	}
	return typ, false
}

// search scans at most rangeNum bytes of buf, beginning at index start, for
// the ZIP local file header signature "PK\x03\x04".
//
// It returns the index of the first occurrence relative to start, or -1 when
// the signature is not found. It also returns -1, rather than panicking, when
// start is negative or beyond the end of buf, or when rangeNum is not
// positive.
func search(buf []byte, start, rangeNum int) int {
	length := len(buf)
	if start < 0 || rangeNum <= 0 || start >= length {
		return -1
	}

	// Clamp the window to the buffer; comparing against length-start avoids
	// overflowing start+rangeNum.
	end := length
	if rangeNum < length-start {
		end = start + rangeNum
	}

	signature := []byte{'P', 'K', 0x03, 0x04}
	return bytes.Index(buf[start:end], signature)
}
7 changes: 6 additions & 1 deletion matchers/matchers.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package matchers

import "gopkg.in/h2non/filetype.v1/types"
import (
"github.com/h2non/filetype/types"
)

// Internal shortcut to NewType
var newType = types.NewType
Expand All @@ -16,6 +18,7 @@ type TypeMatcher func([]byte) types.Type

// Matchers stores the registered file type matchers, keyed by the type each
// one was registered for.
var Matchers = make(map[types.Type]TypeMatcher)
// MatcherKeys lists the registered types; NewMatcher appends each kind to it
// when the kind is added to Matchers.
var MatcherKeys []types.Type

// Create and register a new type matcher function
func NewMatcher(kind types.Type, fn Matcher) TypeMatcher {
Expand All @@ -27,10 +30,12 @@ func NewMatcher(kind types.Type, fn Matcher) TypeMatcher {
}

Matchers[kind] = matcher
MatcherKeys = append(MatcherKeys, kind)
return matcher
}

func register(matchers ...Map) {
MatcherKeys = MatcherKeys[:0]
for _, m := range matchers {
for kind, matcher := range m {
NewMatcher(kind, matcher)
Expand Down