Skip to content

Commit

Permalink
support ms ooxml, #40
Browse files Browse the repository at this point in the history
  • Loading branch information
kumakichi committed Oct 25, 2018
1 parent 6f0781f commit 1c8a5f9
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 15 deletions.
4 changes: 3 additions & 1 deletion match.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

// Matchers is an alias to matchers.Matchers
var Matchers = matchers.Matchers
// MatcherKeys is an alias to matchers.MatcherKeys
// NOTE(review): this copies the slice value once, when this package is
// initialized. matchers.NewMatcher keeps appending to matchers.MatcherKeys,
// so kinds registered afterwards may not be visible through this variable —
// confirm that matchers added at runtime are still reached by Match.
var MatcherKeys = matchers.MatcherKeys

// NewMatcher is an alias to matchers.NewMatcher
var NewMatcher = matchers.NewMatcher
Expand All @@ -21,7 +22,8 @@ func Match(buf []byte) (types.Type, error) {
return types.Unknown, ErrEmptyBuffer
}

for _, checker := range Matchers {
for _, kind := range MatcherKeys {
checker := Matchers[kind]
match := checker(buf)
if match != types.Unknown && match.Extension != "" {
return match, nil
Expand Down
143 changes: 130 additions & 13 deletions matchers/document.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package matchers

import "bytes"
import (
"bytes"
"encoding/binary"
"fmt"
)

var (
TypeDoc = newType("doc", "application/msword")
Expand All @@ -20,6 +24,18 @@ var Document = Map{
TypePptx: Pptx,
}

// docType enumerates the office document kinds that the helpers in this file
// tell apart.
type docType int

// Values of docType, numbered consecutively from zero via iota.
// TYPE_DOCX, TYPE_XLSX and TYPE_PPTX are produced by checkMSOoml from the
// "word/", "xl/" and "ppt/" entry-name prefixes. TYPE_OOXML is what msooxml
// returns when the archive starts like an OOXML package but none of those
// prefixes was found at the inspected entries.
// NOTE(review): TYPE_DOC is the zero value, so it is also what msooxml's typ
// result holds when found is false; callers must check found first.
const (
TYPE_DOC docType = iota
TYPE_DOCX
TYPE_XLS
TYPE_XLSX
TYPE_PPT
TYPE_PPTX
TYPE_OOXML
)

func Doc(buf []byte) bool {
return len(buf) > 7 &&
buf[0] == 0xD0 && buf[1] == 0xCF &&
Expand All @@ -29,10 +45,8 @@ func Doc(buf []byte) bool {
}

func Docx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x03 && buf[3] == 0x04 &&
bytes.Contains(buf[:256], []byte(TypeDocx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_DOCX
}

func Xls(buf []byte) bool {
Expand All @@ -44,10 +58,8 @@ func Xls(buf []byte) bool {
}

func Xlsx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x03 && buf[3] == 0x04 &&
bytes.Contains(buf[:256], []byte(TypeXlsx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_XLSX
}

func Ppt(buf []byte) bool {
Expand All @@ -59,8 +71,113 @@ func Ppt(buf []byte) bool {
}

func Pptx(buf []byte) bool {
return len(buf) > 3 &&
buf[0] == 0x50 && buf[1] == 0x4B &&
buf[2] == 0x07 && buf[3] == 0x08 &&
bytes.Contains(buf[:256], []byte(TypePptx.MIME.Value))
typ, ok := msooxml(buf)
return ok && typ == TYPE_PPTX
}

// msooxml inspects the leading ZIP local file headers of buf to decide
// whether it holds a Microsoft Office Open XML document and, if so, which
// kind.
//
// found is true when buf is recognised; typ is then TYPE_DOCX, TYPE_PPTX or
// TYPE_XLSX when an entry name under "word/", "ppt/" or "xl/" was located,
// and TYPE_OOXML when the archive starts like an OOXML package but the
// inspected entries do not identify the application. When found is false,
// typ is the zero value and must be ignored.
//
// The function is silent: it writes nothing to stdout and never slices buf
// with an offset it has not range-checked.
func msooxml(buf []byte) (typ docType, found bool) {
	const (
		// nameOffset is where the entry name starts inside a ZIP local file
		// header: 4 signature bytes plus 26 bytes of fixed-size fields.
		nameOffset = 4 + 26
		// scanRange bounds how far each scan for the next header may look.
		scanRange = 6000
	)

	// nextHeader scans for the next local file header signature starting at
	// from and returns its index relative to from, or -1. Offsets are derived
	// from header fields of untrusted input, so out-of-range values are
	// rejected here instead of being used to slice buf.
	nextHeader := func(from int) int {
		if from < 0 || from > len(buf) {
			return -1
		}
		return search(buf, from, scanRange)
	}

	// Start by checking for the ZIP local file header signature.
	if !compareBytes(buf, []byte{'P', 'K', 0x03, 0x04}, 0) {
		return typ, false
	}

	// The first entry may already live in a type-specific directory.
	if kind, ok := checkMSOoml(buf, nameOffset); ok {
		return kind, true
	}

	// Otherwise the first entry has to be one of the well-known package parts.
	// Passing this check also guarantees buf is long enough for the
	// buf[18:22] read below.
	if !compareBytes(buf, []byte("[Content_Types].xml"), nameOffset) &&
		!compareBytes(buf, []byte("_rels/.rels"), nameOffset) {
		return typ, false
	}

	// Skip to the second local file header. Since some documents include a
	// 520-byte extra field following the file header, we need to scan for the
	// next header. Bytes 18-21 hold the first entry's compressed size; it is
	// widened to int before adding 49 (the 30-byte header plus the 19-byte
	// "[Content_Types].xml" name) so the sum cannot wrap around in uint32.
	startOffset := int(binary.LittleEndian.Uint32(buf[18:22])) + 49
	idx := nextHeader(startOffset)
	if idx == -1 {
		return typ, false
	}

	// Now skip to the *third* local file header; again, we need to scan due
	// to a 520-byte extra field following the file header.
	startOffset += idx + nameOffset
	idx = nextHeader(startOffset)
	if idx == -1 {
		return typ, false
	}

	// Check the subdirectory name to determine which type of OOXML file we
	// have. Correct the mimetype with the registered ones:
	// http://technet.microsoft.com/en-us/library/cc179224.aspx
	startOffset += idx + nameOffset
	if kind, ok := checkMSOoml(buf, startOffset); ok {
		return kind, true
	}

	// OpenOffice/LibreOffice order ZIP entries differently, so check the 4th
	// file before settling for the generic answer.
	startOffset += 26
	idx = nextHeader(startOffset)
	if idx == -1 {
		return TYPE_OOXML, true
	}

	startOffset += idx + nameOffset
	if kind, ok := checkMSOoml(buf, startOffset); ok {
		return kind, true
	}
	return TYPE_OOXML, true
}

// compareBytes reports whether subSlice occurs in slice exactly at
// startOffset.
//
// It returns false, rather than panicking, when startOffset is negative or
// when the compared window would run past the end of slice. An empty subSlice
// matches at any offset from 0 through len(slice).
func compareBytes(slice, subSlice []byte, startOffset int) bool {
	// Written as a subtraction so a huge startOffset cannot overflow the
	// bound check; a negative right-hand side simply rejects every offset.
	if startOffset < 0 || startOffset > len(slice)-len(subSlice) {
		return false
	}

	return bytes.Equal(slice[startOffset:startOffset+len(subSlice)], subSlice)
}

// checkMSOoml classifies the entry name located at offset in buf by its
// leading directory: "word/" yields TYPE_DOCX, "ppt/" yields TYPE_PPTX and
// "xl/" yields TYPE_XLSX, each with ok set to true. For any other content ok
// is false and typ is left at its zero value.
func checkMSOoml(buf []byte, offset int) (typ docType, ok bool) {
	if compareBytes(buf, []byte("word/"), offset) {
		return TYPE_DOCX, true
	}
	if compareBytes(buf, []byte("ppt/"), offset) {
		return TYPE_PPTX, true
	}
	if compareBytes(buf, []byte("xl/"), offset) {
		return TYPE_XLSX, true
	}

	// No known prefix: report failure with the untouched zero-valued typ.
	return typ, false
}

// search looks for the ZIP local file header signature ("PK\x03\x04") in the
// window of buf that begins at start and spans at most rangeNum bytes,
// clamped to the end of buf.
//
// It returns the index of the first match relative to start, or -1 when there
// is none. A start outside buf (negative, or at/after len(buf)) and a
// non-positive rangeNum describe an empty window and yield -1 instead of a
// slice-bounds panic; callers derive start from untrusted header fields.
func search(buf []byte, start, rangeNum int) int {
	if start < 0 || start >= len(buf) || rangeNum <= 0 {
		return -1
	}

	// Clamp the window by length rather than by end index so start+rangeNum
	// can never overflow.
	window := len(buf) - start
	if rangeNum < window {
		window = rangeNum
	}

	return bytes.Index(buf[start:start+window], []byte{'P', 'K', 0x03, 0x04})
}
7 changes: 6 additions & 1 deletion matchers/matchers.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package matchers

import "gopkg.in/h2non/filetype.v1/types"
import (
"gopkg.in/h2non/filetype.v1/types"
)

// Internal shortcut to NewType
var newType = types.NewType
Expand All @@ -16,6 +18,7 @@ type TypeMatcher func([]byte) types.Type

// Store registered file type matchers
var Matchers = make(map[types.Type]TypeMatcher)
// MatcherKeys holds the kind of every matcher registered through NewMatcher,
// in the order the registrations happened. It can be ranged over to visit the
// entries of the Matchers map in a fixed sequence.
var MatcherKeys []types.Type

// Create and register a new type matcher function
func NewMatcher(kind types.Type, fn Matcher) TypeMatcher {
Expand All @@ -27,10 +30,12 @@ func NewMatcher(kind types.Type, fn Matcher) TypeMatcher {
}

Matchers[kind] = matcher
MatcherKeys = append(MatcherKeys, kind)
return matcher
}

func register(matchers ...Map) {
MatcherKeys = MatcherKeys[:0]
for _, m := range matchers {
for kind, matcher := range m {
NewMatcher(kind, matcher)
Expand Down

0 comments on commit 1c8a5f9

Please sign in to comment.