Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,19 @@ Archive
- **lz4** - ``application/x-lz4``
- **zstd** - ``application/zstd``

Document
^^^^^^^^

- **doc** - ``application/msword``
- **docx** - ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
- **odt** - ``application/vnd.oasis.opendocument.text``
- **xls** - ``application/vnd.ms-excel``
- **xlsx** - ``application/vnd.openxmlformats-officedocument.spreadsheetml.sheet``
- **ods** - ``application/vnd.oasis.opendocument.spreadsheet``
- **ppt** - ``application/vnd.ms-powerpoint``
- **pptx** - ``application/vnd.openxmlformats-officedocument.presentationml.presentation``
- **odp** - ``application/vnd.oasis.opendocument.presentation``

Font
^^^^

Expand Down
18 changes: 17 additions & 1 deletion filetype/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import absolute_import
from .types import TYPES
from .match import (
image_match, font_match,
image_match, font_match, document_match,
video_match, audio_match, archive_match
)

Expand Down Expand Up @@ -122,3 +122,19 @@ def is_font(obj):
TypeError: if obj is not a supported type.
"""
return font_match(obj) is not None


def is_document(obj):
    """
    Tells whether the given input is recognised as a supported document.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True when one of the document matchers accepts obj, False otherwise.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched_type = document_match(obj)
    return matched_type is not None
18 changes: 18 additions & 0 deletions filetype/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .types import ARCHIVE as archive_matchers
from .types import AUDIO as audio_matchers
from .types import APPLICATION as application_matchers
from .types import DOCUMENT as document_matchers
from .types import FONT as font_matchers
from .types import IMAGE as image_matchers
from .types import VIDEO as video_matchers
Expand Down Expand Up @@ -135,3 +136,20 @@ def application_match(obj):
TypeError: if obj is not a supported type.
"""
return match(obj, application_matchers)


def document_match(obj):
    """
    Runs the given input through the document type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        The matching Type instance, or None when no document matcher
        accepts the input.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched_type = match(obj, document_matchers)
    return matched_type
16 changes: 15 additions & 1 deletion filetype/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from . import archive
from . import audio
from . import application
from . import document
from . import font
from . import image
from . import video
Expand Down Expand Up @@ -98,6 +99,19 @@
application.Wasm(),
)

# Supported document types.
# One matcher instance per format: for word processing, spreadsheet and
# presentation files there is a legacy MS Office (97-2003) entry, an
# Office Open XML (2007+) entry and an OpenDocument entry.
DOCUMENT = (
document.Doc(),
document.Docx(),
document.Odt(),
document.Xls(),
document.Xlsx(),
document.Ods(),
document.Ppt(),
document.Pptx(),
document.Odp(),
)


# Expose supported type matchers
TYPES = list(VIDEO + IMAGE + AUDIO + FONT + ARCHIVE + APPLICATION)
TYPES = list(VIDEO + IMAGE + AUDIO + FONT + DOCUMENT + ARCHIVE + APPLICATION)
254 changes: 254 additions & 0 deletions filetype/types/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import

from .base import Type


class ZippedDocumentBase(Type):
    """
    Common base for document matchers whose container is a ZIP archive.

    Subclasses provide ``match_document``; it is consulted only once the
    buffer is known to begin with a ZIP local file header signature.
    """

    def match(self, buf):
        """
        Checks the ZIP signature and delegates to ``match_document``.

        Args:
            buf: bytes or bytearray holding the start of the file.

        Returns:
            The result of ``match_document`` when the signature is found at
            offset 0. Otherwise None.
        """
        # The local file header signature has to sit at the very first byte
        if self.search_signature(buf, 0, 6000) != 0:
            return

        return self.match_document(buf)

    def match_document(self, buf):
        """
        Format-specific check; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def compare_bytes(self, buf, subslice, start_offset):
        """
        Tells whether ``subslice`` occurs in ``buf`` exactly at
        ``start_offset``.

        Returns:
            False when ``subslice`` would run past the end of ``buf``,
            otherwise the result of comparing the two byte sequences.
        """
        end_offset = start_offset + len(subslice)
        if end_offset > len(buf):
            return False

        return buf[start_offset:end_offset] == subslice

    def search_signature(self, buf, start, rangeNum):
        """
        Looks for the ZIP local file header signature in a window of ``buf``.

        Args:
            buf: bytes or bytearray to scan.
            start: offset at which the search window begins.
            rangeNum: window length; the window is clipped to ``len(buf)``.

        Returns:
            Offset of the first signature found inside the window, or -1
            when the window is empty or holds no signature.
        """
        stop = min(start + rangeNum, len(buf))
        if start >= stop:
            return -1

        # find() reports a miss as -1, the same value index() was mapped to
        return buf.find(b"PK\x03\x04", start, stop)


class OpenDocument(ZippedDocumentBase):
    """
    Shared matcher logic for OpenDocument files (odt, ods, odp).

    The first archive member must be named ``mimetype`` and its content
    must start with the MIME type of the concrete matcher.
    """

    def match_document(self, buf):
        """
        Checks the leading ``mimetype`` member of the archive.

        Args:
            buf: buffer that starts with a ZIP local file header.

        Returns:
            True if the MIME string of this matcher is found right after
            the ``mimetype`` member name, False if it is not, and None
            when the first member is not named ``mimetype``.
        """
        # Check if first file in archive is the identifying file
        # (the member name starts at offset 0x1E, i.e. header + 30)
        if not self.compare_bytes(buf, b"mimetype", 0x1E):
            return

        # Check content of mimetype file if it matches current mime.
        # 0x26 is 0x1E + len("mimetype"); this presumably relies on the
        # member having no extra field and being stored uncompressed.
        # str.encode() is used instead of bytes(text, encoding) because the
        # two-argument bytes() call raises TypeError on Python 2.
        return self.compare_bytes(buf, self.mime.encode("ascii"), 0x26)


class OfficeOpenXml(ZippedDocumentBase):
    """
    Shared matcher logic for Office Open XML (Office 2007+) files.

    The concrete format is recognised from the folder prefix of an archive
    member name: ``word/``, ``ppt/`` or ``xl/``.
    """

    def match_document(self, buf):
        """
        Inspects the first archive member names for a format folder.

        Args:
            buf: buffer that starts with a ZIP local file header.

        Returns:
            True if a member name identifies the MIME type of this
            matcher. Otherwise None.
        """
        # Check if first file in archive is the identifying file
        verdict = self.match_filename(buf, 0x1E)
        if verdict:
            return verdict

        # Otherwise check that the first file is one of these
        leading_names = (b"[Content_Types].xml", b"_rels/.rels", b"docProps")
        if not any(
            self.compare_bytes(buf, name, 0x1E) for name in leading_names
        ):
            return

        # Loop through next 3 files and check if they match
        header_at = 0
        for _ in range(3):
            # Search for next file header, skipping the 4 signature bytes
            # of the header found previously
            header_at = self.search_signature(buf, header_at + 4, 6000)
            if header_at == -1:
                return

            # Filename is at file header + 30
            verdict = self.match_filename(buf, header_at + 30)
            if verdict:
                return verdict

    def match_filename(self, buf, offset):
        """
        Maps the folder prefix found at ``offset`` to an OOXML MIME type.

        Returns:
            True or False depending on whether the recognised folder
            belongs to the MIME type of this matcher, or None when no
            known folder prefix is found at ``offset``.
        """
        folder_mimes = (
            (
                b"word/",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ),
            (
                b"ppt/",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ),
            (
                b"xl/",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ),
        )
        for folder, mime in folder_mimes:
            if self.compare_bytes(buf, folder, offset):
                return self.mime == mime


class Doc(Type):
    """
    Implements the Microsoft Word (Office 97-2003) document type matcher.
    """

    MIME = "application/msword"
    EXTENSION = "doc"

    def __init__(self):
        super(Doc, self).__init__(mime=Doc.MIME, extension=Doc.EXTENSION)

    def match(self, buf):
        """
        Checks whether the buffer looks like a legacy Word document.

        Args:
            buf: bytes or bytearray holding the start of the file.

        Returns:
            True if the 8-byte container signature at offset 0 is followed
            by one of the Word-specific markers. Otherwise False.
        """
        if len(buf) > 515 and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1":
            # The marker is four bytes long, so the slice has to cover
            # offsets 512..515 inclusive; a three-byte slice can never
            # compare equal to it and would disable this check entirely.
            if buf[512:516] == b"\xEC\xA5\xC1\x00":
                return True
            # Fallback: look for the "MSWordDoc" / "Word.Document.8"
            # identification string inside a fixed window of the buffer.
            if (
                len(buf) > 2142
                and b"\x00\x0A\x00\x00\x00MSWordDoc\x00\x10\x00\x00\x00Word.Document.8\x00\xF49\xB2q"
                in buf[2075:2142]
            ):
                return True

        return False


class Docx(OfficeOpenXml):
    """
    Type matcher for Microsoft Word OOXML (Office 2007+) documents.
    """

    MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    EXTENSION = "docx"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Docx, self).__init__(
            extension=Docx.EXTENSION,
            mime=Docx.MIME,
        )


class Odt(OpenDocument):
    """
    Type matcher for OpenDocument Text documents.
    """

    MIME = "application/vnd.oasis.opendocument.text"
    EXTENSION = "odt"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Odt, self).__init__(
            extension=Odt.EXTENSION,
            mime=Odt.MIME,
        )


class Xls(Type):
    """
    Type matcher for Microsoft Excel (Office 97-2003) documents.
    """

    MIME = "application/vnd.ms-excel"
    EXTENSION = "xls"

    def __init__(self):
        super(Xls, self).__init__(
            extension=Xls.EXTENSION,
            mime=Xls.MIME,
        )

    def match(self, buf):
        """
        Checks whether the buffer looks like a legacy Excel document.

        Args:
            buf: bytes or bytearray holding the start of the file.

        Returns:
            True if the 8-byte container signature at offset 0 is followed
            by one of the Excel-specific markers. Otherwise False.
        """
        # 8-byte container signature expected at offset 0
        container_magic = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
        if not (len(buf) > 520 and buf[0:8] == container_magic):
            return False

        # Marker at offset 512, qualified by the byte at offset 518
        if buf[512:516] == b"\xFD\xFF\xFF\xFF" and buf[518] in (0x00, 0x02):
            return True

        # Alternative 8-byte marker at offset 512
        if buf[512:520] == b"\x09\x08\x10\x00\x00\x06\x05\x00":
            return True

        # Last resort: a record ending in "Calc" inside a fixed window
        calc_tag = b"\xE2\x00\x00\x00\x5C\x00\x70\x00\x04\x00\x00Calc"
        return len(buf) > 2095 and calc_tag in buf[1568:2095]


class Xlsx(OfficeOpenXml):
    """
    Type matcher for Microsoft Excel OOXML (Office 2007+) documents.
    """

    MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    EXTENSION = "xlsx"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Xlsx, self).__init__(
            extension=Xlsx.EXTENSION,
            mime=Xlsx.MIME,
        )


class Ods(OpenDocument):
    """
    Type matcher for OpenDocument Spreadsheet documents.
    """

    MIME = "application/vnd.oasis.opendocument.spreadsheet"
    EXTENSION = "ods"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Ods, self).__init__(
            extension=Ods.EXTENSION,
            mime=Ods.MIME,
        )


class Ppt(Type):
    """
    Type matcher for Microsoft PowerPoint (Office 97-2003) documents.
    """

    MIME = "application/vnd.ms-powerpoint"
    EXTENSION = "ppt"

    def __init__(self):
        super(Ppt, self).__init__(
            extension=Ppt.EXTENSION,
            mime=Ppt.MIME,
        )

    def match(self, buf):
        """
        Checks whether the buffer looks like a legacy PowerPoint document.

        Args:
            buf: bytes or bytearray holding the start of the file.

        Returns:
            True if the 8-byte container signature at offset 0 is followed
            by one of the PowerPoint-specific markers. Otherwise False.
        """
        # 8-byte container signature expected at offset 0
        container_magic = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
        if not (len(buf) > 524 and buf[0:8] == container_magic):
            return False

        # Any of these 4-byte markers at offset 512 is sufficient on its own
        standalone_markers = (
            b"\xA0\x46\x1D\xF0",
            b"\x00\x6E\x1E\xF0",
            b"\x0F\x00\xE8\x03",
        )
        if buf[512:516] in standalone_markers:
            return True

        # This marker only counts when offsets 522-523 are both zero
        if buf[512:516] == b"\xFD\xFF\xFF\xFF" and buf[522:524] == b"\x00\x00":
            return True

        # Last resort: the "MS PowerPoint 97" tag at a fixed offset
        powerpoint_tag = b"\x00\xB9\x29\xE8\x11\x00\x00\x00MS PowerPoint 97"
        return len(buf) > 2096 and buf[2072:2096] == powerpoint_tag


class Pptx(OfficeOpenXml):
    """
    Type matcher for Microsoft PowerPoint OOXML (Office 2007+) documents.
    """

    MIME = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    EXTENSION = "pptx"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Pptx, self).__init__(
            extension=Pptx.EXTENSION,
            mime=Pptx.MIME,
        )


class Odp(OpenDocument):
    """
    Type matcher for OpenDocument Presentation documents.
    """

    MIME = "application/vnd.oasis.opendocument.presentation"
    EXTENSION = "odp"

    def __init__(self):
        # Register this matcher under the class-level MIME / extension pair
        super(Odp, self).__init__(
            extension=Odp.EXTENSION,
            mime=Odp.MIME,
        )
Loading