diff --git a/filetype/types/document.py b/filetype/types/document.py
index 285f5fe..9f57e98 100644
--- a/filetype/types/document.py
+++ b/filetype/types/document.py
@@ -67,8 +67,10 @@ def match_document(self, buf):
             return
 
-        # Loop through next 3 files and check if they match
+        # Loop through next 4 files and check if they match
+        # NOTE: OpenOffice/LibreOffice order ZIP entries differently, so check the 4th file
+        # https://github.com/h2non/filetype/blob/d730d98ad5c990883148485b6fd5adbdd378364a/matchers/document.go#L134
         idx = 0
-        for i in range(3):
+        for i in range(4):
             # Search for next file header
             idx = self.search_signature(buf, idx + 4, 6000)
             if idx == -1:
@@ -110,7 +112,7 @@ def __init__(self):
 
     def match(self, buf):
         if len(buf) > 515 and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1":
-            if buf[512:515] == b"\xEC\xA5\xC1\x00":
+            if buf[512:516] == b"\xEC\xA5\xC1\x00":
                 return True
             if (
                 len(buf) > 2142
diff --git a/filetype/utils.py b/filetype/utils.py
index 787b4cc..493b1e2 100644
--- a/filetype/utils.py
+++ b/filetype/utils.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+from io import BufferedIOBase
 # Python 2.7 workaround
 try:
     import pathlib
@@ -48,7 +49,8 @@ def get_bytes(obj):
     returning a sliced bytearray.
 
     Args:
-        obj: path to readable, file-like object(with read() method), bytes, bytearray or memoryview
+        obj: path to readable, file-like object(with read() method), bytes,
+        bytearray or memoryview
 
     Returns:
         First 8192 bytes of the file content as bytearray type.
@@ -71,6 +73,16 @@ def get_bytes(obj):
     if isinstance(obj, pathlib.PurePath):
         return get_signature_bytes(obj)
 
+    if isinstance(obj, BufferedIOBase) and obj.seekable():
+        # Rewind so detection always sees the file header, then restore the
+        # caller's position. Non-seekable streams (pipes, sockets) raise on
+        # tell()/seek(), so they fall through to the plain read() below.
+        start_pos = obj.tell()
+        obj.seek(0)
+        magic_bytes = obj.read(_NUM_SIGNATURE_BYTES)
+        obj.seek(start_pos)  # restore reader position
+        return get_bytes(magic_bytes)
+
     if hasattr(obj, 'read'):
         return get_bytes(obj.read(_NUM_SIGNATURE_BYTES))
 
diff --git a/tests/fixtures/sample_1.doc b/tests/fixtures/sample_1.doc
new file mode 100644
index 0000000..e441b28
Binary files /dev/null and b/tests/fixtures/sample_1.doc differ
diff --git a/tests/fixtures/sample_1.docx b/tests/fixtures/sample_1.docx
new file mode 100644
index 0000000..c164ed3
Binary files /dev/null and b/tests/fixtures/sample_1.docx differ
diff --git a/tests/test_types.py b/tests/test_types.py
index 9160ff3..0919aa3 100644
--- a/tests/test_types.py
+++ b/tests/test_types.py
@@ -14,8 +14,14 @@ class TestFileType(unittest.TestCase):
 
     def test_guess_jpeg(self):
         img_path = FIXTURES + '/sample.jpg'
-        for obj in (img_path, open(img_path, 'rb')):
-            kind = filetype.guess(obj)
+        with open(img_path, 'rb') as fp:
+            for obj in (img_path, fp):
+                kind = filetype.guess(obj)
+                self.assertTrue(kind is not None)
+                self.assertEqual(kind.mime, 'image/jpeg')
+                self.assertEqual(kind.extension, 'jpg')
+            # reset reader position test
+            kind = filetype.guess(fp)
             self.assertTrue(kind is not None)
             self.assertEqual(kind.mime, 'image/jpeg')
             self.assertEqual(kind.extension, 'jpg')
@@ -70,16 +76,18 @@ def test_guess_zstd(self):
         self.assertEqual(kind.extension, 'zst')
 
     def test_guess_doc(self):
-        kind = filetype.guess(FIXTURES + '/sample.doc')
-        self.assertIsNotNone(kind)
-        self.assertEqual(kind.mime, 'application/msword')
-        self.assertEqual(kind.extension, 'doc')
+        for name in 'sample.doc', 'sample_1.doc':
+            kind = filetype.guess(os.path.join(FIXTURES, name))
+            self.assertIsNotNone(kind)
+            self.assertEqual(kind.mime, 'application/msword')
+            self.assertEqual(kind.extension, 'doc')
 
     def test_guess_docx(self):
-        kind = filetype.guess(FIXTURES + '/sample.docx')
-        self.assertTrue(kind is not None)
-        self.assertEqual(kind.mime, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
-        self.assertEqual(kind.extension, 'docx')
+        for name in 'sample.docx', 'sample_1.docx':
+            kind = filetype.guess(os.path.join(FIXTURES, name))
+            self.assertTrue(kind is not None)
+            self.assertEqual(kind.mime, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
+            self.assertEqual(kind.extension, 'docx')
 
     def test_guess_odt(self):
         kind = filetype.guess(FIXTURES + '/sample.odt')
diff --git a/tox.ini b/tox.ini
index 349eeb4..9bbd805 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,8 +8,8 @@ envlist = py{27,35,36,37,38,39}, lint, doc, clean
 skip_missing_interpreters = true
 
 [testenv:test]
-deps = pytest-benchmark
-commands = pytest
+deps = pytest
+commands = pytest --ignore=tests/test_benchmark.py
 
 [testenv:lint]
 basepython = python3