From 7eb14b45f60ae34748f982e3cd2141a6ab25e075 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 14 Dec 2023 12:16:53 +0100 Subject: [PATCH 1/3] Add test samples with various text encodings Test-driven development: want to correctly detect these as text in ftguess. Already use future ftguess text type. Since we're at it: slightly improve output of unittest --- tests/ftguess/test_basic.py | 23 ++++++++++++++---- tests/oleid/test_basic.py | 16 +++++++++--- tests/ooxml/test_basic.py | 2 +- tests/test-data/basic/test-sample-ascii.txt | 2 ++ tests/test-data/basic/test-sample-latin1.txt | 2 ++ .../basic/test-sample-utf8-nobom.txt | 2 ++ .../basic/test-sample-utf8-withbom.txt | 2 ++ .../basic/test-sample-utf_16_be-nobom.txt | Bin 0 -> 164 bytes .../basic/test-sample-utf_16_be-withbom.txt | Bin 0 -> 166 bytes .../basic/test-sample-utf_16_le-nobom.txt | Bin 0 -> 164 bytes .../basic/test-sample-utf_16_le-withbom.txt | Bin 0 -> 166 bytes 11 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 tests/test-data/basic/test-sample-ascii.txt create mode 100644 tests/test-data/basic/test-sample-latin1.txt create mode 100644 tests/test-data/basic/test-sample-utf8-nobom.txt create mode 100644 tests/test-data/basic/test-sample-utf8-withbom.txt create mode 100644 tests/test-data/basic/test-sample-utf_16_be-nobom.txt create mode 100644 tests/test-data/basic/test-sample-utf_16_be-withbom.txt create mode 100644 tests/test-data/basic/test-sample-utf_16_le-nobom.txt create mode 100644 tests/test-data/basic/test-sample-utf_16_le-withbom.txt diff --git a/tests/ftguess/test_basic.py b/tests/ftguess/test_basic.py index 3c6311847..944706c7f 100644 --- a/tests/ftguess/test_basic.py +++ b/tests/ftguess/test_basic.py @@ -1,7 +1,7 @@ """Test ftguess""" import unittest -import os +import re from os.path import splitext from oletools import ftguess @@ -14,7 +14,7 @@ class TestFTGuess(unittest.TestCase): """Test ftguess""" def test_all(self): - """Run all files in test-data and 
compare to known ouput""" + """Run all files in test-data and compare to known output""" # ftguess knows extension for each FType, create a reverse mapping used_types = ( ftguess.FType_RTF, ftguess.FType_Generic_OLE, @@ -31,7 +31,7 @@ def test_all(self): ftguess.FType_Powerpoint2007_Slideshow, ftguess.FType_Powerpoint2007_Macro, ftguess.FType_Powerpoint2007_Slideshow_Macro, - ftguess.FType_XPS, + ftguess.FType_XPS, ftguess.FType_TEXT, ) ftype_for_extension = dict() for ftype in used_types: @@ -45,10 +45,10 @@ def test_all(self): # determine what we expect... before_dot, extension = splitext(filename) - if extension == '.zip': + if extension == '.zip': # zipped files are encrypted versions of other files to not alarm virus scanners extension = splitext(before_dot)[1] elif filename in ('basic/empty', 'basic/text'): - extension = '.csv' # have just like that + extension = '.txt' # behave as if this were simple plain text elif not extension: self.fail('Could not find extension for test sample {0}' .format(filename)) @@ -105,6 +105,19 @@ def test_all(self): self.assertEqual(guess.is_powerpoint(), extension.startswith('p')) + def test_encoding(self): + """Check whether text file encoding is detected correctly""" + n_matches = 0 + for filename, file_contents in loop_over_files(subdir='basic'): + match = re.match(r'basic[/\\]test-sample-(ascii|latin1|utf[816_lbe]+)(?:-nobom|-withbom)?\.txt', filename) + if not match: + continue + n_matches += 1 + expect_encoding = match.groups()[0].replace('_', '') + guess = ftguess.ftype_guess(data=file_contents) + self.assertEqual(guess.ftype, ftguess.FType_TEXT) + self.assertEqual(guess.text_encoding.replace('-', ''), expect_encoding) + self.assertGreater(n_matches, 0) # just in case somebody calls this file as a script diff --git a/tests/oleid/test_basic.py b/tests/oleid/test_basic.py index e37ad61ab..cfb2565a5 100644 --- a/tests/oleid/test_basic.py +++ b/tests/oleid/test_basic.py @@ -111,7 +111,9 @@ def test_macros(self): # xlm 
detection does not work in-memory (yet) # --> xlm is "unknown" for excel files, except some encrypted files - self.assertIn(value_dict['xlm'], ('Unknown', 'No')) + self.assertIn(value_dict['xlm'], ('Unknown', 'No'), + "Unexpected value '{0}' for XLM-content in test sample {1}'" + .format(value_dict['xlm'], filename)) # "macro detection" in text files leads to interesting results: if filename in ('ooxml/dde-in-excel2003.xml', # not really @@ -121,9 +123,17 @@ def test_macros(self): 'oleform/oleform-PR314.docm', 'basic/empty', # WTF? 'basic/text'): # no macros! - self.assertEqual(value_dict['vba'], 'Yes') + self.assertEqual(value_dict['vba'], 'Yes', + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['vba'], filename)) + elif filename.startswith(join('basic', 'test-sample-')): # not clear what macro detection should do with text files + self.assertIn(value_dict['vba'], ('Yes', 'Error'), + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['vba'], filename)) else: - self.assertEqual(value_dict['vba'], 'No') + self.assertEqual(value_dict['vba'], 'No', + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['vba'], filename)) def test_flash(self): """Test indicator for flash.""" diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py index e21942b13..90d85d427 100644 --- a/tests/ooxml/test_basic.py +++ b/tests/ooxml/test_basic.py @@ -39,7 +39,7 @@ def test_rough_doctype(self): # files that are neither OLE nor xml: except_files = 'empty', 'text' - except_extns = 'rtf', 'csv', 'zip', 'slk' + except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt' # analyse all files in data dir # TODO: use testdata_reader to extract real data from zip files diff --git a/tests/test-data/basic/test-sample-ascii.txt b/tests/test-data/basic/test-sample-ascii.txt new file mode 100644 index 000000000..700629332 --- /dev/null +++ b/tests/test-data/basic/test-sample-ascii.txt @@ -0,0 +1,2 @@ +Test sample file without special chars or emjois, 
+encoded using ascii diff --git a/tests/test-data/basic/test-sample-latin1.txt b/tests/test-data/basic/test-sample-latin1.txt new file mode 100644 index 000000000..ee5d8d38c --- /dev/null +++ b/tests/test-data/basic/test-sample-latin1.txt @@ -0,0 +1,2 @@ +Test sample file with special chars äöüß, +encoded using latin1 diff --git a/tests/test-data/basic/test-sample-utf8-nobom.txt b/tests/test-data/basic/test-sample-utf8-nobom.txt new file mode 100644 index 000000000..63fa88781 --- /dev/null +++ b/tests/test-data/basic/test-sample-utf8-nobom.txt @@ -0,0 +1,2 @@ +Test sample file with special chars äöüß and emojis 😇🙊, +encoded using utf8 diff --git a/tests/test-data/basic/test-sample-utf8-withbom.txt b/tests/test-data/basic/test-sample-utf8-withbom.txt new file mode 100644 index 000000000..52872f5f2 --- /dev/null +++ b/tests/test-data/basic/test-sample-utf8-withbom.txt @@ -0,0 +1,2 @@ +Test sample file with special chars äöüß and emojis 😇🙊, +encoded using utf8 diff --git a/tests/test-data/basic/test-sample-utf_16_be-nobom.txt b/tests/test-data/basic/test-sample-utf_16_be-nobom.txt new file mode 100644 index 0000000000000000000000000000000000000000..917d2587ecf2770eee71f0beee3a856de8b7c854 GIT binary patch literal 164 zcmXYqF%E)25Jg{S!YTFuHa2!%fZE^tfORizqC20ncGF{>gi7iaQX$4y%Qb$D;AoGqD7Yqpb0 VD>+Hj?yjaL|0+4LV{b1cvIX8@A?g4C literal 0 HcmV?d00001 diff --git a/tests/test-data/basic/test-sample-utf_16_be-withbom.txt b/tests/test-data/basic/test-sample-utf_16_be-withbom.txt new file mode 100644 index 0000000000000000000000000000000000000000..9987b12f6323fe155fbfe38fd403f4fcb2f02b52 GIT binary patch literal 166 zcmXYqF%E)25Jg`{!YTFuHa2!%z}nudvIs_jAcU)0Y2z8JjQT<>X6Enw_h&y}wj{0$ z1bgH{$JtGwr2eZ(HBik&r+4rj^@6HW^(F&PUc7mzA`OKzd9CFnt$$nJGi$?wT&FSH XN~M_`C2Ds^Q=@;C>{zn0?vv23WncV}4Cq2O$ zxzlm86DX6x!r#v4PV?J!gC7DU> ZY@?NCv)8EX9rb$OD%i1NZJqv-@dHBMBk=$L literal 0 HcmV?d00001 From 9560015606b1ec2fbd857c9d8704661c6e85447f Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 14 
Dec 2023 12:18:58 +0100 Subject: [PATCH 2/3] Implement detection of plain text This is not so simple since various text encodings can look rather "binary", but a few simple heuristics will deal with many text types (at least those encountered here in Europe). Of course, all xml is text as well, so use checks for "is this text" only after more specialized tests like "is this xml". --- oletools/ftguess.py | 77 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/oletools/ftguess.py b/oletools/ftguess.py index cd447f91a..e259bf843 100644 --- a/oletools/ftguess.py +++ b/oletools/ftguess.py @@ -62,6 +62,7 @@ import olefile import logging import optparse +import codecs # import lxml or ElementTree for XML parsing: try: @@ -290,6 +291,78 @@ def recognize(cls, ftg): class FType_Unknown(FType_Base): pass +class FType_TEXT(FType_Base): + """ + Try a few popular encodings to detect whether this is just plain text. + + Try the most popular encodings according to wikipedia: + https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software + + Maybe should add 'windows-1251' (cyrillic) or 'big5' (Chinese) or other formats popular in Asia? + + Implementation is rather hacky, but we do not need a perfect solution here (which would be to use + libmagic) and determining encoding is really not easy. + """ + filetype = FTYPE.TEXT + name = 'plain text' + longname = 'unclassified plain text' + extensions = ['txt',] + content_types = ('text/plain',) # behave like `file` on linux + PUID = 'x-fmt/111' + # encodings we try to decode the bytes with; from limited to more general + ENCODINGS = ('ascii', 'latin1', 'utf8', 'utf-16le', 'utf-16be') + CHECK_SIZE = 4096 # do not try to decode megabytes of data, just check the beginning + + @classmethod + def recognize(cls, ftg): + """ + Try to determine whether this data makes sense as encoded text. + + If yes, set :py:data:`ftg.text_encoding`. 
+ """ + # first, try a few simple ones: + if ftg.data.startswith(codecs.BOM_UTF8): + try: + _ = ftg.data.decode('utf8', errors='strict') + ftg.text_encoding = 'utf8' + return True + except UnicodeError: + return False + elif ftg.data.startswith(codecs.BOM_UTF16_LE): + try: + _ = ftg.data.decode('utf-16le', errors='strict') + ftg.text_encoding = 'utf-16le' + return True + except UnicodeError: + return False + elif ftg.data.startswith(codecs.BOM_UTF16_BE): + try: + _ = ftg.data.decode('utf-16be', errors='strict') + ftg.text_encoding = 'utf-16be' + return True + except UnicodeError: + return False + + # no BOM? then try to decode the first part using various encodings + # could also check if every 2nd byte is zero in 90% of time. If so, this is probably utf16 + for encoding in cls.ENCODINGS: + try: + data_size = len(ftg.data) + decoded = codecs.getincrementaldecoder(encoding)(errors='strict').decode(ftg.data[:cls.CHECK_SIZE], final=(data_size <= cls.CHECK_SIZE)) # final=False: a multi-byte char cut at CHECK_SIZE is not an error + if data_size > cls.CHECK_SIZE: + rep = repr(decoded[:-10]) # remove the last characters, may be erroneous due to cutting + else: + rep = repr(decoded) + bad_chars = rep.count(r'\x') + rep.count(r'\u') # e.g. 
in latin1 everything "is valid" but looks horrible + if bad_chars > float(min(data_size, cls.CHECK_SIZE)) * 0.05: # threshold relative to the bytes actually checked, not the whole file + continue + ftg.text_encoding = encoding + return True + except UnicodeError: + pass + return False + + class FType_RTF(FType_Base): container = CONTAINER.RTF application = APP.MSWORD @@ -802,6 +875,8 @@ def __init__(self, filepath=None, data=None): # For XML: self.root_xmltag = None self.xmlroot = None + # For TEXT: + self.text_encoding = None if filepath is None and data is None: raise ValueError('FileTypeGuesser requires either a file path or file data, or both') @@ -811,7 +886,7 @@ def __init__(self, filepath=None, data=None): self.data_bytesio = io.BytesIO(self.data) # Identify the main container type: - for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG): + for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG, FType_TEXT): if ftype.recognize(self): self.ftype = ftype break From 929d2c0cbaeeff43c3f408488563b630ec1f94eb Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 14 Dec 2023 12:55:00 +0100 Subject: [PATCH 3/3] Decode text in olevba before analyzing it --- oletools/olevba.py | 47 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index 52ffd5126..18da7af68 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -2790,12 +2790,13 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D raise FileOpenError(msg) # Check if it is a SLK/SYLK file - https://en.wikipedia.org/wiki/SYmbolic_LinK_(SYLK) # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel: - if data.startswith(b'ID'): - self.open_slk(data) - # Check if this is a plain text VBA or VBScript file: - # To avoid scanning binary files, we simply check for some control chars: - if self.type is None and b'\x00' not in data: - self.open_text(data) + if self.type is None 
and self.ftg.ftype == ftguess.FType_TEXT: + data = bytes2str(data, self.ftg.text_encoding) + if data.startswith('ID'): + self.open_slk(data) + else: + # Check if this is a plain text VBA or VBScript file: + self.open_text(data) if self.type is None: # At this stage, could not match a known format: msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename @@ -3101,10 +3102,10 @@ def open_ppt(self): log.debug("File appears not to be a ppt file (%s)" % exc) - def open_slk(self, data): + def open_slk(self, str_data): """ Open a SLK file, which may contain XLM/Excel 4 macros - :param data: file contents in a bytes string + :param str_data: file contents in a [unicode] string :return: nothing """ # TODO: Those results should be stored as XLM macros, not VBA @@ -3112,40 +3113,40 @@ def open_slk(self, data): xlm_macro_found = False xlm_macros = [] xlm_macros.append('Formulas and XLM/Excel 4 macros extracted from SLK file:') - for line in data.splitlines(False): - if line.startswith(b'O'): + for line in str_data.splitlines(False): + if line.startswith('O'): # Option: "O;E" indicates a macro sheet, must appear before NN and C rows - for s in line.split(b';'): - if s.startswith(b'E'): + for s in line.split(';'): + if s.startswith('E'): xlm_macro_found = True log.debug('SLK parser: found macro sheet') - elif line.startswith(b'NN') and xlm_macro_found: + elif line.startswith('NN') and xlm_macro_found: # Name that can trigger a macro, for example "Auto_Open" - for s in line.split(b';'): - if s.startswith(b'N') and s.strip() != b'NN': - xlm_macros.append('Named cell: %s' % bytes2str(s[1:])) - elif line.startswith(b'C') and xlm_macro_found: + for s in line.split(';'): + if s.startswith('N') and s.strip() != 'NN': + xlm_macros.append('Named cell: %s' % s[1:]) + elif line.startswith('C') and xlm_macro_found: # Cell - for s in line.split(b';'): - if s.startswith(b'E'): - xlm_macros.append('Formula or Macro: %s' % bytes2str(s[1:])) + for s in line.split(';'): 
+ if s.startswith('E'): + xlm_macros.append('Formula or Macro: %s' % s[1:]) if xlm_macro_found: self.contains_xlm_macros = True self.xlm_macros = xlm_macros self.type = TYPE_SLK - def open_text(self, data): + def open_text(self, str_data): """ Open a text file containing VBA or VBScript source code - :param data: file contents in a string or bytes + :param str_data: file contents in a [unicode] string :return: nothing """ log.info('Opening text file %s' % self.filename) # directly store the source code: # On Python 2, store it as a raw bytes string # On Python 3, convert it to unicode assuming it was encoded with UTF-8 - self.vba_code_all_modules = bytes2str(data) + self.vba_code_all_modules = str_data self.contains_vba_macros = True # set type only if parsing succeeds self.type = TYPE_TEXT