decalage2 · christian-intra2net · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
diff --git a/oletools/ftguess.py b/oletools/ftguess.py
@@ -62,6 +62,7 @@
 import olefile
 import logging
 import optparse
+import codecs
 
 # import lxml or ElementTree for XML parsing:
 try:
@@ -290,6 +291,78 @@ def recognize(cls, ftg):
 class FType_Unknown(FType_Base):
     pass
 
+class FType_TEXT(FType_Base):
+    """
+    Try a few popular encodings to detect whether this is just plain text.
+
+    Try the most popular encodings according to wikipedia:
+    https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software
+
+    Maybe should add 'windows-1251' (cyrillic) or 'big5' (Chinese) or other formats popular in Asia?
+
+    Implementation is rather hacky, but we do not need a perfect solution here (which would be to use
+    libmagic) and determining encoding is really not easy.
+    """
+    filetype = FTYPE.TEXT
+    name = 'plain text'
+    longname = 'unclassified plain text'
+    extensions = ['txt',]
+    content_types = ('text/plain',)   # behave like `file` on linux
+    PUID = 'x-fmt/111'
+    # encodings we try to decode the bytes with; from limited to more general
+    ENCODINGS = ('ascii', 'latin1', 'utf8', 'utf-16le', 'utf-16be')
+    CHECK_SIZE = 4096   # do not try to decode megabytes of data, just check the beginning
+
+    @classmethod
+    def recognize(cls, ftg):
+        """
+        Try to determine whether this data makes sense as encoded text.
+
+        If yes, set :py:data:`ftg.text_encoding`.
+
+        A byte order mark decides the encoding right away. Without one, the first `CHECK_SIZE`
+        bytes are decoded with each of `ENCODINGS` in turn; an encoding is accepted if that works
+        and the `repr` of the result has no more `\\x`/`\\u` escapes than 5% of the checked bytes.
+
+        :param ftg: file type guesser object; its `data` attribute (bytes) is inspected
+        :return: True if the data looks like text in one of the known encodings, else False
+        """
+        # a byte order mark settles the encoding, the data then only has to decode cleanly;
+        # the three marks cannot be prefixes of each other, so the order does not matter
+        for bom, encoding in ((codecs.BOM_UTF8, 'utf8'),
+                              (codecs.BOM_UTF16_LE, 'utf-16le'),
+                              (codecs.BOM_UTF16_BE, 'utf-16be')):
+            if not ftg.data.startswith(bom):
+                continue
+            try:
+                ftg.data.decode(encoding, errors='strict')
+            except UnicodeError:
+                return False
+            ftg.text_encoding = encoding
+            return True
+
+        # no BOM? then try to decode the first part using various encodings
+        # could also check if every 2nd byte is zero in 90% of time. If so, this is probably utf16
+        data_size = len(ftg.data)
+        # only CHECK_SIZE bytes are decoded, so the limit must be based on that amount and not on
+        # the full size; otherwise big binary files would always pass the "looks horrible" check
+        max_bad_chars = float(min(data_size, cls.CHECK_SIZE)) * 0.05
+        for encoding in cls.ENCODINGS:
+            try:
+                decoded = ftg.data[:cls.CHECK_SIZE].decode(encoding, errors='strict')
+            except UnicodeError:
+                continue
+            if data_size > cls.CHECK_SIZE:
+                rep = repr(decoded[:-10])  # remove the last characters, may be erroneous due to cutting
+            else:
+                rep = repr(decoded)
+            # e.g. in latin1 everything "is valid" but looks horrible
+            if rep.count(r'\x') + rep.count(r'\u') <= max_bad_chars:
+                ftg.text_encoding = encoding
+                return True
+        return False
+
+
 class FType_RTF(FType_Base):
     container = CONTAINER.RTF
     application = APP.MSWORD
@@ -802,6 +875,8 @@ def __init__(self, filepath=None, data=None):
         # For XML:
         self.root_xmltag = None
         self.xmlroot = None
+        # For TEXT:
+        self.text_encoding = None
 
         if filepath is None and data is None:
             raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
@@ -811,7 +886,7 @@ def __init__(self, filepath=None, data=None):
         self.data_bytesio = io.BytesIO(self.data)
 
         # Identify the main container type:
-        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG):
+        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG, FType_TEXT):
             if ftype.recognize(self):
                 self.ftype = ftype
                 break

diff --git a/oletools/olevba.py b/oletools/olevba.py
@@ -2790,12 +2790,13 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
                 raise FileOpenError(msg)
             # Check if it is a SLK/SYLK file - https://en.wikipedia.org/wiki/SYmbolic_LinK_(SYLK)
             # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel:
-            if data.startswith(b'ID'):
-                self.open_slk(data)
-            # Check if this is a plain text VBA or VBScript file:
-            # To avoid scanning binary files, we simply check for some control chars:
-            if self.type is None and b'\x00' not in data:
-                self.open_text(data)
+            if self.type is None and self.ftg.ftype == ftguess.FType_TEXT:
+                data = bytes2str(data, self.ftg.text_encoding)
+                if data.startswith('ID'):
+                    self.open_slk(data)
+                else:
+                    # Check if this is a plain text VBA or VBScript file:
+                    self.open_text(data)
         if self.type is None:
             # At this stage, could not match a known format:
             msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
@@ -3101,51 +3102,51 @@ def open_ppt(self):
                 log.debug("File appears not to be a ppt file (%s)" % exc)
 
 
-    def open_slk(self, data):
+    def open_slk(self, str_data):
         """
         Open a SLK file, which may contain XLM/Excel 4 macros
-        :param data: file contents in a bytes string
+        :param str_data: file contents in a [unicode] string
         :return: nothing
         """
         # TODO: Those results should be stored as XLM macros, not VBA
         log.info('Opening SLK file %s' % self.filename)
         xlm_macro_found = False
         xlm_macros = []
         xlm_macros.append('Formulas and XLM/Excel 4 macros extracted from SLK file:')
-        for line in data.splitlines(False):
-            if line.startswith(b'O'):
+        for line in str_data.splitlines(False):
+            if line.startswith('O'):
                 # Option: "O;E" indicates a macro sheet, must appear before NN and C rows
-                for s in line.split(b';'):
-                    if s.startswith(b'E'):
+                for s in line.split(';'):
+                    if s.startswith('E'):
                         xlm_macro_found = True
                         log.debug('SLK parser: found macro sheet')
-            elif line.startswith(b'NN') and xlm_macro_found:
+            elif line.startswith('NN') and xlm_macro_found:
                 # Name that can trigger a macro, for example "Auto_Open"
-                for s in line.split(b';'):
-                    if s.startswith(b'N') and s.strip() != b'NN':
-                        xlm_macros.append('Named cell: %s' % bytes2str(s[1:]))
-            elif line.startswith(b'C') and xlm_macro_found:
+                for s in line.split(';'):
+                    if s.startswith('N') and s.strip() != 'NN':
+                        xlm_macros.append('Named cell: %s' % s[1:])
+            elif line.startswith('C') and xlm_macro_found:
                 # Cell
-                for s in line.split(b';'):
-                    if s.startswith(b'E'):
-                        xlm_macros.append('Formula or Macro: %s' % bytes2str(s[1:]))
+                for s in line.split(';'):
+                    if s.startswith('E'):
+                        xlm_macros.append('Formula or Macro: %s' % s[1:])
         if xlm_macro_found:
             self.contains_xlm_macros = True
             self.xlm_macros = xlm_macros
         self.type = TYPE_SLK
 
 
-    def open_text(self, data):
+    def open_text(self, str_data):
         """
         Open a text file containing VBA or VBScript source code
-        :param data: file contents in a string or bytes
+        :param str_data: file contents in a [unicode] string
         :return: nothing
         """
         log.info('Opening text file %s' % self.filename)
         # directly store the source code:
         # On Python 2, store it as a raw bytes string
         # On Python 3, convert it to unicode assuming it was encoded with UTF-8
-        self.vba_code_all_modules = bytes2str(data)
+        self.vba_code_all_modules = str_data
         self.contains_vba_macros = True
         # set type only if parsing succeeds
         self.type = TYPE_TEXT

diff --git a/tests/ftguess/test_basic.py b/tests/ftguess/test_basic.py
@@ -1,7 +1,7 @@
 """Test ftguess"""
 
 import unittest
-import os
+import re
 from os.path import splitext
 from oletools import ftguess
 
@@ -14,7 +14,7 @@ class TestFTGuess(unittest.TestCase):
     """Test ftguess"""
 
     def test_all(self):
-        """Run all files in test-data and compare to known ouput"""
+        """Run all files in test-data and compare to known output"""
         # ftguess knows extension for each FType, create a reverse mapping
         used_types = (
             ftguess.FType_RTF, ftguess.FType_Generic_OLE,
@@ -31,7 +31,7 @@ def test_all(self):
             ftguess.FType_Powerpoint2007_Slideshow,
             ftguess.FType_Powerpoint2007_Macro,
             ftguess.FType_Powerpoint2007_Slideshow_Macro,
-            ftguess.FType_XPS,
+            ftguess.FType_XPS, ftguess.FType_TEXT,
         )
         ftype_for_extension = dict()
         for ftype in used_types:
@@ -45,10 +45,10 @@ def test_all(self):
 
             # determine what we expect...
             before_dot, extension = splitext(filename)
-            if extension == '.zip':
+            if extension == '.zip':    # zipped files are encrypted versions of other files to not alarm virus scanners
                 extension = splitext(before_dot)[1]
             elif filename in ('basic/empty', 'basic/text'):
-                extension = '.csv'    # have just like that
+                extension = '.txt'    # behave as if this were simple plain text
             elif not extension:
                 self.fail('Could not find extension for test sample {0}'
                           .format(filename))
@@ -105,6 +105,19 @@ def test_all(self):
                 self.assertEqual(guess.is_powerpoint(),
                                  extension.startswith('p'))
 
+    def test_encoding(self):
+        """Check whether the encoding of the text samples in test-data/basic is detected correctly"""
+        n_matches = 0
+        for filename, file_contents in loop_over_files(subdir='basic'):
+            match = re.match(r'basic[/\\]test-sample-(ascii|latin1|utf[816_lbe]+)(?:-nobom|-withbom)?\.txt', filename)
+            if not match:
+                continue
+            n_matches += 1
+            expect_encoding = match.group(1).replace('_', '')   # file names use e.g. "utf_16_le"
+            guess = ftguess.ftype_guess(data=file_contents)
+            self.assertEqual(guess.ftype, ftguess.FType_TEXT)
+            self.assertEqual(guess.text_encoding.replace('-', ''), expect_encoding)
+        self.assertGreater(n_matches, 0, 'found no text samples to check encoding detection with')
 
 
 # just in case somebody calls this file as a script

diff --git a/tests/oleid/test_basic.py b/tests/oleid/test_basic.py
@@ -111,7 +111,9 @@ def test_macros(self):
 
             # xlm detection does not work in-memory (yet)
             # --> xlm is "unknown" for excel files, except some encrypted files
-            self.assertIn(value_dict['xlm'], ('Unknown', 'No'))
+            self.assertIn(value_dict['xlm'], ('Unknown', 'No'),
+                          "Unexpected value '{0}' for XLM-content in test sample {1}'"
+                          .format(value_dict['xlm'], filename))
 
             # "macro detection" in text files leads to interesting results:
             if filename in ('ooxml/dde-in-excel2003.xml',    # not really
@@ -121,9 +123,17 @@ def test_macros(self):
                             'oleform/oleform-PR314.docm',
                             'basic/empty',                   # WTF?
                             'basic/text'):                   # no macros!
-                self.assertEqual(value_dict['vba'], 'Yes')
+                self.assertEqual(value_dict['vba'], 'Yes',
+                                 "Unexpected value '{0}' for test sample {1}'"
+                                 .format(value_dict['vba'], filename))
+            elif filename.startswith(join('basic', 'test-sample-')):   # not clear what macro detection should do with text files
+                self.assertIn(value_dict['vba'], ('Yes', 'Error'),
+                              "Unexpected value '{0}' for test sample {1}'"
+                              .format(value_dict['vba'], filename))
             else:
-                self.assertEqual(value_dict['vba'], 'No')
+                self.assertEqual(value_dict['vba'], 'No',
+                                 "Unexpected value '{0}' for test sample {1}'"
+                                 .format(value_dict['vba'], filename))
 
     def test_flash(self):
         """Test indicator for flash."""

diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py
@@ -39,7 +39,7 @@ def test_rough_doctype(self):
 
         # files that are neither OLE nor xml:
         except_files = 'empty', 'text'
-        except_extns = 'rtf', 'csv', 'zip', 'slk'
+        except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt'
 
         # analyse all files in data dir
         # TODO: use testdata_reader to extract real data from zip files

diff --git a/tests/test-data/basic/test-sample-ascii.txt b/tests/test-data/basic/test-sample-ascii.txt
@@ -0,0 +1,2 @@
+Test sample file without special chars or emjois,
+encoded using ascii
diff --git a/tests/test-data/basic/test-sample-latin1.txt b/tests/test-data/basic/test-sample-latin1.txt
@@ -0,0 +1,2 @@
+Test sample file with special chars ����,
+encoded using latin1
diff --git a/tests/test-data/basic/test-sample-utf8-nobom.txt b/tests/test-data/basic/test-sample-utf8-nobom.txt
@@ -0,0 +1,2 @@
+Test sample file with special chars äöüß and emojis 😇🙊,
+encoded using utf8
diff --git a/tests/test-data/basic/test-sample-utf8-withbom.txt b/tests/test-data/basic/test-sample-utf8-withbom.txt
@@ -0,0 +1,2 @@
+Test sample file with special chars äöüß and emojis 😇🙊,
+encoded using utf8
diff --git a/tests/test-data/basic/test-sample-utf_16_be-nobom.txt b/tests/test-data/basic/test-sample-utf_16_be-nobom.txt
diff --git a/tests/test-data/basic/test-sample-utf_16_be-withbom.txt b/tests/test-data/basic/test-sample-utf_16_be-withbom.txt
diff --git a/tests/test-data/basic/test-sample-utf_16_le-nobom.txt b/tests/test-data/basic/test-sample-utf_16_le-nobom.txt
diff --git a/tests/test-data/basic/test-sample-utf_16_le-withbom.txt b/tests/test-data/basic/test-sample-utf_16_le-withbom.txt