From 7eb14b45f60ae34748f982e3cd2141a6ab25e075 Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Thu, 14 Dec 2023 12:16:53 +0100
Subject: [PATCH 1/3] Add test samples with various text encodings

Test-driven development: ftguess should correctly detect these samples as text.
The tests already reference the upcoming ftguess text type (FType_TEXT), which
is only added by the next patch in this series.

While we're at it: slightly improve the failure output of the unit tests
---
 tests/ftguess/test_basic.py                   |  23 ++++++++++++++----
 tests/oleid/test_basic.py                     |  16 +++++++++---
 tests/ooxml/test_basic.py                     |   2 +-
 tests/test-data/basic/test-sample-ascii.txt   |   2 ++
 tests/test-data/basic/test-sample-latin1.txt  |   2 ++
 .../basic/test-sample-utf8-nobom.txt          |   2 ++
 .../basic/test-sample-utf8-withbom.txt        |   2 ++
 .../basic/test-sample-utf_16_be-nobom.txt     | Bin 0 -> 164 bytes
 .../basic/test-sample-utf_16_be-withbom.txt   | Bin 0 -> 166 bytes
 .../basic/test-sample-utf_16_le-nobom.txt     | Bin 0 -> 164 bytes
 .../basic/test-sample-utf_16_le-withbom.txt   | Bin 0 -> 166 bytes
 11 files changed, 40 insertions(+), 9 deletions(-)
 create mode 100644 tests/test-data/basic/test-sample-ascii.txt
 create mode 100644 tests/test-data/basic/test-sample-latin1.txt
 create mode 100644 tests/test-data/basic/test-sample-utf8-nobom.txt
 create mode 100644 tests/test-data/basic/test-sample-utf8-withbom.txt
 create mode 100644 tests/test-data/basic/test-sample-utf_16_be-nobom.txt
 create mode 100644 tests/test-data/basic/test-sample-utf_16_be-withbom.txt
 create mode 100644 tests/test-data/basic/test-sample-utf_16_le-nobom.txt
 create mode 100644 tests/test-data/basic/test-sample-utf_16_le-withbom.txt

diff --git a/tests/ftguess/test_basic.py b/tests/ftguess/test_basic.py
index 3c6311847..944706c7f 100644
--- a/tests/ftguess/test_basic.py
+++ b/tests/ftguess/test_basic.py
@@ -1,7 +1,7 @@
 """Test ftguess"""
 
 import unittest
-import os
+import re
 from os.path import splitext
 from oletools import ftguess
 
@@ -14,7 +14,7 @@ class TestFTGuess(unittest.TestCase):
     """Test ftguess"""
 
     def test_all(self):
-        """Run all files in test-data and compare to known ouput"""
+        """Run all files in test-data and compare to known output"""
         # ftguess knows extension for each FType, create a reverse mapping
         used_types = (
             ftguess.FType_RTF, ftguess.FType_Generic_OLE,
@@ -31,7 +31,7 @@ def test_all(self):
             ftguess.FType_Powerpoint2007_Slideshow,
             ftguess.FType_Powerpoint2007_Macro,
             ftguess.FType_Powerpoint2007_Slideshow_Macro,
-            ftguess.FType_XPS,
+            ftguess.FType_XPS, ftguess.FType_TEXT,
         )
         ftype_for_extension = dict()
         for ftype in used_types:
@@ -45,10 +45,10 @@ def test_all(self):
 
             # determine what we expect...
             before_dot, extension = splitext(filename)
-            if extension == '.zip':
+            if extension == '.zip':    # zipped files are encrypted versions of other files to not alarm virus scanners
                 extension = splitext(before_dot)[1]
             elif filename in ('basic/empty', 'basic/text'):
-                extension = '.csv'    # have just like that
+                extension = '.txt'    # behave as if this were simple plain text
             elif not extension:
                 self.fail('Could not find extension for test sample {0}'
                           .format(filename))
@@ -105,6 +105,19 @@ def test_all(self):
                 self.assertEqual(guess.is_powerpoint(),
                                  extension.startswith('p'))
 
+    def test_encoding(self):
+        """Check that encoding of the `basic/test-sample-<encoding>[-(no|with)bom].txt` samples is detected"""
+        n_matches = 0
+        for filename, file_contents in loop_over_files(subdir='basic'):
+            match = re.match(r'basic[/\\]test-sample-(ascii|latin1|utf[816_lbe]+)(?:-nobom|-withbom)?\.txt$', filename)
+            if not match:
+                continue
+            n_matches += 1
+            expect_encoding = match.group(1).replace('_', '')   # file names use "utf_16_le", ftguess "utf-16le"
+            guess = ftguess.ftype_guess(data=file_contents)
+            self.assertEqual(guess.ftype, ftguess.FType_TEXT, filename)   # name the failing sample
+            self.assertEqual(guess.text_encoding.replace('-', ''), expect_encoding, filename)
+        self.assertGreater(n_matches, 0)   # guard against silently testing nothing
 
 
 # just in case somebody calls this file as a script
diff --git a/tests/oleid/test_basic.py b/tests/oleid/test_basic.py
index e37ad61ab..cfb2565a5 100644
--- a/tests/oleid/test_basic.py
+++ b/tests/oleid/test_basic.py
@@ -111,7 +111,9 @@ def test_macros(self):
 
             # xlm detection does not work in-memory (yet)
             # --> xlm is "unknown" for excel files, except some encrypted files
-            self.assertIn(value_dict['xlm'], ('Unknown', 'No'))
+            self.assertIn(value_dict['xlm'], ('Unknown', 'No'),
+                          "Unexpected value '{0}' for XLM-content in test sample '{1}'"
+                          .format(value_dict['xlm'], filename))
 
             # "macro detection" in text files leads to interesting results:
             if filename in ('ooxml/dde-in-excel2003.xml',    # not really
@@ -121,9 +123,17 @@ def test_macros(self):
                             'oleform/oleform-PR314.docm',
                             'basic/empty',                   # WTF?
                             'basic/text'):                   # no macros!
-                self.assertEqual(value_dict['vba'], 'Yes')
+                self.assertEqual(value_dict['vba'], 'Yes',
+                                 "Unexpected value '{0}' for test sample '{1}'"
+                                 .format(value_dict['vba'], filename))
+            elif filename.startswith(join('basic', 'test-sample-')):   # not clear what macro detection should do with text files
+                self.assertIn(value_dict['vba'], ('Yes', 'Error'),
+                              "Unexpected value '{0}' for test sample '{1}'"
+                              .format(value_dict['vba'], filename))
             else:
-                self.assertEqual(value_dict['vba'], 'No')
+                self.assertEqual(value_dict['vba'], 'No',
+                                 "Unexpected value '{0}' for test sample '{1}'"
+                                 .format(value_dict['vba'], filename))
 
     def test_flash(self):
         """Test indicator for flash."""
diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py
index e21942b13..90d85d427 100644
--- a/tests/ooxml/test_basic.py
+++ b/tests/ooxml/test_basic.py
@@ -39,7 +39,7 @@ def test_rough_doctype(self):
 
         # files that are neither OLE nor xml:
         except_files = 'empty', 'text'
-        except_extns = 'rtf', 'csv', 'zip', 'slk'
+        except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt'
 
         # analyse all files in data dir
         # TODO: use testdata_reader to extract real data from zip files
diff --git a/tests/test-data/basic/test-sample-ascii.txt b/tests/test-data/basic/test-sample-ascii.txt
new file mode 100644
index 000000000..700629332
--- /dev/null
+++ b/tests/test-data/basic/test-sample-ascii.txt
@@ -0,0 +1,2 @@
+Test sample file without special chars or emjois,
+encoded using ascii
diff --git a/tests/test-data/basic/test-sample-latin1.txt b/tests/test-data/basic/test-sample-latin1.txt
new file mode 100644
index 000000000..ee5d8d38c
--- /dev/null
+++ b/tests/test-data/basic/test-sample-latin1.txt
@@ -0,0 +1,2 @@
+Test sample file with special chars äöüß,
+encoded using latin1
diff --git a/tests/test-data/basic/test-sample-utf8-nobom.txt b/tests/test-data/basic/test-sample-utf8-nobom.txt
new file mode 100644
index 000000000..63fa88781
--- /dev/null
+++ b/tests/test-data/basic/test-sample-utf8-nobom.txt
@@ -0,0 +1,2 @@
+Test sample file with special chars Ã¤Ã¶Ã¼ÃŸ and emojis ðŸ˜‡ðŸ™Š,
+encoded using utf8
diff --git a/tests/test-data/basic/test-sample-utf8-withbom.txt b/tests/test-data/basic/test-sample-utf8-withbom.txt
new file mode 100644
index 000000000..52872f5f2
--- /dev/null
+++ b/tests/test-data/basic/test-sample-utf8-withbom.txt
@@ -0,0 +1,2 @@
+ï»¿Test sample file with special chars Ã¤Ã¶Ã¼ÃŸ and emojis ðŸ˜‡ðŸ™Š,
+encoded using utf8
diff --git a/tests/test-data/basic/test-sample-utf_16_be-nobom.txt b/tests/test-data/basic/test-sample-utf_16_be-nobom.txt
new file mode 100644
index 0000000000000000000000000000000000000000..917d2587ecf2770eee71f0beee3a856de8b7c854
GIT binary patch
literal 164
zcmXYqF%E)25Jg{S!YTFuHa2!%fZE<Hx(Eh=AjGR$N#hx;EO;RnGxK-;`?DNLJQxY~
z$eo_XO`xRytEXzDT8Uoo>^tfORizqC20ncGF{>gi7iaQX$4y%Qb$D;AoGqD7Yqpb0
VD>+Hj?yjaL|0+4LV{b1cvIX8@A?g4C

literal 0
HcmV?d00001

diff --git a/tests/test-data/basic/test-sample-utf_16_be-withbom.txt b/tests/test-data/basic/test-sample-utf_16_be-withbom.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9987b12f6323fe155fbfe38fd403f4fcb2f02b52
GIT binary patch
literal 166
zcmXYqF%E)25Jg`{!YTFuHa2!%z}nudvIs_jAcU)0Y2z8JjQT<>X6Enw_h&y}wj{0$
z1bgH{$JtGwr2eZ(HBik&r+4rj^@6HW^(F&PUc7mzA`OKzd9CFnt$$nJGi$?wT&FSH
XN~M_`C2Ds^Q=@;C>{zn0?<MjdCy*oO

literal 0
HcmV?d00001

diff --git a/tests/test-data/basic/test-sample-utf_16_le-nobom.txt b/tests/test-data/basic/test-sample-utf_16_le-nobom.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e6bde2d722754b14f9335ab13eee30b568fdaf26
GIT binary patch
literal 164
zcmXYrF%E)25Jg{S!YTFuHa2!#fZE<{bP<dKQHWQi!p1XLS@0k>vv23WncV}4Cq2O$
zxzlm86DX<w^pbk2)#%Jl-lJKND(T^5;KP?6Zz<An_DpVTxoR(YK5nLbET%1)N#ks*
Xl~!}msO??!2Hz?<vSV*u{*o~R`K}@K

literal 0
HcmV?d00001

diff --git a/tests/test-data/basic/test-sample-utf_16_le-withbom.txt b/tests/test-data/basic/test-sample-utf_16_le-withbom.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6796e734e1e7a6bd5feafdabdacc52bf5d8517e1
GIT binary patch
literal 166
zcmXYrF%E)25Jg{S!YTFuHa2!#z}ntybP<dKQHWQi!p1XL8TCPIX5Y?#GrQl<mc)&Y
zV2xa8Iok;oRDZfl9p!AaW_$0^EJ&qvbu#ee#hZr|sX2Hiw>6x!r#v4PV?J!gC7DU>
ZY@?NCv)8EX9rb$OD%i1NZJqv-@dHBMBk=$L

literal 0
HcmV?d00001


From 9560015606b1ec2fbd857c9d8704661c6e85447f Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Thu, 14 Dec 2023 12:18:58 +0100
Subject: [PATCH 2/3] Implement detection of plain text

This is not so simple since various text encodings can look rather
"binary", but a few simple heuristics will deal with many text types (at
least those encountered here in Europe).

Of course, all xml is text as well, so use checks for "is this text" only
after more specialized tests like "is this xml".
---
 oletools/ftguess.py | 77 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/oletools/ftguess.py b/oletools/ftguess.py
index cd447f91a..e259bf843 100644
--- a/oletools/ftguess.py
+++ b/oletools/ftguess.py
@@ -62,6 +62,7 @@
 import olefile
 import logging
 import optparse
+import codecs
 
 # import lxml or ElementTree for XML parsing:
 try:
@@ -290,6 +291,78 @@ def recognize(cls, ftg):
 class FType_Unknown(FType_Base):
     pass
 
+class FType_TEXT(FType_Base):
+    """
+    Try a few popular encodings to detect whether this is just plain text.
+
+    Try the most popular encodings according to wikipedia:
+    https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software
+
+    Maybe should add 'windows-1251' (cyrillic) or 'big5' (Chinese) or other formats popular in Asia?
+
+    Implementation is rather hacky, but we do not need a perfect solution here (which would be to use
+    libmagic) and determining encoding is really not easy.
+    """
+    filetype = FTYPE.TEXT
+    name = 'plain text'
+    longname = 'unclassified plain text'
+    extensions = ['txt',]
+    content_types = ('text/plain',)   # behave like `file` on linux
+    PUID = 'x-fmt/111'
+    # encodings we try to decode the bytes with; from limited to more general
+    ENCODINGS = ('ascii', 'latin1', 'utf8', 'utf-16le', 'utf-16be')
+    CHECK_SIZE = 4096   # do not try to decode megabytes of data, just check the beginning
+    # byte-order marks paired with the encoding they announce
+    BOM_ENCODINGS = ((codecs.BOM_UTF8, 'utf8'), (codecs.BOM_UTF16_LE, 'utf-16le'),
+                     (codecs.BOM_UTF16_BE, 'utf-16be'))
+    MAX_BAD_RATIO = 0.05   # tolerated share of escaped (non-printable) chars in decoded text
+
+    @classmethod
+    def _decode_head(cls, data, encoding):
+        """
+        Strictly decode at most CHECK_SIZE bytes of data; raise UnicodeError if that fails.
+        Decodes incrementally so that a multi-byte char cut off by the size limit is not an error.
+        """
+        head = data[:cls.CHECK_SIZE]
+        decoder = codecs.getincrementaldecoder(encoding)(errors='strict')
+        return decoder.decode(head, final=len(data) <= cls.CHECK_SIZE)
+
+    @classmethod
+    def recognize(cls, ftg):
+        """
+        Try to determine whether this data makes sense as encoded text.
+
+        If yes, set :py:data:`ftg.text_encoding`.
+
+        :param ftg: guesser object; its `data` (bytes) is read, `text_encoding` is set on success
+        :returns: True if the data looks like text in one of the known encodings, False otherwise
+        """
+        # first, try a few simple ones: a BOM settles the encoding, all data must then decode with it
+        for bom, encoding in cls.BOM_ENCODINGS:
+            if not ftg.data.startswith(bom):
+                continue
+            try:
+                _ = ftg.data.decode(encoding, errors='strict')
+            except UnicodeError:
+                return False
+            ftg.text_encoding = encoding
+            return True
+
+        # no BOM? then try to decode the first part using various encodings
+        # could also check if every 2nd byte is zero in 90% of time. If so, this is probably utf16
+        for encoding in cls.ENCODINGS:
+            try:
+                rep = repr(cls._decode_head(ftg.data, encoding))
+            except UnicodeError:
+                continue
+            bad_chars = rep.count(r'\x') + rep.count(r'\u')   # e.g. in latin1 everything "is valid" but looks horrible
+            # relate to the size of the part actually inspected, not of the whole (possibly huge) data
+            if bad_chars <= min(len(ftg.data), cls.CHECK_SIZE) * cls.MAX_BAD_RATIO:
+                ftg.text_encoding = encoding
+                return True
+        return False
+
+
 class FType_RTF(FType_Base):
     container = CONTAINER.RTF
     application = APP.MSWORD
@@ -802,6 +875,8 @@ def __init__(self, filepath=None, data=None):
         # For XML:
         self.root_xmltag = None
         self.xmlroot = None
+        # For TEXT:
+        self.text_encoding = None
 
         if filepath is None and data is None:
             raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
@@ -811,7 +886,7 @@ def __init__(self, filepath=None, data=None):
         self.data_bytesio = io.BytesIO(self.data)
 
         # Identify the main container type:
-        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG):
+        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG, FType_TEXT):
             if ftype.recognize(self):
                 self.ftype = ftype
                 break

From 929d2c0cbaeeff43c3f408488563b630ec1f94eb Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Thu, 14 Dec 2023 12:55:00 +0100
Subject: [PATCH 3/3] Decode text in olevba before analyzing it

---
 oletools/olevba.py | 47 +++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/oletools/olevba.py b/oletools/olevba.py
index 52ffd5126..18da7af68 100644
--- a/oletools/olevba.py
+++ b/oletools/olevba.py
@@ -2790,12 +2790,13 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
                 raise FileOpenError(msg)
             # Check if it is a SLK/SYLK file - https://en.wikipedia.org/wiki/SYmbolic_LinK_(SYLK)
             # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel:
-            if data.startswith(b'ID'):
-                self.open_slk(data)
-            # Check if this is a plain text VBA or VBScript file:
-            # To avoid scanning binary files, we simply check for some control chars:
-            if self.type is None and b'\x00' not in data:
-                self.open_text(data)
+            if self.type is None and self.ftg.ftype == ftguess.FType_TEXT:
+                data = bytes2str(data, self.ftg.text_encoding)
+                if data.startswith('ID'):
+                    self.open_slk(data)
+                else:
+                    # Check if this is a plain text VBA or VBScript file:
+                    self.open_text(data)
         if self.type is None:
             # At this stage, could not match a known format:
             msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
@@ -3101,10 +3102,10 @@ def open_ppt(self):
                 log.debug("File appears not to be a ppt file (%s)" % exc)
 
 
-    def open_slk(self, data):
+    def open_slk(self, str_data):
         """
         Open a SLK file, which may contain XLM/Excel 4 macros
-        :param data: file contents in a bytes string
+        :param str_data: file contents in a [unicode] string
         :return: nothing
         """
         # TODO: Those results should be stored as XLM macros, not VBA
@@ -3112,40 +3113,40 @@ def open_slk(self, data):
         xlm_macro_found = False
         xlm_macros = []
         xlm_macros.append('Formulas and XLM/Excel 4 macros extracted from SLK file:')
-        for line in data.splitlines(False):
-            if line.startswith(b'O'):
+        for line in str_data.splitlines(False):
+            if line.startswith('O'):
                 # Option: "O;E" indicates a macro sheet, must appear before NN and C rows
-                for s in line.split(b';'):
-                    if s.startswith(b'E'):
+                for s in line.split(';'):
+                    if s.startswith('E'):
                         xlm_macro_found = True
                         log.debug('SLK parser: found macro sheet')
-            elif line.startswith(b'NN') and xlm_macro_found:
+            elif line.startswith('NN') and xlm_macro_found:
                 # Name that can trigger a macro, for example "Auto_Open"
-                for s in line.split(b';'):
-                    if s.startswith(b'N') and s.strip() != b'NN':
-                        xlm_macros.append('Named cell: %s' % bytes2str(s[1:]))
-            elif line.startswith(b'C') and xlm_macro_found:
+                for s in line.split(';'):
+                    if s.startswith('N') and s.strip() != 'NN':
+                        xlm_macros.append('Named cell: %s' % s[1:])
+            elif line.startswith('C') and xlm_macro_found:
                 # Cell
-                for s in line.split(b';'):
-                    if s.startswith(b'E'):
-                        xlm_macros.append('Formula or Macro: %s' % bytes2str(s[1:]))
+                for s in line.split(';'):
+                    if s.startswith('E'):
+                        xlm_macros.append('Formula or Macro: %s' % s[1:])
         if xlm_macro_found:
             self.contains_xlm_macros = True
             self.xlm_macros = xlm_macros
         self.type = TYPE_SLK
 
 
-    def open_text(self, data):
+    def open_text(self, str_data):
         """
         Open a text file containing VBA or VBScript source code
-        :param data: file contents in a string or bytes
+        :param str_data: file contents in a [unicode] string
         :return: nothing
         """
         log.info('Opening text file %s' % self.filename)
         # directly store the source code:
         # On Python 2, store it as a raw bytes string
         # On Python 3, convert it to unicode assuming it was encoded with UTF-8
-        self.vba_code_all_modules = bytes2str(data)
+        self.vba_code_all_modules = str_data
         self.contains_vba_macros = True
         # set type only if parsing succeeds
         self.type = TYPE_TEXT