Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recognize txt #836

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
77 changes: 76 additions & 1 deletion oletools/ftguess.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import olefile
import logging
import optparse
import codecs

# import lxml or ElementTree for XML parsing:
try:
Expand Down Expand Up @@ -290,6 +291,78 @@ def recognize(cls, ftg):
class FType_Unknown(FType_Base):
pass

class FType_TEXT(FType_Base):
    """
    Try a few popular encodings to detect whether this is just plain text.

    Try the most popular encodings according to wikipedia:
    https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software

    Maybe should add 'windows-1251' (cyrillic) or 'big5' (Chinese) or other formats popular in Asia?

    Implementation is rather hacky, but we do not need a perfect solution here (which would be to use
    libmagic) and determining encoding is really not easy.
    """
    filetype = FTYPE.TEXT
    name = 'plain text'
    longname = 'unclassified plain text'
    extensions = ['txt',]
    content_types = ('text/plain',)   # behave like `file` on linux
    PUID = 'x-fmt/111'
    # Encodings we try to decode the bytes with, from strict to permissive.
    # 'utf8' must come before 'latin1': latin1 accepts *every* byte sequence, so
    # UTF-8 text with only a few accented characters would otherwise be labelled
    # latin1 and later be decoded into mojibake.
    ENCODINGS = ('ascii', 'utf8', 'latin1', 'utf-16le', 'utf-16be')
    CHECK_SIZE = 4096    # do not try to decode megabytes of data, just check the beginning
    # Byte-order marks that settle the encoding without any guessing
    BOM_ENCODINGS = (
        (codecs.BOM_UTF8, 'utf8'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'utf-16be'),
    )
    # Maximum share of escaped (non-printable) characters tolerated in the checked part
    MAX_BAD_RATIO = 0.05

    @classmethod
    def recognize(cls, ftg):
        """
        Try to determine whether this data makes sense as encoded text.

        If yes, set :py:data:`ftg.text_encoding`.

        :param ftg: file type guesser object; its bytes attribute `data` is read
        :return: True if the data looks like text in one of the known encodings,
                 False otherwise (`ftg.text_encoding` is left untouched then)
        """
        data = ftg.data

        # first, try a few simple ones: a BOM announces the encoding, so the
        # whole data either decodes strictly with it or is not text at all
        for bom, encoding in cls.BOM_ENCODINGS:
            if data.startswith(bom):
                try:
                    data.decode(encoding, errors='strict')
                except UnicodeError:
                    return False
                ftg.text_encoding = encoding
                return True

        # no BOM? then try to decode the first part using various encodings
        # could also check if every 2nd byte is zero in 90% of time. If so, this is probably utf16
        chunk = data[:cls.CHECK_SIZE]
        is_truncated = len(data) > cls.CHECK_SIZE
        # The threshold must relate to the part we actually inspect. Using the
        # full data size would make it unreachable for large inputs.
        max_bad_chars = float(len(chunk)) * cls.MAX_BAD_RATIO

        for encoding in cls.ENCODINGS:
            # An incremental decoder with final=False buffers a multi-byte
            # sequence that was cut in half by the slicing above, instead of
            # raising and thereby rejecting perfectly valid text.
            decoder = codecs.getincrementaldecoder(encoding)(errors='strict')
            try:
                decoded = decoder.decode(chunk, final=not is_truncated)
            except UnicodeError:
                continue
            rep = repr(decoded)
            # e.g. in latin1 everything "is valid" but looks horrible
            bad_chars = rep.count(r'\x') + rep.count(r'\u')
            if bad_chars > max_bad_chars:
                continue
            ftg.text_encoding = encoding
            return True
        return False


class FType_RTF(FType_Base):
container = CONTAINER.RTF
application = APP.MSWORD
Expand Down Expand Up @@ -802,6 +875,8 @@ def __init__(self, filepath=None, data=None):
# For XML:
self.root_xmltag = None
self.xmlroot = None
# For TEXT:
self.text_encoding = None

if filepath is None and data is None:
raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
Expand All @@ -811,7 +886,7 @@ def __init__(self, filepath=None, data=None):
self.data_bytesio = io.BytesIO(self.data)

# Identify the main container type:
for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG):
for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG, FType_TEXT):
if ftype.recognize(self):
self.ftype = ftype
break
Expand Down
47 changes: 24 additions & 23 deletions oletools/olevba.py
Original file line number Diff line number Diff line change
Expand Up @@ -2790,12 +2790,13 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
raise FileOpenError(msg)
# Check if it is a SLK/SYLK file - https://en.wikipedia.org/wiki/SYmbolic_LinK_(SYLK)
# It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel:
if data.startswith(b'ID'):
self.open_slk(data)
# Check if this is a plain text VBA or VBScript file:
# To avoid scanning binary files, we simply check for some control chars:
if self.type is None and b'\x00' not in data:
self.open_text(data)
if self.type is None and self.ftg.ftype == ftguess.FType_TEXT:
data = bytes2str(data, self.ftg.text_encoding)
if data.startswith('ID'):
self.open_slk(data)
else:
# Check if this is a plain text VBA or VBScript file:
self.open_text(data)
if self.type is None:
# At this stage, could not match a known format:
msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
Expand Down Expand Up @@ -3101,51 +3102,51 @@ def open_ppt(self):
log.debug("File appears not to be a ppt file (%s)" % exc)


def open_slk(self, data):
def open_slk(self, str_data):
"""
Open a SLK file, which may contain XLM/Excel 4 macros
:param data: file contents in a bytes string
:param str_data: file contents in a [unicode] string
:return: nothing
"""
# TODO: Those results should be stored as XLM macros, not VBA
log.info('Opening SLK file %s' % self.filename)
xlm_macro_found = False
xlm_macros = []
xlm_macros.append('Formulas and XLM/Excel 4 macros extracted from SLK file:')
for line in data.splitlines(False):
if line.startswith(b'O'):
for line in str_data.splitlines(False):
if line.startswith('O'):
# Option: "O;E" indicates a macro sheet, must appear before NN and C rows
for s in line.split(b';'):
if s.startswith(b'E'):
for s in line.split(';'):
if s.startswith('E'):
xlm_macro_found = True
log.debug('SLK parser: found macro sheet')
elif line.startswith(b'NN') and xlm_macro_found:
elif line.startswith('NN') and xlm_macro_found:
# Name that can trigger a macro, for example "Auto_Open"
for s in line.split(b';'):
if s.startswith(b'N') and s.strip() != b'NN':
xlm_macros.append('Named cell: %s' % bytes2str(s[1:]))
elif line.startswith(b'C') and xlm_macro_found:
for s in line.split(';'):
if s.startswith('N') and s.strip() != 'NN':
xlm_macros.append('Named cell: %s' % s[1:])
elif line.startswith('C') and xlm_macro_found:
# Cell
for s in line.split(b';'):
if s.startswith(b'E'):
xlm_macros.append('Formula or Macro: %s' % bytes2str(s[1:]))
for s in line.split(';'):
if s.startswith('E'):
xlm_macros.append('Formula or Macro: %s' % s[1:])
if xlm_macro_found:
self.contains_xlm_macros = True
self.xlm_macros = xlm_macros
self.type = TYPE_SLK


def open_text(self, data):
def open_text(self, str_data):
"""
Open a text file containing VBA or VBScript source code
:param data: file contents in a string or bytes
:param str_data: file contents in a [unicode] string
:return: nothing
"""
log.info('Opening text file %s' % self.filename)
# directly store the source code:
# On Python 2, store it as a raw bytes string
# On Python 3, convert it to unicode assuming it was encoded with UTF-8
self.vba_code_all_modules = bytes2str(data)
self.vba_code_all_modules = str_data
self.contains_vba_macros = True
# set type only if parsing succeeds
self.type = TYPE_TEXT
Expand Down
23 changes: 18 additions & 5 deletions tests/ftguess/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Test ftguess"""

import unittest
import os
import re
from os.path import splitext
from oletools import ftguess

Expand All @@ -14,7 +14,7 @@ class TestFTGuess(unittest.TestCase):
"""Test ftguess"""

def test_all(self):
"""Run all files in test-data and compare to known ouput"""
"""Run all files in test-data and compare to known output"""
# ftguess knows extension for each FType, create a reverse mapping
used_types = (
ftguess.FType_RTF, ftguess.FType_Generic_OLE,
Expand All @@ -31,7 +31,7 @@ def test_all(self):
ftguess.FType_Powerpoint2007_Slideshow,
ftguess.FType_Powerpoint2007_Macro,
ftguess.FType_Powerpoint2007_Slideshow_Macro,
ftguess.FType_XPS,
ftguess.FType_XPS, ftguess.FType_TEXT,
)
ftype_for_extension = dict()
for ftype in used_types:
Expand All @@ -45,10 +45,10 @@ def test_all(self):

# determine what we expect...
before_dot, extension = splitext(filename)
if extension == '.zip':
if extension == '.zip': # zipped files are encrypted versions of other files to not alarm virus scanners
extension = splitext(before_dot)[1]
elif filename in ('basic/empty', 'basic/text'):
extension = '.csv' # have just like that
extension = '.txt' # behave as if this were simple plain text
elif not extension:
self.fail('Could not find extension for test sample {0}'
.format(filename))
Expand Down Expand Up @@ -105,6 +105,19 @@ def test_all(self):
self.assertEqual(guess.is_powerpoint(),
extension.startswith('p'))

def test_encoding(self):
    """
    Check whether text file encoding is detected correctly.

    The expected encoding is taken from the sample's file name
    (`test-sample-<encoding>[-nobom|-withbom].txt`); underscores in the name
    and dashes in the guessed encoding are ignored for the comparison.
    """
    # the dot before "txt" is escaped so only a real ".txt" extension matches
    sample_pattern = re.compile(
        r'basic[/\\]test-sample-(ascii|latin1|utf[816_lbe]+)'
        r'(?:-nobom|-withbom)?\.txt')
    n_matches = 0
    for filename, file_contents in loop_over_files(subdir='basic'):
        match = sample_pattern.match(filename)
        if not match:
            continue
        n_matches += 1
        expect_encoding = match.group(1).replace('_', '')
        guess = ftguess.ftype_guess(data=file_contents)
        # name the sample in the message, otherwise a failure inside this
        # loop cannot be attributed to a file
        self.assertEqual(guess.ftype, ftguess.FType_TEXT,
                         'Test sample {0} was not recognized as plain text'
                         .format(filename))
        self.assertEqual(guess.text_encoding.replace('-', ''),
                         expect_encoding,
                         'Unexpected encoding guessed for test sample {0}'
                         .format(filename))
    # guard against the test silently passing because no sample was found
    self.assertGreater(n_matches, 0, 'Found no text encoding test samples')


# just in case somebody calls this file as a script
Expand Down
16 changes: 13 additions & 3 deletions tests/oleid/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ def test_macros(self):

# xlm detection does not work in-memory (yet)
# --> xlm is "unknown" for excel files, except some encrypted files
self.assertIn(value_dict['xlm'], ('Unknown', 'No'))
self.assertIn(value_dict['xlm'], ('Unknown', 'No'),
"Unexpected value '{0}' for XLM-content in test sample {1}'"
.format(value_dict['xlm'], filename))

# "macro detection" in text files leads to interesting results:
if filename in ('ooxml/dde-in-excel2003.xml', # not really
Expand All @@ -121,9 +123,17 @@ def test_macros(self):
'oleform/oleform-PR314.docm',
'basic/empty', # WTF?
'basic/text'): # no macros!
self.assertEqual(value_dict['vba'], 'Yes')
self.assertEqual(value_dict['vba'], 'Yes',
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['xlm'], filename))
elif filename.startswith(join('basic', 'test-sample-')): # not clear what macro detection should do with text files
self.assertIn(value_dict['vba'], ('Yes', 'Error'),
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['vba'], filename))
else:
self.assertEqual(value_dict['vba'], 'No')
self.assertEqual(value_dict['vba'], 'No',
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['vba'], filename))

def test_flash(self):
"""Test indicator for flash."""
Expand Down
2 changes: 1 addition & 1 deletion tests/ooxml/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_rough_doctype(self):

# files that are neither OLE nor xml:
except_files = 'empty', 'text'
except_extns = 'rtf', 'csv', 'zip', 'slk'
except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt'

# analyse all files in data dir
# TODO: use testdata_reader to extract real data from zip files
Expand Down
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Test sample file without special chars or emojis,
encoded using ascii
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-latin1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß,
encoded using latin1
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-utf8-nobom.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß and emojis 😇🙊,
encoded using utf8
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-utf8-withbom.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß and emojis 😇🙊,
encoded using utf8
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.