diff --git a/dissect/regf/regf.py b/dissect/regf/regf.py index fb2ee28..3bf5b46 100644 --- a/dissect/regf/regf.py +++ b/dissect/regf/regf.py @@ -478,7 +478,17 @@ def try_decode_sz(data): # This will return the string utf-16-le decoded up until the first # double NULL byte. - return data.split(b"\x00\x00")[0].decode("utf-16-le") + # A naive split on two NULL bytes will not work as the possibility + # exists that the first NULL byte is the high byte of the first + # character and the second NULL byte the low byte of the second + # character. So the first NULL byte should start at an even index in + # the data. + idx = -1 + while (idx := data.find(b"\x00\x00", idx + 1)) & 1: + if idx == -1: + idx = len(data) + break + return data[:idx].decode("utf-16-le") except UnicodeDecodeError: # Last ditch effort, decode the whole bytestring as if it were utf-16, diff --git a/tests/test_regf.py b/tests/test_regf.py index c90b7cb..cfdd946 100644 --- a/tests/test_regf.py +++ b/tests/test_regf.py @@ -1,3 +1,5 @@ +import pytest + from dissect.regf import regf @@ -22,3 +24,56 @@ def test_regf(system_hive): assert lsa.subkey("Data").class_name == "a282942c" assert hive.open("ControlSet001\\Services\\Tcpip\\Parameters\\DNSRegisteredAdapters").class_name == "DynDRootClass" + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + b"", + "", + ), + ( + b"The Quick Brown Fox\x00Jumped Over The Lazy Dog", + "The Quick Brown Fox", + ), + ( + b"The Quick Brown Fox\x00Jumped Over The Lazy Dog\x00", + "The Quick Brown Fox", + ), + ( + b"The Quick Brown Fox", + "The Quick Brown Fox", + ), + ( + "The Quick Brown Fox\x00Jumped Over The Lazy Dog".encode("utf-16-le"), + "The Quick Brown Fox", + ), + ( + "The Quick Brown Fox\x00Jumped Over The Lazy Dog\x00".encode("utf-16-le"), + "The Quick Brown Fox", + ), + ( + "The Quick Brown Fox\x00Jumped Over The Lazy Dog".encode("utf-16-le") + b"\x00", + "The Quick Brown Fox", + ), + ( + "The Quick Brown Fox".encode("utf-16-le"), + "The Quick Brown Fox", + ), + ( + b"\xe4bcd\x00", # interpreted as latin1 + "äbcd", + ), + ( + b"\xe4bcd", # interpreted as utf-16-le + "拤摣", + ), + ( + b"\x41\x00\x00\x01\x42\x00", + "AĀB", + ), + ], +) +def test_try_decode_sz(data, expected): + assert regf.try_decode_sz(data) == expected