Fix parsing NULL-terminated UTF-16 REG_SZ type registry values (#19)

(DIS-2317)
fox-it · Oct 4, 2023 · a10fabc · a10fabc
1 parent 14cfb86
commit a10fabc
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 1 deletion.
diff --git a/dissect/regf/regf.py b/dissect/regf/regf.py
@@ -478,7 +478,17 @@ def try_decode_sz(data):
 
         # This will return the string utf-16-le decoded up until the first
         # double NULL byte.
-        return data.split(b"\x00\x00")[0].decode("utf-16-le")
+        # A naive split on two NULL bytes will not work as the possibility
+        # exists that the first NULL byte is the high byte of the first
+        # character and the second NULL byte the low byte of the second
+        # character. So the first NULL byte should start at an even index in
+        # the data.
+        idx = -1
+        while (idx := data.find(b"\x00\x00", idx + 1)) & 1:
+            if idx == -1:
+                idx = len(data)
+                break
+        return data[:idx].decode("utf-16-le")
 
     except UnicodeDecodeError:
         # Last ditch effort, decode the whole bytestring as if it were utf-16,

diff --git a/tests/test_regf.py b/tests/test_regf.py
@@ -1,3 +1,5 @@
+import pytest
+
 from dissect.regf import regf
 
 
@@ -22,3 +24,56 @@ def test_regf(system_hive):
     assert lsa.subkey("Data").class_name == "a282942c"
 
     assert hive.open("ControlSet001\\Services\\Tcpip\\Parameters\\DNSRegisteredAdapters").class_name == "DynDRootClass"
+
+
+@pytest.mark.parametrize(  # raw value bytes paired with the text try_decode_sz must return
+    "data, expected",
+    [
+        (  # empty input decodes to an empty string
+            b"",
+            "",
+        ),
+        (  # one byte per character; result stops at the embedded NUL byte
+            b"The Quick Brown Fox\x00Jumped Over The Lazy Dog",
+            "The Quick Brown Fox",
+        ),
+        (  # same as above, with an extra NUL byte at the very end
+            b"The Quick Brown Fox\x00Jumped Over The Lazy Dog\x00",
+            "The Quick Brown Fox",
+        ),
+        (  # one byte per character, no NUL byte anywhere
+            b"The Quick Brown Fox",
+            "The Quick Brown Fox",
+        ),
+        (  # UTF-16-LE text; result stops at the embedded NUL character
+            "The Quick Brown Fox\x00Jumped Over The Lazy Dog".encode("utf-16-le"),
+            "The Quick Brown Fox",
+        ),
+        (  # UTF-16-LE text with both an embedded and a trailing NUL character
+            "The Quick Brown Fox\x00Jumped Over The Lazy Dog\x00".encode("utf-16-le"),
+            "The Quick Brown Fox",
+        ),
+        (  # UTF-16-LE text followed by one stray NUL byte, giving an odd total length
+            "The Quick Brown Fox\x00Jumped Over The Lazy Dog".encode("utf-16-le") + b"\x00",
+            "The Quick Brown Fox",
+        ),
+        (  # UTF-16-LE text without any NUL character
+            "The Quick Brown Fox".encode("utf-16-le"),
+            "The Quick Brown Fox",
+        ),
+        (
+            b"\xe4bcd\x00",  # interpreted as latin1
+            "äbcd",
+        ),
+        (
+            b"\xe4bcd",  # interpreted as utf-16-le
+            "拤摣",
+        ),
+        (  # the only b"\x00\x00" pair sits at odd offset 1, straddling "A" and U+0100, so it is not a terminator
+            b"\x41\x00\x00\x01\x42\x00",
+            "AĀB",
+        ),
+    ],
+)
+def test_try_decode_sz(data, expected):  # runs once per (data, expected) pair listed above
+    assert regf.try_decode_sz(data) == expected