Skip to content

Commit

Permalink
Rewrite expression parser to support more complex expressions (#37)
Browse files Browse the repository at this point in the history
Co-authored-by: Schamper <1254028+Schamper@users.noreply.github.com>
  • Loading branch information
sMezaOrellana and Schamper committed Aug 31, 2023
1 parent c254806 commit a9374ed
Show file tree
Hide file tree
Showing 4 changed files with 318 additions and 60 deletions.
8 changes: 8 additions & 0 deletions dissect/cstruct/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@ class NullPointerDereference(Error):

class ArraySizeError(Error):
    # NOTE(review): the raise sites are outside this view; presumably signals an
    # invalid array size - confirm against the callers.
    pass


class ExpressionParserError(Error):
    """Raised when an expression cannot be parsed or evaluated.

    Examples are missing operands, unbalanced parentheses, a malformed
    ``sizeof`` operation or an unknown token.
    """


class ExpressionTokenizerError(Error):
    """Raised when an expression string cannot be split into tokens.

    Examples are an unrecognized character, an incomplete binary or
    hexadecimal literal, or reading past the end of the expression.
    """
329 changes: 273 additions & 56 deletions dissect/cstruct/expression.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,301 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict
import string
from typing import TYPE_CHECKING, Callable, Optional, Union

from dissect.cstruct.exceptions import ExpressionParserError, ExpressionTokenizerError

if TYPE_CHECKING:
from dissect.cstruct import cstruct


# Second character of a binary ("0b...") or hexadecimal ("0x...") literal.
HEXBIN_SUFFIX = {"x", "X", "b", "B"}


class ExpressionTokenizer:
    """Split an expression string into a flat list of string tokens.

    The tokenizer walks the expression one character at a time and emits:

    - single character operators and parentheses,
    - the two character shift operators ``<<`` and ``>>``,
    - numbers (integer suffixes ``u``/``l`` are dropped, a leading ``0`` is
      rewritten to the ``0o`` octal prefix),
    - identifiers made of alphanumeric characters and underscores.
    """

    def __init__(self, expression: str):
        self.expression = expression
        # Index of the character currently being looked at.
        self.pos = 0
        self.tokens = []

    def equal(self, token: str, expected: Union[str, set[str]]) -> bool:
        """Return whether ``token`` equals ``expected`` or is a member of the ``expected`` set."""
        if isinstance(expected, set):
            return token in expected
        else:
            return token == expected

    def alnum(self, token: str) -> bool:
        return token.isalnum()

    def alpha(self, token: str) -> bool:
        return token.isalpha()

    def digit(self, token: str) -> bool:
        return token.isdigit()

    def hexdigit(self, token: str) -> bool:
        return token in string.hexdigits

    def operator(self, token: str) -> bool:
        return token in {"*", "/", "+", "-", "%", "&", "^", "|", "(", ")", "~"}

    def match(
        self,
        func: Optional[Callable[[str], bool]] = None,
        expected: Optional[Union[str, set[str]]] = None,
        consume: bool = True,
        append: bool = True,
    ) -> bool:
        """Test the current character against ``expected`` and/or ``func``.

        Args:
            func: Predicate that is called with the current character.
            expected: A character, or a set of characters, to compare against.
            consume: Advance past the character when it matches.
            append: Append the character to the token list when it matches.

        Returns:
            ``True`` if the current character matched, ``False`` if it did not
            or if the end of the expression has been reached.
        """
        if self.eol():
            return False

        token = self.get_token()

        if (expected and self.equal(token, expected)) or (func and func(token)):
            if append:
                self.tokens.append(token)
            if consume:
                self.consume()
            return True

        return False

    def consume(self) -> None:
        """Advance to the next character."""
        self.pos += 1

    def eol(self) -> bool:
        """Return whether the whole expression has been read."""
        return self.pos >= len(self.expression)

    def get_token(self) -> str:
        """Return the current character.

        Raises:
            ExpressionTokenizerError: If the end of the expression has been reached.
        """
        if self.eol():
            raise ExpressionTokenizerError(f"Out of bounds index: {self.pos}, length: {len(self.expression)}")
        return self.expression[self.pos]

    def tokenize(self) -> list[str]:
        """Tokenize the expression and return the list of tokens.

        Raises:
            ExpressionTokenizerError: On an unrecognized character, a lone ``<``
                or ``>``, or a binary/hexadecimal prefix without digits.
        """
        token = ""

        # Loop over expression runs in linear time
        while not self.eol():
            # If token is a single character operand add it to tokens
            if self.match(self.operator):
                continue

            # If token is a single digit, keep looping over expression and build the number
            elif self.match(self.digit, consume=False, append=False):
                token += self.get_token()
                self.consume()

                # Support for binary and hexadecimal notation
                if self.match(expected=HEXBIN_SUFFIX, consume=False, append=False):
                    token += self.get_token()
                    self.consume()

                while self.match(self.hexdigit, consume=False, append=False):
                    token += self.get_token()
                    self.consume()
                    if self.eol():
                        break

                # Checks for suffixes in numbers, these are skipped and not part of the token
                if self.match(expected={"u", "U"}, consume=False, append=False):
                    self.consume()
                    self.match(expected={"l", "L"}, append=False)
                    self.match(expected={"l", "L"}, append=False)

                elif self.match(expected={"l", "L"}, append=False):
                    self.match(expected={"l", "L"}, append=False)
                    self.match(expected={"u", "U"}, append=False)

                # Number cannot end on x or b in the case of binary or hexadecimal notation
                if len(token) == 2 and token[-1] in HEXBIN_SUFFIX:
                    raise ExpressionTokenizerError("Invalid binary or hex notation")

                # C style octal (leading zero) is rewritten to the 0o prefix
                if len(token) > 1 and token[0] == "0" and token[1] not in HEXBIN_SUFFIX:
                    token = token[:1] + "o" + token[1:]
                self.tokens.append(token)
                token = ""

            # If token is alpha or underscore we need to build the identifier
            elif self.match(self.alpha, consume=False, append=False) or self.match(
                expected="_", consume=False, append=False
            ):
                while self.match(self.alnum, consume=False, append=False) or self.match(
                    expected="_", consume=False, append=False
                ):
                    token += self.get_token()
                    self.consume()
                    if self.eol():
                        break
                self.tokens.append(token)
                token = ""

            # Shift operators are two identical characters. A lone '<' or '>' is not a
            # supported operator and must be rejected instead of being silently dropped.
            elif self.match(expected={">", "<"}, consume=False, append=False):
                first = self.get_token()
                self.consume()
                if not self.match(expected=first, append=False):
                    raise ExpressionTokenizerError(f"Tokenizer does not recognize following token '{first}'")
                self.tokens.append(first * 2)

            # Whitespace only separates tokens
            elif self.match(expected={" ", "\t", "\n"}, append=False):
                continue
            else:
                raise ExpressionTokenizerError(
                    f"Tokenizer does not recognize following token '{self.expression[self.pos]}'"
                )
        return self.tokens


class Expression:
    """Expression parser for calculations in definitions.

    The expression is tokenized once on construction. :meth:`evaluate` then runs
    a Shunting-Yard style evaluation over the tokens, using ``stack`` for pending
    operators and ``queue`` for operand values.
    """

    # Binary operators take (left, right). "u" (unary minus) and "~" take one operand.
    operators = {
        "|": lambda a, b: a | b,
        "^": lambda a, b: a ^ b,
        "&": lambda a, b: a & b,
        "<<": lambda a, b: a << b,
        ">>": lambda a, b: a >> b,
        "+": lambda a, b: a + b,
        "-": lambda a, b: a - b,
        "*": lambda a, b: a * b,
        "/": lambda a, b: a // b,
        "%": lambda a, b: a % b,
        "u": lambda a: -a,
        "~": lambda a: ~a,
    }

    # A higher number binds tighter.
    precedence_levels = {
        "|": 0,
        "^": 1,
        "&": 2,
        "<<": 3,
        ">>": 3,
        "+": 4,
        "-": 4,
        "*": 5,
        "/": 5,
        "%": 5,
        "u": 6,
        "~": 6,
        "sizeof": 6,
    }

    def __init__(self, cstruct: cstruct, expression: str):
        self.cstruct = cstruct
        self.expression = expression
        self.tokens = ExpressionTokenizer(expression).tokenize()
        self.stack = []
        self.queue = []

    def __repr__(self) -> str:
        return self.expression

    def precedence(self, o1: str, o2: str) -> bool:
        """Return whether operator ``o1`` binds at least as tight as operator ``o2``."""
        return self.precedence_levels[o1] >= self.precedence_levels[o2]

    def evaluate_exp(self) -> None:
        """Pop one operator from the stack, apply it and push the result onto the queue.

        Raises:
            ExpressionParserError: If the queue does not hold enough operands.
        """
        operator = self.stack.pop(-1)

        if len(self.queue) < 1:
            raise ExpressionParserError("Invalid expression: not enough operands")

        right = self.queue.pop(-1)
        if operator in ("u", "~"):
            res = self.operators[operator](right)
        else:
            if len(self.queue) < 1:
                raise ExpressionParserError("Invalid expression: not enough operands")

            left = self.queue.pop(-1)
            res = self.operators[operator](left, right)

        self.queue.append(res)

    def is_number(self, token: str) -> bool:
        """Return whether ``token`` is a decimal number or a 0x/0b/0o prefixed literal."""
        return token.isnumeric() or (len(token) > 2 and token[0] == "0" and token[1] in ("x", "X", "b", "B", "o", "O"))

    def evaluate(self, context: Optional[dict[str, int]] = None) -> int:
        """Evaluates an expression using a Shunting-Yard implementation.

        Args:
            context: Optional mapping of identifier to value. It is consulted
                before the constants of the ``cstruct`` instance.

        Returns:
            The integer result of the expression.

        Raises:
            ExpressionParserError: If the expression is malformed.
        """
        self.stack = []
        self.queue = []
        operators = set(self.operators.keys())

        context = context or {}
        tokens = self.tokens

        # Unary minus tokens; we change the semantic of '-' depending on the previous token.
        # The token list is rewritten in place, so a second evaluation sees "u" directly.
        # NOTE(review): an identifier that is literally named "u" is indistinguishable
        # from the unary minus marker here.
        for i, token in enumerate(tokens):
            if token == "-" and (i == 0 or tokens[i - 1] in operators or tokens[i - 1] == "("):
                tokens[i] = "u"

        i = 0
        while i < len(tokens):
            current_token = tokens[i]
            if self.is_number(current_token):
                self.queue.append(int(current_token, 0))
            elif current_token in context:
                self.queue.append(int(context[current_token]))
            elif current_token in self.cstruct.consts:
                self.queue.append(int(self.cstruct.consts[current_token]))
            elif current_token in ("u", "~"):
                # Unary operators are applied once their operand has been read
                self.stack.append(current_token)
            elif current_token == "sizeof":
                # sizeof needs exactly three more tokens: "(", the type name and ")"
                if len(tokens) < i + 4 or tokens[i + 1] != "(" or tokens[i + 3] != ")":
                    raise ExpressionParserError("Invalid sizeof operation")
                self.queue.append(len(self.cstruct.resolve(tokens[i + 2])))
                i += 3
            elif current_token in operators:
                while self.stack and self.stack[-1] != "(" and self.precedence(self.stack[-1], current_token):
                    self.evaluate_exp()
                self.stack.append(current_token)
            elif current_token == "(":
                if i > 0:
                    previous_token = tokens[i - 1]
                    if self.is_number(previous_token):
                        raise ExpressionParserError(
                            f"Parser expected sizeof or an arethmethic operator instead got: '{previous_token}'"
                        )

                self.stack.append(current_token)
            elif current_token == ")":
                if i > 0:
                    previous_token = tokens[i - 1]
                    if previous_token == "(":
                        raise ExpressionParserError(
                            f"Parser expected an expression, instead received empty parenthesis. Index: {i}"
                        )

                while self.stack and self.stack[-1] != "(":
                    self.evaluate_exp()

                # No opening parenthesis left on the stack means the ")" is unbalanced
                if not self.stack:
                    raise ExpressionParserError("Invalid expression")

                self.stack.pop(-1)
            else:
                raise ExpressionParserError(f"Unmatched token: '{current_token}'")
            i += 1

        while self.stack:
            if self.stack[-1] == "(":
                raise ExpressionParserError("Invalid expression")

            self.evaluate_exp()

        # A well formed expression reduces to exactly one value
        if len(self.queue) != 1:
            raise ExpressionParserError("Invalid expression")

        return self.queue[0]
Loading

0 comments on commit a9374ed

Please sign in to comment.