Expression parser support for order of operations and unary negation operator #37

Merged
8 changes: 8 additions & 0 deletions dissect/cstruct/exceptions.py
@@ -16,3 +16,11 @@ class NullPointerDereference(Error):

class ArraySizeError(Error):
pass


class ExpressionParserError(Error):
pass


class ExpressionTokenizerError(Error):
pass
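
As a quick illustration (a minimal sketch, assuming the ExpressionTokenizer from the expression.py diff below), these new exception types let callers distinguish tokenizer errors from parser errors:

    from dissect.cstruct.exceptions import ExpressionParserError, ExpressionTokenizerError
    from dissect.cstruct.expression import ExpressionTokenizer

    try:
        # '?' is not a recognized token, so tokenize() raises ExpressionTokenizerError
        ExpressionTokenizer("1 +? 2").tokenize()
    except ExpressionTokenizerError as exc:
        print(f"tokenizer error: {exc}")
    except ExpressionParserError as exc:
        # Raised later, by Expression.evaluate(), for structurally invalid input
        print(f"parser error: {exc}")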
329 changes: 273 additions & 56 deletions dissect/cstruct/expression.py
@@ -1,84 +1,301 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict
import string
from typing import TYPE_CHECKING, Callable, Optional, Union

from dissect.cstruct.exceptions import ExpressionParserError, ExpressionTokenizerError

if TYPE_CHECKING:
from dissect.cstruct import cstruct


HEXBIN_SUFFIX = {"x", "X", "b", "B"}


class ExpressionTokenizer:
def __init__(self, expression: str):
self.expression = expression
self.pos = 0
self.tokens = []

def equal(self, token: str, expected: Union[str, set[str]]) -> bool:
if isinstance(expected, set):
return token in expected
else:
return token == expected

def alnum(self, token: str) -> bool:
return token.isalnum()

def alpha(self, token: str) -> bool:
return token.isalpha()

def digit(self, token: str) -> bool:
return token.isdigit()

def hexdigit(self, token: str) -> bool:
return token in string.hexdigits

def operator(self, token: str) -> bool:
return token in {"*", "/", "+", "-", "%", "&", "^", "|", "(", ")", "~"}

def match(
self,
func: Optional[Callable[[str], bool]] = None,
expected: Optional[str] = None,
consume: bool = True,
append: bool = True,
) -> bool:
if self.eol():
return False

token = self.get_token()

if expected and self.equal(token, expected):
if append:
self.tokens.append(token)

if consume:
self.consume()
return True

if func and func(token):
if append:
self.tokens.append(token)
if consume:
self.consume()
return True

return False

def consume(self) -> None:
self.pos += 1

def eol(self) -> bool:
return self.pos >= len(self.expression)

def get_token(self) -> str:
if self.eol():
raise ExpressionTokenizerError(f"Out of bounds index: {self.pos}, length: {len(self.expression)}")

return self.expression[self.pos]

def tokenize(self) -> list[str]:
token = ""

# Loop over the expression; runs in linear time
while not self.eol():
# If the token is a single-character operator, add it to tokens
if self.match(self.operator):
continue

# If the token is a digit, keep consuming characters to build up the number
elif self.match(self.digit, consume=False, append=False):
token += self.get_token()
self.consume()

# Support for binary and hexadecimal notation
if self.match(expected=HEXBIN_SUFFIX, consume=False, append=False):
token += self.get_token()
self.consume()

while self.match(self.hexdigit, consume=False, append=False):
token += self.get_token()
self.consume()
if self.eol():
break

# Check for integer suffixes (u/U, l/L) and discard them
if self.match(expected={"u", "U"}, consume=False, append=False):
self.consume()
self.match(expected={"l", "L"}, append=False)
self.match(expected={"l", "L"}, append=False)

elif self.match(expected={"l", "L"}, append=False):
self.match(expected={"l", "L"}, append=False)
self.match(expected={"u", "U"}, append=False)
else:
pass

# A number cannot end in x or b when binary or hexadecimal notation is used
if len(token) == 2 and token[-1] in HEXBIN_SUFFIX:
raise ExpressionTokenizerError("Invalid binary or hex notation")

if len(token) > 1 and token[0] == "0" and token[1] not in HEXBIN_SUFFIX:
token = token[:1] + "o" + token[1:]
self.tokens.append(token)
token = ""

# If the token is alphabetic or an underscore, build up the identifier
elif self.match(self.alpha, consume=False, append=False) or self.match(
expected="_", consume=False, append=False
):
while self.match(self.alnum, consume=False, append=False) or self.match(
expected="_", consume=False, append=False
):
token += self.get_token()
self.consume()
if self.eol():
break
self.tokens.append(token)
token = ""
# If the token is a two-character operator, make sure the next character completes it, then append it to tokens
elif self.match(expected=">", append=False) and self.match(expected=">", append=False):
self.tokens.append(">>")
elif self.match(expected="<", append=False) and self.match(expected="<", append=False):
self.tokens.append("<<")
elif self.match(expected=" ", append=False):
continue
else:
raise ExpressionTokenizerError(
f"Tokenizer does not recognize following token '{self.expression[self.pos]}'"
)
return self.tokens
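
For reference, tracing the tokenize() loop above on a small expression (expression chosen for illustration) yields:

    from dissect.cstruct.expression import ExpressionTokenizer

    # Hex literals are kept as a single token; the unary '-' is still emitted as
    # "-" here and only rewritten to the internal "u" token later, in Expression.evaluate().
    ExpressionTokenizer("4 * (-5 + 0x20)").tokenize()
    # ['4', '*', '(', '-', '5', '+', '0x20', ')']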


class Expression:
"""Expression parser for simple calculations in definitions."""

operators = [
("*", lambda a, b: a * b),
("/", lambda a, b: a // b),
("%", lambda a, b: a % b),
("+", lambda a, b: a + b),
("-", lambda a, b: a - b),
(">>", lambda a, b: a >> b),
("<<", lambda a, b: a << b),
("&", lambda a, b: a & b),
("^", lambda a, b: a ^ b),
("|", lambda a, b: a | b),
]
"""Expression parser for calculations in definitions."""

operators = {
"|": lambda a, b: a | b,
"^": lambda a, b: a ^ b,
"&": lambda a, b: a & b,
"<<": lambda a, b: a << b,
">>": lambda a, b: a >> b,
"+": lambda a, b: a + b,
"-": lambda a, b: a - b,
"*": lambda a, b: a * b,
"/": lambda a, b: a // b,
"%": lambda a, b: a % b,
"u": lambda a: -a,
"~": lambda a: ~a,
}

precedence_levels = {
"|": 0,
"^": 1,
"&": 2,
"<<": 3,
">>": 3,
"+": 4,
"-": 4,
"*": 5,
"/": 5,
"%": 5,
"u": 6,
"~": 6,
"sizeof": 6,
}

def __init__(self, cstruct: cstruct, expression: str):
self.cstruct = cstruct
self.expression = expression
self.tokens = ExpressionTokenizer(expression).tokenize()
self.stack = []
self.queue = []

def __repr__(self) -> str:
return self.expression

def evaluate(self, context: Dict[str, int] = None) -> int:
context = context or {}
levels = []
buf = ""
def precedence(self, o1: str, o2: str) -> bool:
return self.precedence_levels[o1] >= self.precedence_levels[o2]

for i in range(len(self.expression)):
if self.expression[i] == "(":
levels.append(buf)
buf = ""
continue
def evaluate_exp(self) -> None:
operator = self.stack.pop(-1)
res = 0

if self.expression[i] == ")":
if levels[-1] == "sizeof":
value = len(self.cstruct.resolve(buf))
levels[-1] = ""
else:
value = self.evaluate_part(buf, context)
buf = levels.pop()
buf += str(value)
continue
if len(self.queue) < 1:
raise ExpressionParserError("Invalid expression: not enough operands")

right = self.queue.pop(-1)
if operator in ("u", "~"):
res = self.operators[operator](right)
else:
if len(self.queue) < 1:
raise ExpressionParserError("Invalid expression: not enough operands")


left = self.queue.pop(-1)
res = self.operators[operator](left, right)

buf += self.expression[i]
self.queue.append(res)
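
To make the stack/queue discipline above concrete, a small traced sketch (values chosen for illustration):

    # Suppose evaluate() has so far produced:
    #   stack = ["+", "*"]   (operators, "*" on top)
    #   queue = [2, 3, 4]    (operands)
    # One call to evaluate_exp() pops "*" and the top two operands:
    #   3 * 4 -> 12, giving stack = ["+"], queue = [2, 12]
    # A second call pops "+": 2 + 12 -> 14, giving queue = [14]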

def is_number(self, token: str) -> bool:
return token.isnumeric() or (len(token) > 2 and token[0] == "0" and token[1] in ("x", "X", "b", "B", "o", "O"))

def evaluate(self, context: Optional[dict[str, int]] = None) -> int:
"""Evaluates an expression using a Shunting-Yard implementation."""

self.stack = []
self.queue = []
operators = set(self.operators.keys())

context = context or {}
tmp_expression = self.tokens

return self.evaluate_part(buf, context)
# Unary minus handling: we change the semantics of '-' depending on the previous token
for i in range(len(self.tokens)):
if self.tokens[i] == "-":
if i == 0:
self.tokens[i] = "u"
continue
if self.tokens[i - 1] in operators or self.tokens[i - 1] == "u" or self.tokens[i - 1] == "(":
self.tokens[i] = "u"
continue

def evaluate_part(self, buf: str, context: Dict[str, int]) -> int:
buf = buf.strip()
i = 0
while i < len(tmp_expression):
current_token = tmp_expression[i]
if self.is_number(current_token):
self.queue.append(int(current_token, 0))
elif current_token in context:
self.queue.append(int(context[current_token]))
elif current_token in self.cstruct.consts:
self.queue.append(int(self.cstruct.consts[current_token]))
elif current_token == "u":
self.stack.append(current_token)
elif current_token == "~":
self.stack.append(current_token)
elif current_token == "sizeof":
if len(tmp_expression) < i + 4 or (tmp_expression[i + 1] != "(" or tmp_expression[i + 3] != ")"):
raise ExpressionParserError("Invalid sizeof operation")
self.queue.append(len(self.cstruct.resolve(tmp_expression[i + 2])))
i += 3
elif current_token in operators:
while (
len(self.stack) != 0 and self.stack[-1] != "(" and (self.precedence(self.stack[-1], current_token))
):
self.evaluate_exp()
self.stack.append(current_token)
elif current_token == "(":
if i > 0:
previous_token = tmp_expression[i - 1]
if self.is_number(previous_token):
raise ExpressionParserError(
f"Parser expected sizeof or an arethmethic operator instead got: '{previous_token}'"
)

# Very simple way to support an expression(part) that is a single,
# negative value. To use negative values in more complex expressions,
# they must be wrapped in brackets, e.g.: 2 * (-5).
#
# To have full support for the negation operator a proper expression
# parser must be build.
if buf.startswith("-") and buf[1:].isnumeric():
return int(buf)
self.stack.append(current_token)
elif current_token == ")":
if i > 0:
previous_token = tmp_expression[i - 1]
if previous_token == "(":
raise ExpressionParserError(
f"Parser expected an expression, instead received empty parenthesis. Index: {i}"
)

for operator in self.operators:
if operator[0] in buf:
a, b = buf.rsplit(operator[0], 1)
if len(self.stack) == 0:
raise ExpressionParserError("Invalid expression")

return operator[1](self.evaluate_part(a, context), self.evaluate_part(b, context))
while self.stack[-1] != "(":
self.evaluate_exp()

if buf in context:
return context[buf]
self.stack.pop(-1)
else:
raise ExpressionParserError(f"Unmatched token: '{current_token}'")
i += 1

if buf.startswith("0x"):
return int(buf, 16)
while len(self.stack) != 0:
if self.stack[-1] == "(":
raise ExpressionParserError("Invalid expression")

if buf in self.cstruct.consts:
return int(self.cstruct.consts[buf])
self.evaluate_exp()

return int(buf)
return self.queue[0]
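
Putting the pieces together, a hedged usage sketch (assuming a default cstruct instance and its built-in uint32 type; the results follow from the Shunting-Yard evaluation above):

    from dissect.cstruct import cstruct
    from dissect.cstruct.expression import Expression

    cs = cstruct()

    # '*' has higher precedence than '+', so this evaluates as 1 + (2 * 4).
    Expression(cs, "1 + 2 * 4").evaluate()            # 9

    # A leading '-' is rewritten to the internal unary token "u" before evaluation.
    Expression(cs, "-4 * 2").evaluate()               # -8

    # sizeof(...) resolves the type through the bound cstruct instance.
    Expression(cs, "sizeof(uint32) * 2").evaluate()   # 8, assuming uint32 is 4 bytes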