Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement "Dependencies.strip_string_literals()" #5994

Merged
merged 7 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
195 changes: 116 additions & 79 deletions Cython/Build/Dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,94 +306,131 @@ def apply(self, extension):
setattr(extension, key, value)


@cython.locals(start=cython.Py_ssize_t, q=cython.Py_ssize_t,
single_q=cython.Py_ssize_t, double_q=cython.Py_ssize_t,
hash_mark=cython.Py_ssize_t, end=cython.Py_ssize_t,
k=cython.Py_ssize_t, counter=cython.Py_ssize_t, quote_len=cython.Py_ssize_t)
def strip_string_literals(code, prefix='__Pyx_L'):
# Token finders used when scanning source text for string literals and comments.
# Each name is bound to the ``.search`` method of a compiled ``re.VERBOSE``
# pattern (whitespace inside the pattern is ignored) and is declared to Cython
# as a plain Python ``object``.

# Finder for ordinary code: matches the next
#   - '#'         (group "comment"),
#   - '{' or '}'  (group "brace", a single character), or
#   - a run of one or more single quotes, or of one or more double quotes
#     (group "quote"), optionally preceded by a lowercase 'f' (group "fstring").
_FIND_TOKEN = cython.declare(object, re.compile(r"""
(?P<comment> [#] ) |
(?P<brace> [{}] ) |
(?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)

# Finder for the inside of a non-f-string literal: matches the next
#   - run of backslashes (group "escape") immediately followed by one quote
#     character (group "escaped_quote"), or
#   - run of quotes (group "quote"), optionally preceded by 'f' (group "fstring").
_FIND_STRING_TOKEN = cython.declare(object, re.compile(r"""
(?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
(?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)

# Finder for the inside of an f-string literal: same alternatives as
# _FIND_STRING_TOKEN, plus a run of '{' or a run of '}' (group "braces").
_FIND_FSTRING_TOKEN = cython.declare(object, re.compile(r"""
(?P<braces> [{]+ | [}]+ ) |
(?P<escape> [\\]+ ) (?P<escaped_quote> ['"] ) |
(?P<fstring> f )? (?P<quote> '+ | "+ )
""", re.VERBOSE).search)


def strip_string_literals(code: str, prefix: str = '__Pyx_L'):
"""
Normalizes every string literal to be of the form '__Pyx_Lxxx',
returning the normalized code and a mapping of labels to
string literals.
"""
new_code = []
literals = {}
counter = 0
start = q = 0
in_quote = False
hash_mark = single_q = double_q = -1
code_len = len(code)
quote_type = None
quote_len = -1

while True:
if hash_mark < q:
hash_mark = code.find('#', q)
if single_q < q:
single_q = code.find("'", q)
if double_q < q:
double_q = code.find('"', q)
q = min(single_q, double_q)
if q == -1:
q = max(single_q, double_q)

# We're done.
if q == -1 and hash_mark == -1:
new_code.append(code[start:])
break

# Try to close the quote.
elif in_quote:
if code[q-1] == '\\':
k = 2
while q >= k and code[q-k] == '\\':
k += 1
if k % 2 == 0:
q += 1
new_code: list = []
literals: dict = {}
counter: cython.Py_ssize_t = 0
find_token = _FIND_TOKEN

def append_new_label(literal):
nonlocal counter
counter += 1
label = f"{prefix}{counter}_"
literals[label] = literal
new_code.append(label)

def parse_string(quote_type: str, start: cython.Py_ssize_t, is_fstring: cython.bint) -> cython.Py_ssize_t:
charpos: cython.Py_ssize_t = start

find_token = _FIND_FSTRING_TOKEN if is_fstring else _FIND_STRING_TOKEN

while charpos != -1:
token = find_token(code, charpos)
if token is None:
# This probably indicates an unclosed string literal, i.e. a broken file.
append_new_label(code[start:])
charpos = -1
break
charpos = token.end()

if token['escape']:
if len(token['escape']) % 2 == 0 and token['escaped_quote'] == quote_type[0]:
# Quote is not actually escaped and might be part of a terminator, look at it next.
charpos -= 1

elif is_fstring and token['braces']:
# Formats or brace(s) in fstring.
if len(token['braces']) % 2 == 0:
# Normal brace characters in string.
continue
if code[q] == quote_type and (
quote_len == 1 or (code_len > q + 2 and quote_type == code[q+1] == code[q+2])):
counter += 1
label = "%s%s_" % (prefix, counter)
literals[label] = code[start+quote_len:q]
full_quote = code[q:q+quote_len]
new_code.append(full_quote)
new_code.append(label)
new_code.append(full_quote)
q += quote_len
in_quote = False
start = q
else:
q += 1

# Process comment.
elif -1 != hash_mark and (hash_mark < q or q == -1):
new_code.append(code[start:hash_mark+1])
end = code.find('\n', hash_mark)
counter += 1
label = "%s%s_" % (prefix, counter)
if end == -1:
end_or_none = None
else:
end_or_none = end
literals[label] = code[hash_mark+1:end_or_none]
new_code.append(label)
if end == -1:
if token['braces'][-1] == '{':
if start < charpos-1:
append_new_label(code[start : charpos-1])
new_code.append('{')
start = charpos = parse_code(charpos, in_fstring=True)

elif token['quote'].startswith(quote_type):
# Closing quote found (potentially together with further, unrelated quotes).
charpos = token.start('quote')
if charpos > start:
append_new_label(code[start : charpos])
new_code.append(quote_type)
charpos += len(quote_type)
break
start = q = end

# Open the quote.
else:
if code_len >= q+3 and (code[q] == code[q+1] == code[q+2]):
quote_len = 3
else:
quote_len = 1
in_quote = True
quote_type = code[q]
new_code.append(code[start:q])
start = q
q += quote_len
return charpos

def parse_code(start: cython.Py_ssize_t, in_fstring: cython.bint = False) -> cython.Py_ssize_t:
charpos: cython.Py_ssize_t = start
end: cython.Py_ssize_t
quote: str

while charpos != -1:
token = find_token(code, charpos)
if token is None:
new_code.append(code[start:])
charpos = -1
break
charpos = end = token.end()

if token['quote']:
quote = token['quote']
if len(quote) >= 6:
# Ignore empty triple-quoted strings: '''''' or """"""
quote = quote[:len(quote) % 6]
if quote and len(quote) != 2:
if len(quote) > 3:
end -= len(quote) - 3
quote = quote[:3]
new_code.append(code[start:end])
start = charpos = parse_string(quote, end, is_fstring=token['fstring'])

elif token['comment']:
new_code.append(code[start:end])
charpos = code.find('\n', end)
append_new_label(code[end : charpos if charpos != -1 else None])
if charpos == -1:
break # EOF
start = charpos

elif in_fstring and token['brace']:
if token['brace'] == '}':
# Closing '}' of f-string.
charpos = end = token.start() + 1
new_code.append(code[start:end]) # with '}'
break
else:
# Starting a calculated format modifier inside of an f-string format.
end = token.start() + 1
new_code.append(code[start:end]) # with '{'
start = charpos = parse_code(end, in_fstring=True)

return charpos

parse_code(0)
return "".join(new_code), literals


Expand Down