Skip to content

Commit

Permalink
Recognize integer multicharacter constants like 'ABCD' (#350)
Browse files Browse the repository at this point in the history
Recognize integer multicharacter constants like 'ABCD'

The feature I am adding is defined here (see the 5th case):
https://en.cppreference.com/w/c/language/character_constant
It is also specified in C99, section 6.4.4.4, paragraph 10.

Put simply, pycparser used to treat a statement like the following as an error:
int a = 'ABCD';
However, it is not an error.

It is likely possible to just modify the char_const regular expression in c_lexer.py:240 to allow longer constants, but this PR instead keeps multicharacter constants clearly separated from single-character ones. I am also limiting multicharacter integer constants to 4 characters: this matches the behavior of the VS compiler (gcc allows any length, with a warning) and keeps pycparser from treating lengthy single-quoted strings as integers, which would be nonsensical anyway.
  • Loading branch information
yaroslav-o authored and eliben committed Sep 25, 2019
1 parent 62ee4ba commit a4a7127
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
7 changes: 6 additions & 1 deletion pycparser/c_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def _make_tok_location(self, token):
'TYPEID',

# constants
'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN',
'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
'FLOAT_CONST', 'HEX_FLOAT_CONST',
'CHAR_CONST',
'WCHAR_CONST',
Expand Down Expand Up @@ -239,6 +239,7 @@ def _make_tok_location(self, token):
cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
char_const = "'"+cconst_char+"'"
wchar_const = 'L'+char_const
multicharacter_constant = "'"+cconst_char+"{2,4}'"
unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

Expand Down Expand Up @@ -468,6 +469,10 @@ def t_INT_CONST_DEC(self, t):
# Must come before bad_char_const, to prevent it from
# catching valid char constants as invalid
#
# Integer multicharacter constant such as 'ABCD': 2 to 4 characters (or
# escape sequences) between single quotes, as described by the
# multicharacter_constant pattern. The matched token is returned unchanged.
@TOKEN(multicharacter_constant)
def t_INT_CONST_CHAR(self, t):
return t

# Ordinary character constant: exactly one character (or escape sequence)
# between single quotes, as described by the char_const pattern. The
# matched token is returned unchanged.
@TOKEN(char_const)
def t_CHAR_CONST(self, t):
return t
Expand Down
1 change: 1 addition & 0 deletions pycparser/c_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1766,6 +1766,7 @@ def p_constant_1(self, p):
| INT_CONST_OCT
| INT_CONST_HEX
| INT_CONST_BIN
| INT_CONST_CHAR
"""
uCount = 0
lCount = 0
Expand Down
12 changes: 8 additions & 4 deletions tests/test_c_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ def test_integer_constants(self):
self.assertTokensTypes('0xf7', ['INT_CONST_HEX'])
self.assertTokensTypes('0b110', ['INT_CONST_BIN'])
self.assertTokensTypes('0x01202AAbbf7Ul', ['INT_CONST_HEX'])
self.assertTokensTypes("'12'", ['INT_CONST_CHAR'])
self.assertTokensTypes("'123'", ['INT_CONST_CHAR'])
self.assertTokensTypes("'1AB4'", ['INT_CONST_CHAR'])
self.assertTokensTypes(r"'1A\n4'", ['INT_CONST_CHAR'])

# no 0 before x, so ID catches it
self.assertTokensTypes('xf7', ['ID'])
Expand Down Expand Up @@ -448,11 +452,11 @@ def test_char_constants(self):
self.assertLexerError("'", ERR_UNMATCHED_QUOTE)
self.assertLexerError("'b\n", ERR_UNMATCHED_QUOTE)
self.assertLexerError("'\\xaa\n'", ERR_UNMATCHED_QUOTE)

self.assertLexerError(r"'\12a'", ERR_INVALID_CCONST)
self.assertLexerError(r"'\xabg'", ERR_INVALID_CCONST)
self.assertLexerError(r"'123\12a'", ERR_INVALID_CCONST)
self.assertLexerError(r"'123\xabg'", ERR_INVALID_CCONST)
self.assertLexerError("''", ERR_INVALID_CCONST)
self.assertLexerError("'jx'", ERR_INVALID_CCONST)
self.assertLexerError("'abcjx'", ERR_INVALID_CCONST)
self.assertLexerError(r"'\*'", ERR_INVALID_CCONST)

def test_string_literals(self):
Expand Down

0 comments on commit a4a7127

Please sign in to comment.