-
Notifications
You must be signed in to change notification settings - Fork 280
Robust type name to type identifier conversion for C harnesses #5420
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5a3e001
7a20ab6
83e2756
6a6a6bd
4a8ca1a
05f85d2
2733b56
a9f7f01
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,8 @@ Author: Daniel Kroening, kroening@cs.cmu.edu | |
#include <util/std_types.h> | ||
#include <util/symbol_table.h> | ||
|
||
#include <regex> | ||
|
||
typedef std::unordered_map<irep_idt, std::pair<size_t, bool>> symbol_numbert; | ||
|
||
static std::string type2name( | ||
|
@@ -277,27 +279,48 @@ std::string type2name(const typet &type, const namespacet &ns) | |
/// If we want utilities like dump_c to work properly identifiers | ||
/// should ideally always be valid C identifiers | ||
/// This replaces some invalid characters that can appear in type2name output. | ||
std::string type2identifier(const typet &type, const namespacet &ns) | ||
std::string type_name2type_identifier(const std::string &name) | ||
{ | ||
auto type2name_res = type2name(type, ns); | ||
std::string result{}; | ||
for(char c : type2name_res) | ||
{ | ||
switch(c) | ||
const auto replace_special_characters = [](const std::string &name) { | ||
std::string result{}; | ||
for(char c : name) | ||
{ | ||
case '*': | ||
result += "_ptr_"; | ||
break; | ||
case '{': | ||
result += "_start_sub_"; | ||
break; | ||
case '}': | ||
result += "_end_sub_"; | ||
break; | ||
default: | ||
result += c; | ||
break; | ||
switch(c) | ||
{ | ||
case '*': | ||
result += "_ptr_"; | ||
break; | ||
case '{': | ||
result += "_start_sub_"; | ||
break; | ||
case '}': | ||
result += "_end_sub_"; | ||
break; | ||
default: | ||
result += c; | ||
break; | ||
} | ||
} | ||
} | ||
return result; | ||
return result; | ||
}; | ||
const auto replace_invalid_characters_with_underscore = | ||
[](const std::string &identifier) { | ||
static const std::regex non_alpha_numeric{"[^A-Za-z0-9\x80-\xff]+"}; | ||
return std::regex_replace(identifier, non_alpha_numeric, "_"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This, I assume, risks generating clashing names? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chrisr-diffblue Yes, in general this isn’t generating a unique identifier. The idea would be to use this as an identifier fragment useful for generating a human-readable-but-unique identifier elsewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We were already risking clashing names, rather than guaranteeing uniqueness. For example |
||
}; | ||
const auto strip_leading_digits = [](const std::string &identifier) { | ||
static const std::regex identifier_regex{ | ||
"[A-Za-z\x80-\xff_][A-Za-z0-9_\x80-\xff]*"}; | ||
std::smatch match_results; | ||
bool found = std::regex_search(identifier, match_results, identifier_regex); | ||
POSTCONDITION(found); | ||
return match_results.str(0); | ||
}; | ||
return strip_leading_digits(replace_invalid_characters_with_underscore( | ||
replace_special_characters(name))); | ||
} | ||
|
||
std::string type_to_partial_identifier(const typet &type, const namespacet &ns) | ||
{ | ||
return type_name2type_identifier(type2name(type, ns)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice 👍 |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,15 @@ class namespacet; | |
|
||
std::string type2name(const typet &type, const namespacet &ns); | ||
|
||
std::string type2identifier(const typet &type, const namespacet &ns); | ||
/** | ||
* Constructs a string describing the given type, which can be used as part of | ||
* a `C` identifier. The resulting identifier is not guaranteed to uniquely | ||
* identify the type in all cases. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might need clarifying - in fact, it might actually be worth making this explicit in the function name, e.g. rename it to something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get that "identifier is a bit of a misnomer in this case. However There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
* @param type Internal representation of the type to describe. | ||
* @param ns Namespace for looking up any types on which the `type` parameter | ||
* depends. | ||
* @return An identifying string which can be used as part of a `C` identifier. | ||
*/ | ||
std::string type_to_partial_identifier(const typet &type, const namespacet &ns); | ||
|
||
#endif // CPROVER_ANSI_C_TYPE2NAME_H |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/*******************************************************************\ | ||
|
||
Module: Unit tests for type_name2type_identifier | ||
|
||
Author: Thomas Spriggs | ||
|
||
\*******************************************************************/ | ||
|
||
#include <testing-utils/use_catch.h> | ||
|
||
#include <numeric> | ||
|
||
extern std::string type_name2type_identifier(const std::string &name); | ||
|
||
TEST_CASE( | ||
"type_name2type_identifier sanity check", | ||
"[core][ansi-c][type_name2type_identifier]") | ||
{ | ||
CHECK(type_name2type_identifier("char") == "char"); | ||
} | ||
|
||
TEST_CASE( | ||
"type_name2type_identifier special characters", | ||
"[core][ansi-c][type_name2type_identifier]") | ||
{ | ||
CHECK(type_name2type_identifier("char*") == "char_ptr_"); | ||
CHECK(type_name2type_identifier("foo{bar}") == "foo_start_sub_bar_end_sub_"); | ||
} | ||
|
||
/** | ||
* @return A string containing all 7-bit ascii printable characters. | ||
*/ | ||
static std::string all_printable_characters() | ||
{ | ||
const char first = 32; | ||
const char last = 127; | ||
std::string printable_characters(last - first, ' '); | ||
std::iota(printable_characters.begin(), printable_characters.end(), first); | ||
return printable_characters; | ||
} | ||
|
||
TEST_CASE( | ||
"type_name2type_identifier invalid characters", | ||
"[core][ansi-c][type_name2type_identifier]") | ||
{ | ||
const std::string printable_characters = all_printable_characters(); | ||
CHECK( | ||
printable_characters == | ||
R"( !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`)" | ||
R"(abcdefghijklmnopqrstuvwxyz{|}~)"); | ||
CHECK( | ||
type_name2type_identifier(printable_characters) == | ||
"_ptr_0123456789_ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_" | ||
"start_sub_end_sub_"); | ||
} | ||
|
||
TEST_CASE( | ||
"type_name2type_identifier leading characters", | ||
"[core][ansi-c][type_name2type_identifier]") | ||
{ | ||
CHECK( | ||
type_name2type_identifier("0123456789banana_0123456789_split_0123456789") == | ||
"banana_0123456789_split_0123456789"); | ||
CHECK(type_name2type_identifier("0123456789_banana") == "_banana"); | ||
CHECK(type_name2type_identifier("_0123456789") == "_0123456789"); | ||
} | ||
|
||
TEST_CASE( | ||
"type_name2type_identifier UTF-8 characters", | ||
"[core][ansi-c][type_name2type_identifier]") | ||
{ | ||
const std::string utf8_example = "\xF0\x9F\x8D\x8C\xF0\x9F\x8D\xA8"; | ||
CHECK(type_name2type_identifier(utf8_example) == utf8_example); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still don’t like the
2
naming convention here, but changing that isn’t really in scope for this PR.