diffblue · thomasspriggs · Jul 16, 2020 · Jul 10, 2020 · Jul 10, 2020 · Jul 10, 2020
@@ -1893,15 +1893,15 @@ void c_typecheck_baset::typecheck_side_effect_function_call(
         /// XXX why are we only looking at the type of the first parameter?
         if(parameters.front().type().id() == ID_pointer)
         {
-          identifier_with_type =
-            id2string(identifier) + "_" +
-            type2identifier(parameters.front().type().subtype(), *this);
+          identifier_with_type = id2string(identifier) + "_" +
+                                 type_to_partial_identifier(
+                                   parameters.front().type().subtype(), *this);
         }
         else
         {
           identifier_with_type =
             id2string(identifier) + "_" +
-            type2identifier(parameters.front().type(), *this);
+            type_to_partial_identifier(parameters.front().type(), *this);
         }
         gcc_polymorphic->set_identifier(identifier_with_type);
 

@@ -19,6 +19,8 @@ Author: Daniel Kroening, kroening@cs.cmu.edu
 #include <util/std_types.h>
 #include <util/symbol_table.h>
 
+#include <regex>
+
 typedef std::unordered_map<irep_idt, std::pair<size_t, bool>> symbol_numbert;
 
 static std::string type2name(
@@ -277,27 +279,48 @@ std::string type2name(const typet &type, const namespacet &ns)
 /// If we want utilities like dump_c to work properly identifiers
 /// should ideally always be valid C identifiers
 /// This replaces some invalid characters that can appear in type2name output.
-std::string type2identifier(const typet &type, const namespacet &ns)
+std::string type_name2type_identifier(const std::string &name)
 {
-  auto type2name_res = type2name(type, ns);
-  std::string result{};
-  for(char c : type2name_res)
-  {
-    switch(c)
+  const auto replace_special_characters = [](const std::string &name) {
+    std::string result{};
+    for(char c : name)
     {
-    case '*':
-      result += "_ptr_";
-      break;
-    case '{':
-      result += "_start_sub_";
-      break;
-    case '}':
-      result += "_end_sub_";
-      break;
-    default:
-      result += c;
-      break;
+      switch(c)
+      {
+      case '*':
+        result += "_ptr_";
+        break;
+      case '{':
+        result += "_start_sub_";
+        break;
+      case '}':
+        result += "_end_sub_";
+        break;
+      default:
+        result += c;
+        break;
+      }
     }
-  }
-  return result;
+    return result;
+  };
+  const auto replace_invalid_characters_with_underscore =
+    [](const std::string &identifier) {
+      static const std::regex non_alpha_numeric{"[^A-Za-z0-9\x80-\xff]+"};
+      return std::regex_replace(identifier, non_alpha_numeric, "_");
+    };
+  const auto strip_leading_digits = [](const std::string &identifier) {
+    static const std::regex identifier_regex{
+      "[A-Za-z\x80-\xff_][A-Za-z0-9_\x80-\xff]*"};
+    std::smatch match_results;
+    bool found = std::regex_search(identifier, match_results, identifier_regex);
+    POSTCONDITION(found);
+    return match_results.str(0);
+  };
+  return strip_leading_digits(replace_invalid_characters_with_underscore(
+    replace_special_characters(name)));
+}
+
+std::string type_to_partial_identifier(const typet &type, const namespacet &ns)
+{
+  return type_name2type_identifier(type2name(type, ns));
 }
@@ -20,6 +20,15 @@ class namespacet;
 
 std::string type2name(const typet &type, const namespacet &ns);
 
-std::string type2identifier(const typet &type, const namespacet &ns);
+/**
+ * Constructs a string describing the given type, which can be used as part of
+ * a `C` identifier. The resulting identifier is not guaranteed to uniquely
+ * identify the type in all cases.
+ * @param type Internal representation of the type to describe.
+ * @param ns Namespace for looking up any types on which the `type` parameter
+ *   depends.
+ * @return An identifying string which can be used as part of a `C` identifier.
+ */
+std::string type_to_partial_identifier(const typet &type, const namespacet &ns);
 
 #endif // CPROVER_ANSI_C_TYPE2NAME_H
diff --git a/unit/Makefile b/unit/Makefile
@@ -15,6 +15,7 @@ SRC += analyses/ai/ai.cpp \
        analyses/does_remove_const/does_type_preserve_const_correctness.cpp \
        analyses/does_remove_const/is_type_at_least_as_const_as.cpp \
        ansi-c/max_malloc_size.cpp \
+       ansi-c/type2name.cpp \
        big-int/big-int.cpp \
        compound_block_locations.cpp \
        get_goto_model_from_c_test.cpp \

diff --git a/unit/ansi-c/type2name.cpp b/unit/ansi-c/type2name.cpp
@@ -0,0 +1,74 @@
+/*******************************************************************\
+
+Module: Unit tests for type_name2type_identifier
+
+Author: Thomas Spriggs
+
+\*******************************************************************/
+
+#include <testing-utils/use_catch.h>
+
+#include <numeric>
+
+extern std::string type_name2type_identifier(const std::string &name);
+
+TEST_CASE(
+  "type_name2type_identifier sanity check",
+  "[core][ansi-c][type_name2type_identifier]")
+{
+  CHECK(type_name2type_identifier("char") == "char");
+}
+
+TEST_CASE(
+  "type_name2type_identifier special characters",
+  "[core][ansi-c][type_name2type_identifier]")
+{
+  CHECK(type_name2type_identifier("char*") == "char_ptr_");
+  CHECK(type_name2type_identifier("foo{bar}") == "foo_start_sub_bar_end_sub_");
+}
+
+/**
+ * @return A string containing all 7-bit ascii printable characters.
+ */
+static std::string all_printable_characters()
+{
+  const char first = 32;
+  const char last = 127;
+  std::string printable_characters(last - first, ' ');
+  std::iota(printable_characters.begin(), printable_characters.end(), first);
+  return printable_characters;
+}
+
+TEST_CASE(
+  "type_name2type_identifier invalid characters",
+  "[core][ansi-c][type_name2type_identifier]")
+{
+  const std::string printable_characters = all_printable_characters();
+  CHECK(
+    printable_characters ==
+    R"( !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`)"
+    R"(abcdefghijklmnopqrstuvwxyz{|}~)");
+  CHECK(
+    type_name2type_identifier(printable_characters) ==
+    "_ptr_0123456789_ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_"
+    "start_sub_end_sub_");
+}
+
+TEST_CASE(
+  "type_name2type_identifier leading characters",
+  "[core][ansi-c][type_name2type_identifier]")
+{
+  CHECK(
+    type_name2type_identifier("0123456789banana_0123456789_split_0123456789") ==
+    "banana_0123456789_split_0123456789");
+  CHECK(type_name2type_identifier("0123456789_banana") == "_banana");
+  CHECK(type_name2type_identifier("_0123456789") == "_0123456789");
+}
+
+TEST_CASE(
+  "type_name2type_identifier UTF-8 characters",
+  "[core][ansi-c][type_name2type_identifier]")
+{
+  const std::string utf8_example = "\xF0\x9F\x8D\x8C\xF0\x9F\x8D\xA8";
+  CHECK(type_name2type_identifier(utf8_example) == utf8_example);
+}