diff --git a/core/ast.h b/core/ast.h index d530a0837..aa9f11593 100644 --- a/core/ast.h +++ b/core/ast.h @@ -25,6 +25,7 @@ limitations under the License. #include #include "core/lexer.h" +#include "core/string.h" enum ASTType { AST_APPLY, @@ -51,18 +52,17 @@ enum ASTType { AST_VAR }; - /** Represents a variable / parameter / field name. */ struct Identifier { - std::string name; - Identifier(const std::string &name) + String name; + Identifier(const String &name) : name(name) { } }; static inline std::ostream &operator<<(std::ostream &o, const Identifier *id) { - o << id->name; + o << encode_utf8(id->name); return o; } @@ -213,16 +213,16 @@ struct Function : public AST { /** Represents import "file". */ struct Import : public AST { - std::string file; - Import(const LocationRange &lr, const std::string &file) + String file; + Import(const LocationRange &lr, const String &file) : AST(lr, AST_IMPORT), file(file) { } }; /** Represents importstr "file". */ struct Importstr : public AST { - std::string file; - Importstr(const LocationRange &lr, const std::string &file) + String file; + Importstr(const LocationRange &lr, const String &file) : AST(lr, AST_IMPORTSTR), file(file) { } }; @@ -271,8 +271,8 @@ struct LiteralNumber : public AST { /** Represents JSON strings. */ struct LiteralString : public AST { - std::string value; - LiteralString(const LocationRange &lr, const std::string &value) + String value; + LiteralString(const LocationRange &lr, const String &value) : AST(lr, AST_LITERAL_STRING), value(value) { } }; @@ -373,7 +373,7 @@ struct Var : public AST { /** Allocates ASTs on demand, frees them in its destructor. */ class Allocator { - std::map internedIdentifiers; + std::map internedIdentifiers; std::vector allocated; public: template T* make(Args&&... args) @@ -386,7 +386,7 @@ class Allocator { * * The location used in the Identifier AST is that of the first one parsed. 
*/ - const Identifier *makeIdentifier(const std::string &name) + const Identifier *makeIdentifier(const String &name) { auto it = internedIdentifiers.find(name); if (it != internedIdentifiers.end()) { diff --git a/core/lexer.cpp b/core/lexer.cpp index f368ad5dc..c94f5c658 100644 --- a/core/lexer.cpp +++ b/core/lexer.cpp @@ -19,8 +19,9 @@ limitations under the License. #include #include -#include "core/static_error.h" #include "core/lexer.h" +#include "core/static_error.h" +#include "core/string.h" static bool is_upper(char c) { @@ -411,23 +412,8 @@ std::list jsonnet_lex(const std::string &filename, const char *input) codepoint += digit; } - // Encode in UTF-8. - if (codepoint < 0x0080) { - data += codepoint; - } else { - auto msg = "Codepoint out of ascii range."; - throw StaticError(filename, begin, msg); - } -/* - } else if (codepoint < 0x0800) { - data += 0xC0 | (codepoint >> 6); - data += 0x80 | (codepoint & 0x3F); - } else { - data += 0xE0 | (codepoint >> 12); - data += 0x80 | ((codepoint >> 6) & 0x3F); - data += 0x80 | (codepoint & 0x3F); - } -*/ + encode_utf8(codepoint, data); + // Leave us on the last char, ready for the ++c at // the outer for loop. c += 3; diff --git a/core/lexer.h b/core/lexer.h index 981d83429..d21f53501 100644 --- a/core/lexer.h +++ b/core/lexer.h @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include "core/string.h" #include "core/static_error.h" struct Token { @@ -71,6 +72,8 @@ struct Token { std::string data; + String data32(void) { return decode_utf8(data); } + LocationRange location; Token(Kind kind, const std::string &data, const LocationRange &location) diff --git a/core/parser.cpp b/core/parser.cpp index 2975a486a..a90cc9d5e 100644 --- a/core/parser.cpp +++ b/core/parser.cpp @@ -34,26 +34,28 @@ limitations under the License. // For generated ASTs, use a bogus location. 
static const LocationRange gen; -std::string jsonnet_unparse_escape(const std::string &str) +String jsonnet_unparse_escape(const String &str) { - std::stringstream ss; - ss << '\"'; + StringStream ss; + ss << U'\"'; for (std::size_t i=0 ; i 0x7e) { + if (c < 0x20 || (c >= 0x7f && c <= 0x9f)) { //Unprintable, use \u - ss << "\\u" << std::hex << std::setfill('0') << std::setw(4) - << unsigned((unsigned char)(c)); + std::stringstream ss8; + ss8 << "\\u" << std::hex << std::setfill('0') << std::setw(4) + << (unsigned long)(c); + ss << decode_utf8(ss8.str()); } else { // Printable, write verbatim ss << c; @@ -61,7 +63,7 @@ std::string jsonnet_unparse_escape(const std::string &str) } } } - ss << '\"'; + ss << U'\"'; return ss.str(); } @@ -129,16 +131,16 @@ static std::string unparse(const AST *ast_) ss << "function "; const char *prefix = "("; for (const Identifier *arg : ast->parameters) { - ss << prefix << arg->name; + ss << prefix << encode_utf8(arg->name); prefix = ", "; } ss << ") " << unparse(ast->body); } else if (auto *ast = dynamic_cast(ast_)) { - ss << "import " << jsonnet_unparse_escape(ast->file); + ss << "import " << encode_utf8(jsonnet_unparse_escape(ast->file)); } else if (auto *ast = dynamic_cast(ast_)) { - ss << "importstr " << jsonnet_unparse_escape(ast->file); + ss << "importstr " << encode_utf8(jsonnet_unparse_escape(ast->file)); } else if (auto *ast = dynamic_cast(ast_)) { ss << unparse(ast->target) << "[" @@ -147,7 +149,7 @@ static std::string unparse(const AST *ast_) } else if (auto *ast = dynamic_cast(ast_)) { const char *prefix = "local "; for (const auto &bind : ast->binds) { - ss << prefix << bind.first->name << " = " << unparse(bind.second); + ss << prefix << encode_utf8(bind.first->name) << " = " << unparse(bind.second); prefix = ", "; } ss << "; " << unparse(ast->body); @@ -159,7 +161,7 @@ static std::string unparse(const AST *ast_) ss << ast->value; } else if (auto *ast = dynamic_cast(ast_)) { - ss << jsonnet_unparse_escape(ast->value); + 
ss << encode_utf8(jsonnet_unparse_escape(ast->value)); } else if (dynamic_cast(ast_)) { ss << "null"; @@ -191,7 +193,7 @@ static std::string unparse(const AST *ast_) } else if (auto *ast = dynamic_cast(ast_)) { ss << "{[" << unparse(ast->field) << "]: " << unparse(ast->value); - ss << " for " << ast->id->name << " in " << unparse(ast->array); + ss << " for " << encode_utf8(ast->id->name) << " in " << unparse(ast->array); ss << "}"; } else if (dynamic_cast(ast_)) { @@ -204,7 +206,7 @@ static std::string unparse(const AST *ast_) ss << uop_string(ast->op) << unparse(ast->expr); } else if (auto *ast = dynamic_cast(ast_)) { - ss << ast->id->name; + ss << encode_utf8(ast->id->name); } else { std::cerr << "INTERNAL ERROR: Unknown AST: " << ast_ << std::endl; @@ -458,7 +460,7 @@ namespace { void parseBind(Local::Binds &binds, unsigned obj_level) { Token var_id = popExpect(Token::IDENTIFIER); - auto *id = alloc->makeIdentifier(var_id.data); + auto *id = alloc->makeIdentifier(var_id.data32()); if (binds.find(id) != binds.end()) { throw StaticError(var_id.location, "Duplicate local var: " + var_id.data); @@ -487,7 +489,7 @@ namespace { // Hidden variable to allow outer/top binding. 
if (obj_level == 0) { - const Identifier *hidden_var = alloc->makeIdentifier("$"); + const Identifier *hidden_var = alloc->makeIdentifier(U"$"); let_binds[hidden_var] = alloc->make(LocationRange()); } @@ -546,7 +548,7 @@ namespace { throw StaticError(next.location, "Unexpected comma before for."); } Token id_tok = popExpect(Token::IDENTIFIER); - const Identifier *id = alloc->makeIdentifier(id_tok.data); + const Identifier *id = alloc->makeIdentifier(id_tok.data32()); popExpect(Token::IN); AST *array = parse(MAX_PRECEDENCE, obj_level); Token last = popExpect(Token::BRACE_R); @@ -592,14 +594,14 @@ namespace { if (!literal_fields.insert(next.data).second) { throw StaticError(next.location, "Duplicate field: "+next.data); } - AST *field_expr = alloc->make(next.location, next.data); + AST *field_expr = alloc->make(next.location, next.data32()); AST *body = parse(MAX_PRECEDENCE, obj_level+1); if (is_method) { body = alloc->make(body->location, params, body); } if (plus_sugar) { - AST *f = alloc->make(plus_loc, next.data); + AST *f = alloc->make(plus_loc, next.data32()); AST *super_f = alloc->make(plus_loc, alloc->make(LocationRange()), f); body = alloc->make(body->location, super_f, BOP_PLUS, body); } @@ -639,7 +641,7 @@ namespace { pop(); msg = parse(MAX_PRECEDENCE, obj_level + 1); } else { - std::string msg_str = "Assertion failed."; + auto msg_str = U"Assertion failed."; msg = alloc->make(cond->location, msg_str); } AST *tru = alloc->make(gen, true); @@ -703,15 +705,15 @@ namespace { LocationRange l; pop(); Token id_token = popExpect(Token::IDENTIFIER); - const Identifier *id = alloc->makeIdentifier(id_token.data); + const Identifier *id = alloc->makeIdentifier(id_token.data32()); std::vector params = {id}; - AST *std = alloc->make(l, alloc->makeIdentifier("std")); + AST *std = alloc->make(l, alloc->makeIdentifier(U"std")); AST *map_func = alloc->make(first->location, params, first); popExpect(Token::IN); AST *arr = parse(MAX_PRECEDENCE, obj_level); Token maybe_if = 
pop(); if (maybe_if.kind == Token::BRACKET_R) { - AST *map_str = alloc->make(l, "map"); + AST *map_str = alloc->make(l, U"map"); AST *map = alloc->make(l, std, map_str); std::vector args = {map_func, arr}; return alloc->make(span(tok, maybe_if), map, args, false); @@ -719,7 +721,7 @@ namespace { AST *cond = parse(MAX_PRECEDENCE, obj_level); Token last = popExpect(Token::BRACKET_R); AST *filter_func = alloc->make(cond->location, params, cond); - AST *fmap_str = alloc->make(l, "filterMap"); + AST *fmap_str = alloc->make(l, U"filterMap"); AST *fmap = alloc->make(l, std, fmap_str); std::vector args = {filter_func, map_func, arr}; return alloc->make(span(tok, last), fmap, args, false); @@ -766,7 +768,7 @@ namespace { return alloc->make(span(tok), strtod(tok.data.c_str(), nullptr)); case Token::STRING: - return alloc->make(span(tok), tok.data); + return alloc->make(span(tok), tok.data32()); case Token::FALSE: return alloc->make(span(tok), false); @@ -780,12 +782,12 @@ namespace { // Import case Token::IMPORT: { Token file = popExpect(Token::STRING); - return alloc->make(span(tok, file), file.data); + return alloc->make(span(tok, file), file.data32()); } case Token::IMPORTSTR: { Token file = popExpect(Token::STRING); - return alloc->make(span(tok, file), file.data); + return alloc->make(span(tok, file), file.data32()); } @@ -794,10 +796,10 @@ namespace { if (obj_level == 0) { throw StaticError(tok.location, "No top-level object found."); } - return alloc->make(span(tok), alloc->makeIdentifier("$")); + return alloc->make(span(tok), alloc->makeIdentifier(U"$")); case Token::IDENTIFIER: - return alloc->make(span(tok), alloc->makeIdentifier(tok.data)); + return alloc->make(span(tok), alloc->makeIdentifier(tok.data32())); case Token::SELF: return alloc->make(span(tok)); @@ -827,7 +829,7 @@ namespace { pop(); msg = parse(MAX_PRECEDENCE, obj_level); } else { - std::string msg_str = "Assertion failed."; + auto msg_str = U"Assertion failed."; msg = alloc->make(begin.location, 
msg_str); } popExpect(Token::SEMICOLON); @@ -968,7 +970,7 @@ namespace { } else if (op.kind == Token::DOT) { Token field = popExpect(Token::IDENTIFIER); - AST *index = alloc->make(span(field), field.data); + AST *index = alloc->make(span(field), field.data32()); lhs = alloc->make(span(begin, field), lhs, index); } else if (op.kind == Token::PAREN_L) { @@ -989,8 +991,8 @@ namespace { } else if (op.data == "%") { AST *rhs = parse(precedence - 1, obj_level); - AST *std = alloc->make(gen, alloc->makeIdentifier("std")); - AST *mod_str = alloc->make(gen, "mod"); + AST *std = alloc->make(gen, alloc->makeIdentifier(U"std")); + AST *mod_str = alloc->make(gen, U"mod"); AST *f_mod = alloc->make(gen, std, mod_str); std::vector args = {lhs, rhs}; lhs = alloc->make(span(begin, rhs), f_mod, args, false); @@ -1004,8 +1006,8 @@ namespace { invert = true; } if (bop == BOP_MANIFEST_EQUAL) { - AST *std = alloc->make(gen, alloc->makeIdentifier("std")); - AST *equals_str = alloc->make(gen, "equals"); + AST *std = alloc->make(gen, alloc->makeIdentifier(U"std")); + AST *equals_str = alloc->make(gen, U"equals"); AST *f_equals = alloc->make(gen, std, equals_str); std::vector args = {lhs, rhs}; lhs = alloc->make(span(begin, rhs), f_equals, args, false); @@ -1027,31 +1029,31 @@ static unsigned long max_builtin = 24; BuiltinDecl jsonnet_builtin_decl(unsigned long builtin) { switch (builtin) { - case 0: return {"makeArray", {"sz", "func"}}; - case 1: return {"pow", {"x", "n"}}; - case 2: return {"floor", {"x"}}; - case 3: return {"ceil", {"x"}}; - case 4: return {"sqrt", {"x"}}; - case 5: return {"sin", {"x"}}; - case 6: return {"cos", {"x"}}; - case 7: return {"tan", {"x"}}; - case 8: return {"asin", {"x"}}; - case 9: return {"acos", {"x"}}; - case 10: return {"atan", {"x"}}; - case 11: return {"type", {"x"}}; - case 12: return {"filter", {"func", "arr"}}; - case 13: return {"objectHasEx", {"obj", "f", "inc_hidden"}}; - case 14: return {"length", {"x"}}; - case 15: return {"objectFieldsEx", 
{"obj", "inc_hidden"}}; - case 16: return {"codepoint", {"str"}}; - case 17: return {"char", {"n"}}; - case 18: return {"log", {"n"}}; - case 19: return {"exp", {"n"}}; - case 20: return {"mantissa", {"n"}}; - case 21: return {"exponent", {"n"}}; - case 22: return {"modulo", {"a", "b"}}; - case 23: return {"extVar", {"x"}}; - case 24: return {"primitiveEquals", {"a", "b"}}; + case 0: return {U"makeArray", {U"sz", U"func"}}; + case 1: return {U"pow", {U"x", U"n"}}; + case 2: return {U"floor", {U"x"}}; + case 3: return {U"ceil", {U"x"}}; + case 4: return {U"sqrt", {U"x"}}; + case 5: return {U"sin", {U"x"}}; + case 6: return {U"cos", {U"x"}}; + case 7: return {U"tan", {U"x"}}; + case 8: return {U"asin", {U"x"}}; + case 9: return {U"acos", {U"x"}}; + case 10: return {U"atan", {U"x"}}; + case 11: return {U"type", {U"x"}}; + case 12: return {U"filter", {U"func", U"arr"}}; + case 13: return {U"objectHasEx", {U"obj", U"f", U"inc_hidden"}}; + case 14: return {U"length", {U"x"}}; + case 15: return {U"objectFieldsEx", {U"obj", U"inc_hidden"}}; + case 16: return {U"codepoint", {U"str"}}; + case 17: return {U"char", {U"n"}}; + case 18: return {U"log", {U"n"}}; + case 19: return {U"exp", {U"n"}}; + case 20: return {U"mantissa", {U"n"}}; + case 21: return {U"exponent", {U"n"}}; + case 22: return {U"modulo", {U"a", U"b"}}; + case 23: return {U"extVar", {U"x"}}; + case 24: return {U"primitiveEquals", {U"a", U"b"}}; default: std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl; std::abort(); @@ -1103,11 +1105,11 @@ AST *jsonnet_parse(Allocator *alloc, const std::string &file, const char *input) fields.emplace_back(alloc->make(gen, decl.name), Object::Field::HIDDEN, alloc->make(gen, c, params)); } - fields.emplace_back(alloc->make(gen, "thisFile"), Object::Field::HIDDEN, - alloc->make(gen, file)); + fields.emplace_back(alloc->make(gen, U"thisFile"), Object::Field::HIDDEN, + alloc->make(gen, decode_utf8(file))); Local::Binds std_binds; - 
std_binds[alloc->makeIdentifier("std")] = std_obj; + std_binds[alloc->makeIdentifier(U"std")] = std_obj; AST *wrapped = alloc->make(expr->location, std_binds, expr); return wrapped; } diff --git a/core/parser.h b/core/parser.h index a2d5eace2..d4f326f7e 100644 --- a/core/parser.h +++ b/core/parser.h @@ -19,8 +19,9 @@ limitations under the License. #include -#include "core/lexer.h" #include "core/ast.h" +#include "core/lexer.h" +#include "core/string.h" /** Parse a given JSON++ string. * @@ -34,15 +35,15 @@ AST *jsonnet_parse(Allocator *alloc, const std::string &file, const char *input) /** Escapes a string for JSON output. */ -std::string jsonnet_unparse_escape(const std::string &str); +String jsonnet_unparse_escape(const String &str); /** Outputs a number, trying to preserve precision as well as possible. */ std::string jsonnet_unparse_number(double v); struct BuiltinDecl { - std::string name; - std::vector params; + String name; + std::vector params; }; /** Returns the name of each built-in function. */ diff --git a/core/state.h b/core/state.h index cca6169a2..ff6250761 100644 --- a/core/state.h +++ b/core/state.h @@ -260,8 +260,8 @@ namespace { /** Stores a simple string on the heap. 
*/ struct HeapString : public HeapEntity { - const std::string value; - HeapString(const std::string &value) + const String value; + HeapString(const String &value) : value(value) { } }; diff --git a/core/static_analysis.cpp b/core/static_analysis.cpp index daa033929..28d573d55 100644 --- a/core/static_analysis.cpp +++ b/core/static_analysis.cpp @@ -68,7 +68,8 @@ static IdSet static_analysis(AST *ast_, bool in_object, const IdSet &vars) IdSet params; for (auto *p : ast->parameters) { if (params.find(p) != params.end()) { - throw StaticError(ast_->location, "Duplicate function parameter: " + p->name); + std::string msg = "Duplicate function parameter: " + encode_utf8(p->name); + throw StaticError(ast_->location, msg); } params.insert(p); new_vars.insert(p); @@ -148,7 +149,7 @@ static IdSet static_analysis(AST *ast_, bool in_object, const IdSet &vars) } else if (auto *ast = dynamic_cast(ast_)) { if (vars.find(ast->id) == vars.end()) { - throw StaticError(ast->location, "Unknown variable: "+ast->id->name); + throw StaticError(ast->location, "Unknown variable: "+encode_utf8(ast->id->name)); } r.insert(ast->id); diff --git a/core/string.h b/core/string.h new file mode 100644 index 000000000..8e6b60178 --- /dev/null +++ b/core/string.h @@ -0,0 +1,158 @@ +/* +Copyright 2015 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef JSONNET_STRING_H +#define JSONNET_STRING_H + +/** Substituted when a unicode translation format encoding error is encountered. 
*/
+#define JSONNET_CODEPOINT_ERROR 0xfffd
+#define JSONNET_CODEPOINT_MAX 0x110000
+
+/** Convert a unicode codepoint to UTF8.
+ *
+ * \param x The unicode codepoint (values >= JSONNET_CODEPOINT_MAX are replaced by JSONNET_CODEPOINT_ERROR).
+ * \param s The UTF-8 string to append to.
+ * \returns The number of bytes appended (1 to 4).
+ */
+static inline int encode_utf8(char32_t x, std::string &s)
+{
+    if (x >= JSONNET_CODEPOINT_MAX)
+        x = JSONNET_CODEPOINT_ERROR;
+
+    // 00ZZZzzz 00zzYYYY 00Yyyyxx 00xxxxxx  (unsigned: the 4-byte marker 0xF0808080 exceeds a 32-bit signed long)
+    unsigned long bytes = ((x & 0x1C0000) << 6) | ((x & 0x03F000) << 4) | ((x & 0x0FC0) << 2) | (x & 0x3F);
+
+    if (x < 0x80) {
+        s.push_back((char)x);
+        return 1;
+    } else if (x < 0x800) { // note that capital 'Y' bits must be 0
+        bytes |= 0xC080;
+        s.push_back((bytes >> 8) & 0xFF);
+        s.push_back((bytes >> 0) & 0xFF);
+        return 2;
+    } else if (x < 0x10000) { // note that 'z' bits must be 0
+        bytes |= 0xE08080;
+        s.push_back((bytes >> 16) & 0xFF);
+        s.push_back((bytes >> 8) & 0xFF);
+        s.push_back((bytes >> 0) & 0xFF);
+        return 3;
+    } else if (x < 0x110000) { // note that capital 'Z' bits must be 0
+        bytes |= 0xF0808080;
+        s.push_back((bytes >> 24) & 0xFF);
+        s.push_back((bytes >> 16) & 0xFF);
+        s.push_back((bytes >> 8) & 0xFF);
+        s.push_back((bytes >> 0) & 0xFF);
+        return 4;
+    } else { // unreachable: x was clamped below JSONNET_CODEPOINT_MAX above
+        std::cerr << "Should never get here." << std::endl;
+        abort();
+    }
+}
+
+/** Convert the UTF8 byte sequence in the given string to a unicode code point.
+ *
+ * \param str The string.
+ * \param i On entry, the index of the first byte to decode; on return, the index of the last
+ * byte consumed for this codepoint.
+ * \returns The decoded unicode codepoint, or JSONNET_CODEPOINT_ERROR if the sequence is malformed.
+ */
+static inline char32_t decode_utf8(const std::string &str, size_t &i)
+{
+    char c0 = str[i];
+    if ((c0 & 0x80) == 0) { //0xxxxxxx
+        return c0;
+    } else if ((c0 & 0xE0) == 0xC0) { //110yyyxx 10xxxxxx
+        if (i+1 >= str.length()) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c1 = str[++i];
+        if ((c1 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        return ((c0 & 0x1F) << 6ul) | (c1 & 0x3F);
+    } else if ((c0 & 0xF0) == 0xE0) { //1110yyyy 10yyyyxx 10xxxxxx
+        if (i+2 >= str.length()) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c1 = str[++i];
+        if ((c1 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c2 = str[++i];
+        if ((c2 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        return ((c0 & 0xF) << 12ul) | ((c1 & 0x3F) << 6) | (c2 & 0x3F);
+    } else if ((c0 & 0xF8) == 0xF0) { //11110zzz 10zzyyyy 10yyyyxx 10xxxxxx  (lead byte is 0xF0..0xF7)
+        if (i+3 >= str.length()) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c1 = str[++i];
+        if ((c1 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c2 = str[++i];
+        if ((c2 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        char c3 = str[++i];
+        if ((c3 & 0xC0) != 0x80) {
+            return JSONNET_CODEPOINT_ERROR;
+        }
+        return ((c0 & 0x7) << 18ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); // 3+6+6+6 payload bits
+    } else {
+        return JSONNET_CODEPOINT_ERROR;
+    }
+}
+
+/** A string class capable of holding unicode codepoints. */
+typedef std::basic_string<char32_t> String;
+
+
+static inline void encode_utf8(const String &s, std::string &r)
+{
+    for (char32_t cp : s)
+        encode_utf8(cp, r);
+}
+
+static inline std::string encode_utf8(const String &s)
+{
+    std::string r;
+    encode_utf8(s, r);
+    return r;
+}
+
+static inline String decode_utf8(const std::string &s)
+{
+    String r;
+    for (size_t i = 0; i < s.length(); ++i) // decode_utf8(s, i) leaves i on the last byte it consumed
+        r.push_back(decode_utf8(s, i));
+    return r;
+}
+
+/** A stringstream-like class capable of holding unicode codepoints.
+ * The C++ standard does not support std::basic_stringstreamfilled) continue; if (!thunk->content.isHeap()) continue; if (e != thunk->content.v.h) continue; - name = pair.first->name; + name = encode_utf8(pair.first->name); } // Do not go into the next call frame, keep local reasoning. if (f.isCall()) break; @@ -272,11 +272,11 @@ namespace { if (dynamic_cast(e)) { return "object <" + name + ">"; } else if (auto *thunk = dynamic_cast(e)) { - return "thunk <" + thunk->name->name + ">"; + return "thunk <" + encode_utf8(thunk->name->name) + ">"; } else { const auto *func = static_cast(e); if (func->body == nullptr) { - name = jsonnet_builtin_decl(func->builtin).name; + name = encode_utf8(jsonnet_builtin_decl(func->builtin).name); return "builtin function <" + name + ">"; } return "function <" + name + ">"; @@ -438,7 +438,7 @@ namespace { }; /** Cache for imported Jsonnet files. */ - std::map, + std::map, const ImportCacheValue *> cachedImports; /** External variables for std.extVar. */ @@ -550,7 +550,7 @@ namespace { return r; } - Value makeString(const std::string &v) + Value makeString(const String &v) { Value r; r.t = Value::STRING; @@ -669,7 +669,7 @@ namespace { * \param loc Location of the import statement. * \param file Path to the filename. */ - AST *import(const LocationRange &loc, const std::string &file) + AST *import(const LocationRange &loc, const String &file) { const ImportCacheValue *input = importString(loc, file); AST *expr = jsonnet_parse(alloc, input->foundHere, input->content.c_str()); @@ -686,11 +686,11 @@ namespace { * \param file Path to the filename. 
* \param found_here If non-null, used to store the actual path of the file */ - const ImportCacheValue *importString(const LocationRange &loc, const std::string &file) + const ImportCacheValue *importString(const LocationRange &loc, const String &file) { std::string dir = dir_name(loc.file); - std::pair key(dir, file); + std::pair key(dir, file); const ImportCacheValue *cached_value = cachedImports[key]; if (cached_value != nullptr) return cached_value; @@ -699,16 +699,14 @@ namespace { int success = 0; char *found_here_cptr; char *content = - importCallback(importCallbackContext, dir.c_str(), file.c_str(), + importCallback(importCallbackContext, dir.c_str(), encode_utf8(file).c_str(), &found_here_cptr, &success); std::string input(content); - - input.assign(content); ::free(content); if (!success) { - std::string msg = "Couldn't open import \"" + file + "\": "; + std::string msg = "Couldn't open import \"" + encode_utf8(file) + "\": "; msg += input; throw makeError(loc, msg); } @@ -760,8 +758,8 @@ namespace { unsigned max_stack, double gc_min_objects, double gc_growth_trigger, JsonnetImportCallback *import_callback, void *import_callback_context) : heap(gc_min_objects, gc_growth_trigger), stack(max_stack), alloc(alloc), - idArrayElement(alloc->makeIdentifier("array_element")), - idInvariant(alloc->makeIdentifier("object_assert")), externalVars(ext_vars), + idArrayElement(alloc->makeIdentifier(U"array_element")), + idInvariant(alloc->makeIdentifier(U"object_assert")), externalVars(ext_vars), importCallback(import_callback), importCallbackContext(import_callback_context) { scratch = makeNull(); @@ -799,9 +797,9 @@ namespace { return; } bad:; - const std::string &name = jsonnet_builtin_decl(builtin).name; + const String &name = jsonnet_builtin_decl(builtin).name; std::stringstream ss; - ss << "Builtin function " + name + " expected ("; + ss << "Builtin function " + encode_utf8(name) + " expected ("; const char *prefix = ""; for (auto p : params) { ss << prefix << 
type_str(p); @@ -818,9 +816,9 @@ namespace { } - std::string toString(const LocationRange &loc) + String toString(const LocationRange &loc) { - return manifestJson(loc, false, ""); + return manifestJson(loc, false, U""); } @@ -867,7 +865,7 @@ namespace { HeapObject *self = nullptr; HeapLeafObject *found = findObject(f, obj, obj, 0, found_at, self); if (found == nullptr) { - throw makeError(loc, "Field does not exist: " + f->name); + throw makeError(loc, "Field does not exist: " + encode_utf8(f->name)); } if (auto *simp = dynamic_cast(found)) { auto it = simp->fields.find(f); @@ -996,7 +994,7 @@ namespace { case AST_IMPORTSTR: { const auto &ast = *static_cast(ast_); const ImportCacheValue *value = importString(ast.location, ast.file); - scratch = makeString(value->content); + scratch = makeString(decode_utf8(value->content)); } break; case AST_INDEX: { @@ -1102,8 +1100,8 @@ namespace { const auto &ast = *static_cast(ast_); auto *thunk = stack.lookUpVar(ast.id); if (thunk == nullptr) { - std::cerr << "INTERNAL ERROR: Could not bind variable: " << ast.id->name - << std::endl; + std::cerr << "INTERNAL ERROR: Could not bind variable: " + << encode_utf8(ast.id->name) << std::endl; std::abort(); } if (thunk->filled) { @@ -1374,9 +1372,9 @@ namespace { break; case Value::STRING: { - const std::string &lhs_str = + const String &lhs_str = static_cast(lhs.v.h)->value; - const std::string &rhs_str = + const String &rhs_str = static_cast(rhs.v.h)->value; switch (ast.op) { case BOP_PLUS: @@ -1534,31 +1532,31 @@ namespace { case 11: { // type switch (args[0].t) { case Value::NULL_TYPE: - scratch = makeString("null"); + scratch = makeString(U"null"); break; case Value::BOOLEAN: - scratch = makeString("boolean"); + scratch = makeString(U"boolean"); break; case Value::DOUBLE: - scratch = makeString("number"); + scratch = makeString(U"number"); break; case Value::ARRAY: - scratch = makeString("array"); + scratch = makeString(U"array"); break; case Value::FUNCTION: - scratch = 
makeString("function"); + scratch = makeString(U"function"); break; case Value::OBJECT: - scratch = makeString("object"); + scratch = makeString(U"object"); break; case Value::STRING: - scratch = makeString("string"); + scratch = makeString(U"string"); break; } @@ -1649,7 +1647,7 @@ namespace { const auto *obj = static_cast(args[0].v.h); bool include_hidden = args[1].v.b; // Stash in a set first to sort them. - std::set fields; + std::set fields; for (const auto &field : objectFields(obj, !include_hidden)) { fields.insert(field->name); } @@ -1665,7 +1663,7 @@ namespace { case 16: { // codepoint validateBuiltinArgs(loc, builtin, args, {Value::STRING}); - const std::string &str = + const String &str = static_cast(args[0].v.h)->value; if (str.length() != 1) { std::stringstream ss; @@ -1673,26 +1671,25 @@ namespace { << str.length(); throw makeError(loc, ss.str()); } - char c = static_cast(args[0].v.h)->value[0]; - scratch = makeDouble((unsigned char)(c)); + char32_t c = static_cast(args[0].v.h)->value[0]; + scratch = makeDouble((unsigned long)(c)); } break; case 17: { // char validateBuiltinArgs(loc, builtin, args, {Value::DOUBLE}); - long l = (unsigned long)(args[0].v.d); + long l = long(args[0].v.d); if (l < 0) { std::stringstream ss; ss << "Codepoints must be >= 0, got " << l; throw makeError(ast.location, ss.str()); } - if (l >= 128) { + if (l >= JSONNET_CODEPOINT_MAX) { std::stringstream ss; - ss << "Sorry, only ASCII supported right now. 
"; - ss << "Codepoints must be < 128, got " << l; + ss << "Invalid unicode codepoint, got " << l; throw makeError(ast.location, ss.str()); } - char c = l; - scratch = makeString(std::string(&c, 1)); + char32_t c = l; + scratch = makeString(String(&c, 1)); } break; case 18: { // log @@ -1731,15 +1728,17 @@ namespace { case 23: { // extVar validateBuiltinArgs(loc, builtin, args, {Value::STRING}); - const std::string &var = + const String &var = static_cast(args[0].v.h)->value; - if (externalVars.find(var) == externalVars.end()) { - throw makeError(ast.location, - "Undefined external variable: " + var); + std::string var8 = encode_utf8(var); + auto it = externalVars.find(var8); + if (it == externalVars.end()) { + std::string msg = "Undefined external variable: " + var8; + throw makeError(ast.location, msg); } - const VmExt &ext = externalVars[var]; + const VmExt &ext = it->second; if (ext.isCode) { - std::string filename = ""; + std::string filename = ""; AST *expr = jsonnet_parse(alloc, filename, ext.data.c_str()); jsonnet_static_analysis(expr); @@ -1747,7 +1746,7 @@ namespace { stack.pop(); goto recurse; } else { - scratch = makeString(ext.data); + scratch = makeString(decode_utf8(ext.data)); } } break; @@ -1834,7 +1833,8 @@ namespace { if (scratch.t != Value::STRING) throw makeError(ast.location, "Error message must be string, got " + type_str(scratch) + "."); - throw makeError(ast.location, static_cast(scratch.v.h)->value); + std::string msg = encode_utf8(static_cast(scratch.v.h)->value); + throw makeError(ast.location, msg); } break; case FRAME_IF: { @@ -1883,7 +1883,7 @@ namespace { "Object index must be string, got " + type_str(scratch) + "."); } - const std::string &index_name = + const String &index_name = static_cast(scratch.v.h)->value; auto *fid = alloc->makeIdentifier(index_name); stack.pop(); @@ -1897,7 +1897,6 @@ namespace { "String index must be a number, got " + type_str(scratch) + "."); } - // TODO(dcunnin): UTF-8 support goes here. 
long sz = obj->value.length(); long i = (long)scratch.v.d; if (i < 0 || i >= sz) { @@ -1906,7 +1905,7 @@ namespace { << " not within [0, " << sz << ")"; throw makeError(ast.location, ss.str()); } - char ch[] = {obj->value[i], '\0'}; + char32_t ch[] = {obj->value[i], U'\0'}; scratch = makeString(ch); } else { std::cerr << "INTERNAL ERROR: Not object / array / string." @@ -1987,8 +1986,9 @@ namespace { const auto &fname = static_cast(scratch.v.h)->value; const Identifier *fid = alloc->makeIdentifier(fname); if (f.objectFields.find(fid) != f.objectFields.end()) { - throw makeError(ast.location, - "Duplicate field name: \"" + fname + "\""); + std::string msg = "Duplicate field name: \"" + + encode_utf8(fname) + "\""; + throw makeError(ast.location, msg); } f.objectFields[fid].hide = f.fit->hide; f.objectFields[fid].body = f.fit->body; @@ -2039,7 +2039,7 @@ namespace { const Identifier *fid = alloc->makeIdentifier(fname); if (f.elements.find(fid) != f.elements.end()) { throw makeError(ast.location, - "Duplicate field name: \"" + fname + "\""); + "Duplicate field name: \"" + encode_utf8(fname) + "\""); } f.elements[fid] = arr->elements[f.elementId]; f.elementId++; @@ -2059,20 +2059,20 @@ namespace { const auto &ast = *static_cast(f.ast); const Value &lhs = stack.top().val; const Value &rhs = stack.top().val2; - std::stringstream ss; + String output; if (lhs.t == Value::STRING) { - ss << static_cast(lhs.v.h)->value; + output.append(static_cast(lhs.v.h)->value); } else { scratch = lhs; - ss << toString(ast.left->location); + output.append(toString(ast.left->location)); } if (rhs.t == Value::STRING) { - ss << static_cast(rhs.v.h)->value; + output.append(static_cast(rhs.v.h)->value); } else { scratch = rhs; - ss << toString(ast.right->location); + output.append(toString(ast.right->location)); } - scratch = makeString(ss.str()); + scratch = makeString(output); } break; case FRAME_UNARY: { @@ -2136,21 +2136,20 @@ namespace { * * \param multiline If true, will print objects and 
arrays in an indented fashion. */ - std::string manifestJson(const LocationRange &loc, bool multiline, - const std::string &indent) + String manifestJson(const LocationRange &loc, bool multiline, const String &indent) { // Printing fields means evaluating and binding them, which can trigger // garbage collection. - std::stringstream ss; + StringStream ss; switch (scratch.t) { case Value::ARRAY: { HeapArray *arr = static_cast(scratch.v.h); if (arr->elements.size() == 0) { - ss << "[ ]"; + ss << U"[ ]"; } else { - const char *prefix = multiline ? "[\n" : "["; - std::string indent2 = multiline ? indent + " " : indent; + const char32_t *prefix = multiline ? U"[\n" : U"["; + String indent2 = multiline ? indent + U" " : indent; for (auto *thunk : arr->elements) { LocationRange tloc = thunk->body == nullptr ? loc @@ -2172,26 +2171,26 @@ namespace { scratch = stack.top().val; stack.pop(); ss << prefix << indent2 << element; - prefix = multiline ? ",\n" : ", "; + prefix = multiline ? U",\n" : U", "; } - ss << (multiline ? "\n" : "") << indent << "]"; + ss << (multiline ? U"\n" : U"") << indent << U"]"; } } break; case Value::BOOLEAN: - ss << (scratch.v.b ? "true" : "false"); + ss << (scratch.v.b ? U"true" : U"false"); break; case Value::DOUBLE: - ss << jsonnet_unparse_number(scratch.v.d); + ss << decode_utf8(jsonnet_unparse_number(scratch.v.d)); break; case Value::FUNCTION: throw makeError(loc, "Couldn't manifest function in JSON output."); case Value::NULL_TYPE: - ss << "null"; + ss << U"null"; break; case Value::OBJECT: { @@ -2199,15 +2198,15 @@ namespace { runInvariants(loc, obj); // Using std::map has the useful side-effect of ordering the fields // alphabetically. - std::map fields; + std::map fields; for (const auto &f : objectFields(obj, true)) { fields[f->name] = f; } if (fields.size() == 0) { - ss << "{ }"; + ss << U"{ }"; } else { - std::string indent2 = multiline ? indent + " " : indent; - const char *prefix = multiline ? 
"{\n" : "{"; + String indent2 = multiline ? indent + U" " : indent; + const char32_t *prefix = multiline ? U"{\n" : U"{"; for (const auto &f : fields) { // pushes FRAME_CALL const AST *body = objectIndex(loc, obj, f.second); @@ -2218,16 +2217,16 @@ namespace { // get GC'd. scratch = stack.top().val; stack.pop(); - ss << prefix << indent2 << "\"" << f.first << "\": " << vstr; - prefix = multiline ? ",\n" : ", "; + ss << prefix << indent2 << U"\"" << f.first << U"\": " << vstr; + prefix = multiline ? U",\n" : U", "; } - ss << (multiline ? "\n" : "") << indent << "}"; + ss << (multiline ? U"\n" : U"") << indent << U"}"; } } break; case Value::STRING: { - const std::string &str = static_cast(scratch.v.h)->value; + const String &str = static_cast(scratch.v.h)->value; ss << jsonnet_unparse_escape(str); } break; @@ -2235,7 +2234,7 @@ namespace { return ss.str(); } - std::string manifestString(const LocationRange &loc) + String manifestString(const LocationRange &loc) { if (scratch.t != Value::STRING) { std::stringstream ss; @@ -2258,7 +2257,7 @@ namespace { } auto *obj = static_cast(scratch.v.h); runInvariants(loc, obj); - std::map fields; + std::map fields; for (const auto &f : objectFields(obj, true)) { fields[f->name] = f; } @@ -2268,12 +2267,12 @@ namespace { stack.top().val = scratch; evaluate(body, stack.size()); auto vstr = string ? manifestString(body->location) - : manifestJson(body->location, true, ""); + : manifestJson(body->location, true, U""); // Reset scratch so that the object we're manifesting doesn't // get GC'd. 
scratch = stack.top().val; stack.pop(); - r[f.first] = vstr; + r[encode_utf8(f.first)] = encode_utf8(vstr); } return r; } @@ -2293,9 +2292,9 @@ std::string jsonnet_vm_execute(Allocator *alloc, const AST *ast, import_callback, ctx); vm.evaluate(ast, 0); if (string_output) { - return vm.manifestString(LocationRange("During manifestation")); + return encode_utf8(vm.manifestString(LocationRange("During manifestation"))); } else { - return vm.manifestJson(LocationRange("During manifestation"), true, ""); + return encode_utf8(vm.manifestJson(LocationRange("During manifestation"), true, U"")); } } diff --git a/test_suite/error.string.invalid_escape_unicode_ascii.jsonnet b/test_suite/error.string.invalid_escape_unicode_ascii.jsonnet deleted file mode 100644 index cdc0bad42..000000000 --- a/test_suite/error.string.invalid_escape_unicode_ascii.jsonnet +++ /dev/null @@ -1,17 +0,0 @@ -/* -Copyright 2015 Google Inc. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -"\u0100" diff --git a/test_suite/unicode.jsonnet b/test_suite/unicode.jsonnet new file mode 100644 index 000000000..926d0f40c --- /dev/null +++ b/test_suite/unicode.jsonnet @@ -0,0 +1,36 @@ +/* +Copyright 2015 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +std.assertEqual("Ā", "Ā") && + +std.assertEqual(std.length("Ā"), 1) && +std.assertEqual("Ā" + "Ā", "ĀĀ") && + +std.assertEqual("£7"[0], "£") && +std.assertEqual("£7"[1], "7") && + +local test_korean = "안녕 세상아!"; // Hello world! +std.assertEqual(std.length(test_korean), 7) && + +local test_russian = "ЁЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ ёйцукенгшщзхъфывапролджэячсмитьбю"; +std.assertEqual(std.length(test_russian), 67) && + +local test_chinese = "肉"; // Meat. +std.assertEqual(std.length(test_chinese), 1) && + +std.assertEqual("\u0100", "Ā") && + +true