Skip to content

Commit

Permalink
Add new AtU8 beam chunk
Browse files Browse the repository at this point in the history
The new chunk stores atoms encoded in UTF-8.

beam_lib has also been modified to handle the new
'utf8_atoms' attribute while the 'atoms' attribute
may be a missing chunk from now on.

The binary_to_atom/2 BIF can now encode any utf8
binary with up to 255 characters.

The list_to_atom/1 BIF can now accept codepoints
higher than 255 with up to 255 characters (thanks
to Björn Gustavsson).
  • Loading branch information
José Valim committed Jan 13, 2017
1 parent 6c7539b commit d14cae9
Show file tree
Hide file tree
Showing 19 changed files with 369 additions and 168 deletions.
35 changes: 24 additions & 11 deletions erts/emulator/beam/atom.c
Expand Up @@ -233,10 +233,10 @@ static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
}

/*
* erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
* erts_atom_put_index() may fail. Returns negative indexes for errors.
*/
Eterm
erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
int
erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
const byte *text = name;
Expand All @@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = 0;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}

switch (enc) {
Expand All @@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}
#ifdef DEBUG
for (aix = 0; aix < len; aix++) {
Expand All @@ -276,15 +276,15 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
}
no_latin1_chars = tlen;
latin1_to_utf8(utf8_copy, &text, &tlen);
break;
case ERTS_ATOM_ENC_UTF8:
/* First sanity check; need to verify later */
if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
break;
}

Expand All @@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_read_unlock();
if (aix >= 0) {
/* Already in table no need to verify it */
return make_atom(aix);
return aix;
}

if (enc == ERTS_ATOM_ENC_UTF8) {
Expand All @@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
case ERTS_UTF8_OK_MAX_CHARS:
/* Truncated... */
if (!trunc)
return THE_NON_VALUE;
return ATOM_MAX_CHARS_ERROR;
ASSERT(no_chars == MAX_ATOM_CHARACTERS);
tlen = err_pos - text;
break;
default:
/* Bad utf8... */
return THE_NON_VALUE;
return ATOM_BAD_ENCODING_ERROR;
}
}

Expand All @@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_write_lock();
aix = index_put(&erts_atom_table, (void*) &a);
atom_write_unlock();
return make_atom(aix);
return aix;
}

/*
* erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
*/
Eterm
erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
int aix = erts_atom_put_index(name, len, enc, trunc);
if (aix >= 0)
return make_atom(aix);
else
return THE_NON_VALUE;
}

Eterm
Expand Down
3 changes: 3 additions & 0 deletions erts/emulator/beam/atom.h
Expand Up @@ -29,6 +29,8 @@
#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
#define ATOM_LIMIT (1024*1024)
#define MIN_ATOM_TABLE_SIZE 8192
#define ATOM_BAD_ENCODING_ERROR -1
#define ATOM_MAX_CHARS_ERROR -2

#ifndef ARCH_32
/* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an
Expand Down Expand Up @@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */

Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
void init_atom_table(void);
void atom_info(fmtfn_t, void *);
void dump_atoms(fmtfn_t, void *);
Expand Down
66 changes: 48 additions & 18 deletions erts/emulator/beam/beam_load.c
Expand Up @@ -157,13 +157,15 @@ typedef struct {
#define STR_CHUNK 2
#define IMP_CHUNK 3
#define EXP_CHUNK 4
#define NUM_MANDATORY 5
#define MIN_MANDATORY 1
#define MAX_MANDATORY 5

#define LAMBDA_CHUNK 5
#define LITERAL_CHUNK 6
#define ATTR_CHUNK 7
#define COMPILE_CHUNK 8
#define LINE_CHUNK 9
#define UTF8_ATOM_CHUNK 10

#define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))

Expand All @@ -173,9 +175,13 @@ typedef struct {

static Uint chunk_types[] = {
/*
* Mandatory chunk types -- these MUST be present.
* Atom chunk types -- Atom or AtU8 MUST be present.
*/
MakeIffId('A', 't', 'o', 'm'), /* 0 */

/*
* Mandatory chunk types -- these MUST be present.
*/
MakeIffId('C', 'o', 'd', 'e'), /* 1 */
MakeIffId('S', 't', 'r', 'T'), /* 2 */
MakeIffId('I', 'm', 'p', 'T'), /* 3 */
Expand All @@ -189,6 +195,7 @@ static Uint chunk_types[] = {
MakeIffId('A', 't', 't', 'r'), /* 7 */
MakeIffId('C', 'I', 'n', 'f'), /* 8 */
MakeIffId('L', 'i', 'n', 'e'), /* 9 */
MakeIffId('A', 't', 'U', '8'), /* 10 */
};

/*
Expand Down Expand Up @@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
#endif
static int init_iff_file(LoaderState* stp, byte* code, Uint size);
static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
Uint num_types, Uint num_mandatory);
Uint num_types);
static int verify_chunks(LoaderState* stp);
static int load_atom_table(LoaderState* stp);
static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
static int load_import_table(LoaderState* stp);
static int read_export_table(LoaderState* stp);
static int is_bif(Eterm mod, Eterm func, unsigned arity);
Expand Down Expand Up @@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
CHKALLOC();
CHKBLK(ERTS_ALC_T_CODE,stp->code);
if (!init_iff_file(stp, code, unloaded_size) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto load_error;
}
Expand Down Expand Up @@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
*/

CHKBLK(ERTS_ALC_T_CODE,stp->code);
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto load_error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto load_error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto load_error;
}
}

/*
Expand Down Expand Up @@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
* Scan the IFF file. The header should have been verified by init_iff_file().
*/
static int
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
{
Uint count;
Uint id;
Expand Down Expand Up @@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp)
MD5_CTX context;

MD5Init(&context);
for (i = 0; i < NUM_MANDATORY; i++) {

if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
} else if (stp->chunks[ATOM_CHUNK].start != NULL) {
MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
} else {
LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
}

for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
if (stp->chunks[i].start != NULL) {
MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
} else {
Expand Down Expand Up @@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp)
}

static int
load_atom_table(LoaderState* stp)
load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
{
unsigned int i;

Expand All @@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp)

GetByte(stp, n);
GetString(stp, atom, n);
stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
stp->atom[i] = erts_atom_put(atom, n, enc, 1);
}

/*
Expand Down Expand Up @@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2)
goto error;
}
if (!init_iff_file(stp, start, binary_size(Bin)) ||
!scan_iff_file(stp, &chunk, 1, 1) ||
!scan_iff_file(stp, &chunk, 1) ||
stp->chunks[0].start == NULL) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1)
}
stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
res = am_undefined;
goto done;
Expand Down Expand Up @@ -6335,17 +6358,24 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
if (!init_iff_file(stp, bytes, size)) {
goto error;
}
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto error;
}
define_file(stp, "code chunk header", CODE_CHUNK);
if (!read_code_header(stp)) {
goto error;
}
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp)) {
goto error;
if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
goto error;
}
} else {
define_file(stp, "atom table", ATOM_CHUNK);
if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
goto error;
}
}
define_file(stp, "export table", EXP_CHUNK);
if (!stub_read_export_table(stp)) {
Expand Down
14 changes: 7 additions & 7 deletions erts/emulator/beam/bif.c
Expand Up @@ -3022,8 +3022,8 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
{
Eterm res;
char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
Sint i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);

if (i < 0) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
Expand All @@ -3033,7 +3033,7 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
}
BIF_ERROR(BIF_P, BADARG);
}
res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
res = erts_atom_put(buf, i, ERTS_ATOM_ENC_UTF8, 1);
ASSERT(is_atom(res));
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(res);
Expand All @@ -3043,17 +3043,17 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)

BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
{
Sint i;
char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);

if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
if (i < 0) {
error:
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_ERROR(BIF_P, BADARG);
} else {
Eterm a;

if (erts_atom_get(buf, i, &a, ERTS_ATOM_ENC_LATIN1)) {
if (erts_atom_get((char *) buf, i, &a, ERTS_ATOM_ENC_UTF8)) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(a);
} else {
Expand Down

0 comments on commit d14cae9

Please sign in to comment.