Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support emoji encoding for Flux jobids #5174

Merged
merged 11 commits into from May 17, 2023
3 changes: 3 additions & 0 deletions doc/man1/flux-jobs.rst
Expand Up @@ -274,6 +274,9 @@ The field names that can be specified are:
**id.words**
job ID in mnemonic encoding

**id.emoji**
job ID in emoji encoding

**userid**
job submitter's userid

Expand Down
6 changes: 6 additions & 0 deletions src/bindings/python/flux/job/JobID.py
Expand Up @@ -45,6 +45,7 @@ class JobID(int):
- dotted hex (dothex) (xxxx.xxxx.xxxx.xxxx)
- kvs dir (dotted hex with `job.` prefix)
- RFC19 F58: (Base58 encoding with prefix `ƒ` or `f`)
- basemoji (emoji encoding)

A JobID object also has properties for encoding a JOBID into each
of the above representations, e.g. jobid.f58, jobid.words, jobid.dothex...
Expand Down Expand Up @@ -92,6 +93,11 @@ def words(self):
"""Return words (mnemonic) representation of a JobID"""
return self.encode("words")

@property
def emoji(self):
"""Return emoji representation of a JobID"""
return self.encode("emoji")

@property
def kvs(self):
"""Return KVS directory path of a JobID"""
Expand Down
2 changes: 2 additions & 0 deletions src/bindings/python/flux/job/info.py
Expand Up @@ -605,6 +605,7 @@ def job_fields_to_attrs(fields):
"id.dec": (),
"id.hex": (),
"id.f58": (),
"id.emoji": (),
"id.kvs": (),
"id.words": (),
"id.dothex": (),
Expand Down Expand Up @@ -697,6 +698,7 @@ class JobInfoFormat(flux.util.OutputFormat):
"id.dec": "JOBID",
"id.hex": "JOBID",
"id.f58": "JOBID",
"id.emoji": "JOBID",
"id.kvs": "JOBID",
"id.words": "JOBID",
"id.dothex": "JOBID",
Expand Down
2 changes: 2 additions & 0 deletions src/common/libjob/id.c
Expand Up @@ -88,6 +88,8 @@ int flux_job_id_encode (flux_jobid_t id,
t = FLUID_STRING_MNEMONIC;
else if (strcasecmp (type, "f58") == 0)
t = FLUID_STRING_F58;
else if (strcasecmp (type, "emoji") == 0)
t = FLUID_STRING_EMOJI;
else {
/* Return EPROTO for invalid type to differentiate from
* other invalid arguments.
Expand Down
4 changes: 4 additions & 0 deletions src/common/libjob/test/job.c
Expand Up @@ -369,6 +369,7 @@ struct jobid_parse_test jobid_parse_tests[] = {
{ "dothex", 0, "0000.0000.0000.0000" },
{ "kvs", 0, "job.0000.0000.0000.0000" },
{ "words", 0, "academy-academy-academy--academy-academy-academy" },
{ "emoji", 0, "😃" },
#if ASSUME_BROKEN_LOCALE
{ "f58", 0, "f1" },
#else
Expand All @@ -380,6 +381,7 @@ struct jobid_parse_test jobid_parse_tests[] = {
{ "dothex", 1, "0000.0000.0000.0001" },
{ "kvs", 1, "job.0000.0000.0000.0001" },
{ "words", 1, "acrobat-academy-academy--academy-academy-academy" },
{ "emoji", 1, "😄" },
#if ASSUME_BROKEN_LOCALE
{ "f58", 1, "f2" },
#else
Expand All @@ -391,6 +393,7 @@ struct jobid_parse_test jobid_parse_tests[] = {
{ "dothex", 65535, "0000.0000.0000.ffff" },
{ "kvs", 65535, "job.0000.0000.0000.ffff" },
{ "words", 65535, "nevada-archive-academy--academy-academy-academy" },
{ "emoji", 65535, "💁📚" },
#if ASSUME_BROKEN_LOCALE
{ "f58", 65535, "fLUv" },
#else
Expand All @@ -402,6 +405,7 @@ struct jobid_parse_test jobid_parse_tests[] = {
{ "dothex", 6787342413402046, "0018.1d0d.4d85.0fbe" },
{ "kvs", 6787342413402046, "job.0018.1d0d.4d85.0fbe" },
{ "words", 6787342413402046, "cake-plume-nepal--neuron-pencil-academy" },
{ "emoji", 6787342413402046, "👴😱🔚🎮🕙🚩" },
#if ASSUME_BROKEN_LOCALE
{ "f58", 6787342413402046, "fuzzybunny" },
#else
Expand Down
11 changes: 9 additions & 2 deletions src/common/libutil/Makefile.am
Expand Up @@ -98,7 +98,9 @@ libutil_la_SOURCES = \
slice.c \
slice.h \
strstrip.c \
strstrip.h
strstrip.h \
basemoji.h \
basemoji.c

EXTRA_DIST = veb_mach.c

Expand Down Expand Up @@ -132,7 +134,8 @@ TESTS = test_sha1.t \
test_strstrip.t \
test_slice.t \
test_timestamp.t \
test_environment.t
test_environment.t \
test_basemoji.t

test_ldadd = \
$(top_builddir)/src/common/libutil/libutil.la \
Expand Down Expand Up @@ -283,3 +286,7 @@ test_timestamp_t_LDADD = $(test_ldadd)
test_environment_t_SOURCES = test/environment.c
test_environment_t_CPPFLAGS = $(test_cppflags)
test_environment_t_LDADD = $(test_ldadd)

test_basemoji_t_SOURCES = test/basemoji.c
test_basemoji_t_CPPFLAGS = $(test_cppflags)
test_basemoji_t_LDADD = $(test_ldadd)
228 changes: 228 additions & 0 deletions src/common/libutil/basemoji.c
@@ -0,0 +1,228 @@
/************************************************************\
* Copyright 2023 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

/* basemoji.c - an emoji encoding for unsigned 64 bit integers
*/

#if HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>

#include "ccan/array_size/array_size.h"
#include "basemoji.h"

/* Minimum length of a b576 string is 1 emoji, or 4 bytes */
#define BASEMOJI_MINLEN 4

/* Maximum number of emoji "digits" in a basemoji string is
*
* ceil (ln (2^64-1)/ln (576)) = 7
*
* 4 bytes per emoji, so 4*7 = 28 bytes.
*/
#define BASEMOJI_MAXLEN 28

/* The following is a selection of 576 emoji in CLDR[1] collation order[2]
* taken from the version 2010 Unicode emoji set[3]. Note: Selected code
* points are all represented in 4 bytes, which is assumed in the
* implementation in this module. Additionally, every character in this
* selected set has a common first two bytes of F0 9F in UTF-8 encoding,
* which aids in detection of a valid basemoji string.
*
* The position of an emoji in this array is its digit value in the
* base 576 encoding: the encoder indexes the array by remainder and the
* decoder searches it to recover the index. Reordering, adding, or
* removing entries therefore changes the encoding of every id.
*
* 1. https://cldr.unicode.org
* 2. https://unicode.org/emoji/charts-12.1/emoji-ordering.txt
* 3. https://unicode.org/emoji/charts/emoji-versions.html
*
*/
const char *emojis[] = {
"😃", "😄", "😁", "😆", "😅", "😂", "😉", "😊", "😍", "😘", "😚", "😋",
"😜", "😝", "😏", "😒", "😌", "😔", "😪", "😷", "😵", "😲", "😳", "😨",
"😰", "😥", "😢", "😭", "😱", "😖", "😣", "😞", "😓", "😩", "😫", "😤",
"😡", "😠", "👿", "💀", "💩", "👹", "👺", "👻", "👽", "👾", "😺", "😸",
"😹", "😻", "😼", "😽", "🙀", "😿", "😾", "🙈", "🙉", "🙊", "💌", "💘",
"💝", "💖", "💗", "💓", "💞", "💕", "💟", "💔", "💛", "💚", "💙", "💜",
"💋", "💯", "💢", "💥", "💫", "💦", "💨", "💬", "💤", "👋", "👌", "👈",
"👉", "👆", "👇", "👍", "👎", "👊", "👏", "🙌", "👐", "🙏", "💅", "💪",
"👂", "👃", "👀", "👅", "👄", "👶", "👦", "👧", "👱", "👨", "👩", "👴",
"👵", "🙍", "🙎", "🙅", "🙆", "💁", "🙋", "🙇", "👮", "💂", "👷", "👸",
"👳", "👲", "👰", "👼", "🎅", "💆", "💇", "🚶", "🏃", "💃", "👯", "🏂",
"🏄", "🏊", "🛀", "👫", "💏", "💑", "👪", "👤", "👣", "🐵", "🐒", "🐶",
"🐩", "🐺", "🐱", "🐯", "🐴", "🐎", "🐮", "🐷", "🐗", "🐽", "🐑", "🐫",
"🐘", "🐭", "🐹", "🐰", "🐻", "🐨", "🐼", "🐾", "🐔", "🐣", "🐤", "🐥",
"🐦", "🐧", "🐸", "🐢", "🐍", "🐲", "🐳", "🐬", "🐟", "🐠", "🐡", "🐙",
"🐚", "🐌", "🐛", "🐜", "🐝", "🐞", "💐", "🌸", "💮", "🌹", "🌺", "🌻",
"🌼", "🌷", "🌱", "🌴", "🌵", "🌾", "🌿", "🍀", "🍁", "🍂", "🍃", "🍄",
"🍇", "🍈", "🍉", "🍊", "🍌", "🍍", "🍎", "🍏", "🍑", "🍒", "🍓", "🍅",
"🍆", "🌽", "🌰", "🍞", "🍖", "🍗", "🍔", "🍟", "🍕", "🍳", "🍲", "🍱",
"🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🍡",
"🍦", "🍧", "🍨", "🍩", "🍪", "🎂", "🍰", "🍫", "🍬", "🍭", "🍮", "🍯",
"🍵", "🍶", "🍷", "🍸", "🍹", "🍺", "🍻", "🍴", "🔪", "🌏", "🗾", "🌋",
"🗻", "🏠", "🏡", "🏢", "🏣", "🏥", "🏦", "🏨", "🏩", "🏪", "🏫", "🏬",
"🏭", "🏯", "🏰", "💒", "🗼", "🗽", "🌁", "🌃", "🌄", "🌅", "🌆", "🌇",
"🌉", "🎠", "🎡", "🎢", "💈", "🎪", "🚃", "🚄", "🚅", "🚇", "🚉", "🚌",
"🚑", "🚒", "🚓", "🚕", "🚗", "🚙", "🚚", "🚲", "🚏", "🚨", "🚥", "🚧",
"🚤", "🚢", "💺", "🚀", "🕛", "🕐", "🕑", "🕒", "🕓", "🕔", "🕕", "🕖",
"🕗", "🕘", "🕙", "🕚", "🌑", "🌓", "🌔", "🌕", "🌙", "🌛", "🌟", "🌠",
"🌌", "🌀", "🌈", "🌂", "🔥", "💧", "🌊", "🎃", "🎄", "🎆", "🎇", "🎈",
"🎉", "🎊", "🎋", "🎍", "🎎", "🎏", "🎐", "🎑", "🎀", "🎁", "🎫", "🏆",
"🏀", "🏈", "🎾", "🎳", "🎣", "🎽", "🎿", "🎯", "🔫", "🎱", "🔮", "🎮",
"🎰", "🎲", "🃏", "🀄", "🎴", "🎭", "🎨", "👓", "👔", "👕", "👖", "👗",
"👘", "👙", "👚", "👛", "👜", "👝", "🎒", "👞", "👟", "👠", "👡", "👢",
"👑", "👒", "🎩", "🎓", "💄", "💍", "💎", "🔊", "📢", "📣", "🔔", "🎼",
"🎵", "🎶", "🎤", "🎧", "📻", "🎷", "🎸", "🎹", "🎺", "🎻", "📱", "📲",
"📞", "📟", "📠", "🔋", "🔌", "💻", "💽", "💾", "💿", "📀", "🎥", "🎬",
"📺", "📷", "📹", "📼", "🔍", "🔎", "💡", "🔦", "🏮", "📔", "📕", "📖",
"📗", "📘", "📙", "📚", "📓", "📒", "📃", "📜", "📄", "📰", "📑", "🔖",
"💰", "💴", "💵", "💸", "💳", "💹", "📧", "📨", "📩", "📤", "📥", "📦",
"📫", "📪", "📮", "📝", "💼", "📁", "📂", "📅", "📆", "📇", "📈", "📉",
"📊", "📋", "📌", "📍", "📎", "📏", "📐", "🔒", "🔓", "🔏", "🔐", "🔑",
"🔨", "💣", "🔧", "🔩", "🔗", "📡", "💉", "💊", "🚪", "🚽", "🚬", "🗿",
"🏧", "🚹", "🚺", "🚻", "🚼", "🚾", "🚫", "🚭", "🔞", "🔃", "🔙", "🔚",
"🔛", "🔜", "🔝", "🔯", "🔼", "🔽", "🎦", "📶", "📳", "📴", "💱", "💲",
"🔱", "📛", "🔰", "🔟", "🔠", "🔡", "🔢", "🔣", "🔤", "🆎", "🆑", "🆒",
"🆓", "🆔", "🆕", "🆖", "🆗", "🆘", "🆙", "🆚", "🈁", "🈶", "🈯", "🉐",
"🈹", "🈚", "🈲", "🉑", "🈸", "🈴", "🈳", "🈺", "🈵", "🔴", "🔵", "🔶",
"🔷", "🔸", "🔹", "🔺", "🔻", "💠", "🔘", "🔳", "🔲", "🏁", "🚩", "🎌",
};

/* Return true if 's' could be a basemoji string.
 *
 * This is a cheap plausibility check, not a full validation: it only
 * verifies that the byte length is a multiple of 4 within
 * [BASEMOJI_MINLEN, BASEMOJI_MAXLEN] and that the string starts with the
 * two bytes (F0 9F) shared by every emoji in the table. Strings that pass
 * this test but contain emoji outside of the table are detected in decode.
 *
 * A NULL 's' is never a basemoji string and yields false.
 */
bool is_basemoji_string (const char *s)
{
    size_t len;

    /* This code assumes length of emoji array is 576
     * Generate error at build time if this becomes untrue:
     */
    BUILD_ASSERT(ARRAY_SIZE(emojis) == 576);

    if (s == NULL)
        return false;

    /* Keep the length as size_t: narrowing the result of strlen() to
     * int could make an oversized string alias a small, valid-looking
     * length and slip past the range check below.
     */
    len = strlen (s);

    if (len < BASEMOJI_MINLEN
        || len > BASEMOJI_MAXLEN
        || len % 4 != 0)
        return false;

    /* len >= 4 here, so s[0] and s[1] are within the string */
    return (uint8_t)s[0] == 0xf0 && (uint8_t)s[1] == 0x9f;
}

/* Encode 'id' into 'buf' as base 576 emoji digits in reverse order:
 * progressive division yields the least significant digit first, so
 * that digit lands at the start of 'buf' and the most significant digit
 * at the end. The caller is expected to reverse the 4-byte groups.
 *
 * 'buf' is zeroed before encoding. No more than 'buflen' bytes are
 * ever written.
 *
 * Returns the number of bytes written (a multiple of 4, at least 4 since
 * id 0 is encoded as the single digit emojis[0]), or -1 if 'buf' is NULL
 * or too small to hold the encoding.
 */
static int emoji_revenc (char *buf, int buflen, uint64_t id)
{
    int index = 0;

    if (buf == NULL || buflen < 4)
        return -1;

    memset (buf, 0, buflen);

    /* do/while so that id == 0 still emits one digit */
    do {
        if (index + 4 > buflen)
            return -1;
        memcpy (buf + index, emojis[id % 576], 4);
        index += 4;
        id /= 576;
    } while (id > 0);

    return index;
}

int uint64_basemoji_encode (uint64_t id, char *buf, int buflen)
{
int count;
int n;
char reverse[BASEMOJI_MAXLEN+1];

if (buf == NULL || buflen <= 0) {
errno = EINVAL;
return -1;
}

/* Encode bytes to emoji (in reverse), which also gives us a count
* of the total bytes required for this encoding.
*/
if ((count = emoji_revenc (reverse, sizeof (reverse), id)) < 0) {
errno = EINVAL;
return -1;

Check warning on line 162 in src/common/libutil/basemoji.c

View check run for this annotation

Codecov / codecov/patch

src/common/libutil/basemoji.c#L161-L162

Added lines #L161 - L162 were not covered by tests
}

/* Check for overflow of provided buffer:
* Need space for count bytes for emoji + NUL
*/
if (count + 1 > buflen) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parens around (count + 1)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, but operator precedence puts + before > so what is the need here? Just preference?

Copy link
Member

@garlick garlick May 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I would not add parens there (precedence is clear).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok no prob, I tend to add parens when there's more than "1 thingie" on one side of the gt or lt.

errno = EOVERFLOW;
return -1;
}

memset (buf, 0, buflen);
n = 0;

/* Copy 4-byte emojis back in order so that most significant bits are
* on the left:
*/
for (int i = count - 4; i >= 0; i-=4) {
memcpy (buf+n, reverse+i, 4);
n+=4;
}
return 0;
}


/* Search the emoji table for the 4 bytes at 'c'.
 *
 * On a match, store the table index (the base 576 digit value) in
 * '*result' and return 0. If no table entry matches, return -1 with
 * errno set to EINVAL and leave '*result' untouched.
 *
 * Exactly 4 bytes are read from 'c'.
 */
static int basemoji_lookup (const char *c, int *result)
{
    int pos = 0;

    /* Linear scan: stop at the first entry whose 4 bytes match */
    while (pos < 576 && memcmp (c, emojis[pos], 4) != 0)
        pos++;

    if (pos == 576) {
        errno = EINVAL;
        return -1;
    }
    *result = pos;
    return 0;
}

/* Decode basemoji string 'str' into an unsigned 64 bit integer.
 *
 * On success the value is stored in '*idp' and 0 is returned.
 * On failure -1 is returned with errno set to EINVAL and '*idp' is left
 * untouched. Failure cases are:
 *  - 'str' or 'idp' is NULL
 *  - 'str' does not pass is_basemoji_string()
 *  - 'str' contains a 4-byte sequence that is not in the emoji table
 *  - the encoded value does not fit in 64 bits
 */
int uint64_basemoji_decode (const char *str, uint64_t *idp)
{
    uint64_t id = 0;
    uint64_t scale = 1;
    int len;

    if (str == NULL
        || idp == NULL
        || !is_basemoji_string (str)) {
        errno = EINVAL;
        return -1;
    }

    /* Move through basemoji string in reverse since least significant
     * bits are at the end. Since all emoji are 4 bytes, start at 4 from
     * the end to point to the final emoji.
     */
    len = strlen (str);
    for (int i = len - 4; i >= 0; i -= 4) {
        int digit;

        if (basemoji_lookup (str + i, &digit) < 0) {
            errno = EINVAL;
            return -1;
        }
        /* A maximum length string can represent values up to 576^7 - 1,
         * which is larger than UINT64_MAX. Reject such strings instead of
         * letting the sum wrap around, otherwise distinct strings would
         * silently decode to the same id.
         *
         * is_basemoji_string() limits the input to 7 digits, so 'scale'
         * is at most 576^6 whenever it is used here and cannot itself
         * have wrapped.
         */
        if ((uint64_t)digit > (UINT64_MAX - id) / scale) {
            errno = EINVAL;
            return -1;
        }
        id += (uint64_t)digit * scale;
        scale *= 576;
    }
    *idp = id;
    return 0;
}
46 changes: 46 additions & 0 deletions src/common/libutil/basemoji.h
@@ -0,0 +1,46 @@
/************************************************************\
* Copyright 2023 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

#ifndef _UTIL_BASEMOJI_H
#define _UTIL_BASEMOJI_H

#include <stdint.h>
#include <stdbool.h>

/* basemoji - an implementation of the RFC 19 FLUID emoji encoding
*/

/* Convert a 64 bit unsigned integer to basemoji, placing the result
* in buffer 'buf' of size 'buflen'.
*
* Returns 0 on success, -1 on failure with errno set:
* EINVAL: Invalid arguments
* EOVERFLOW: buffer too small for encoded string
*/
int uint64_basemoji_encode (uint64_t id, char *buf, int buflen);

/* Decode a string in basemoji to an unsigned 64 bit integer.
*
* Returns 0 on success, -1 on failure with errno set:
* EINVAL: Invalid arguments
*/
int uint64_basemoji_decode (const char *str, uint64_t *idp);

/* Return true if 's' could be a basemoji string, i.e. it falls
* within the minimum and maximum lengths, and starts with the
* expected bytes.
*/
bool is_basemoji_string (const char *s);

#endif /* !_UTIL_BASEMOJI_H */

/*
* vi:tabstop=4 shiftwidth=4 expandtab
*/