Skip to content

Commit

Permalink
[LibOS,PAL] Fix wrong handling of sysfs NUMA nodes when nodes are off…
Browse files Browse the repository at this point in the history
…line

Previously, Gramine iterated through all NUMA nodes within the range
obtained from the host's `/sys/devices/system/node/possible` file to get
`cpulist` and `distance`. It also used the count of possible nodes to
parse the `distance` between nodes. However, there is no
`/sys/devices/system/node/nodeX` directory when a node is not online.
Also, `/sys/devices/system/node/nodeX/distance` lists distances between
online nodes only (jumping over offline nodes). This can lead to
failures when Gramine tries to read values of `cpulist` and `distance`
pseudo-files from the directory. Plus a correct parsing of `distance`
should consider only online nodes.

Co-authored-by: Kailun Qin <kailun.qin@intel.com>
Co-authored-by: Sankaranarayanan Venkatasubramanian <sankaranarayanan.venkatasubramanian@intel.com>
Signed-off-by: Sankaranarayanan Venkatasubramanian <sankaranarayanan.venkatasubramanian@intel.com>
Signed-off-by: Kailun Qin <kailun.qin@intel.com>
Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
  • Loading branch information
3 people committed Sep 13, 2023
1 parent 058997e commit d12f96e
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 23 deletions.
8 changes: 6 additions & 2 deletions libos/src/fs/sys/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ bool sys_resource_name_exists(struct libos_dentry* parent, const char* name) {
if (pseudo_parse_ulong(&name[prefix_len], total - 1, &n) < 0)
return false;

if (!strcmp(parent->name, "node") && !g_pal_public_state->topo_info.numa_nodes[n].is_online)
return false;

return true;
}

Expand All @@ -198,6 +201,9 @@ int sys_resource_list_names(struct libos_dentry* parent, readdir_callback_t call
/* Generate "{prefix}N" names for all N less than total */

for (size_t i = 0; i < total; i++) {
if (!strcmp(parent->name, "node") && !g_pal_public_state->topo_info.numa_nodes[i].is_online)
continue;

char ent_name[strlen(prefix) + 22];
snprintf(ent_name, sizeof(ent_name), "%s%zu", prefix, i);
int ret = callback(ent_name, arg);
Expand Down Expand Up @@ -283,8 +289,6 @@ static void init_node_dir(struct pseudo_node* node) {
pseudo_add_str(nodeX, "distance", &sys_node_load);
pseudo_add_str(nodeX, "meminfo", &sys_node_meminfo_load);

// TODO(mkow): Does this show up for offline nodes? I never succeeded in shutting down one, even
// after shutting down all CPUs inside the node it shows up as online on `node/online` list.
struct pseudo_node* hugepages = pseudo_add_dir(nodeX, "hugepages");
struct pseudo_node* hugepages_2m = pseudo_add_dir(hugepages, "hugepages-2048kB");
pseudo_add_str(hugepages_2m, "nr_hugepages", &sys_node_load);
Expand Down
19 changes: 17 additions & 2 deletions libos/src/fs/sys/node_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,31 @@ int sys_node_load(struct libos_dentry* dent, char** out_data, size_t* out_size)
const char* name = dent->name;
const struct pal_topo_info* topo = &g_pal_public_state->topo_info;
const struct pal_numa_node_info* numa_node = &topo->numa_nodes[node_id];
if (!numa_node->is_online)
return -ENOENT;

char str[PAL_SYSFS_MAP_FILESZ] = {0};
if (strcmp(name, "cpumap") == 0) {
ret = sys_print_as_bitmask(str, sizeof(str), topo->threads_cnt, is_in_same_node, &node_id);
} else if (strcmp(name, "distance") == 0) {
/* Linux reflects only online nodes in the `distance` file, do the same */
size_t* distances = topo->numa_distance_matrix + node_id * topo->numa_nodes_cnt;
size_t str_pos = 0;
for (size_t i = 0; i < topo->numa_nodes_cnt; i++) {
ret = snprintf(str + str_pos, sizeof(str) - str_pos,
"%zu%s", distances[i], i != (topo->numa_nodes_cnt - 1) ? " " : "\n");
if (!topo->numa_nodes[i].is_online)
continue;
assert(distances[i]);
ret = snprintf(str + str_pos, sizeof(str) - str_pos, "%s%zu", str_pos ? " " : "",
distances[i]);
if (ret < 0)
return ret;
if ((size_t)ret >= sizeof(str) - str_pos)
return -EOVERFLOW;
str_pos += ret;
}
if (str_pos >= sizeof(str) - 1)
return -EOVERFLOW;
str[str_pos] = '\n';
} else if (strcmp(name, "nr_hugepages") == 0) {
const char* parent_name = dent->parent->name;
if (strcmp(parent_name, "hugepages-2048kB") == 0) {
Expand Down Expand Up @@ -115,6 +125,11 @@ int sys_node_meminfo_load(struct libos_dentry* dent, char** out_data, size_t* ou
if (ret < 0)
return ret;

const struct pal_topo_info* topo = &g_pal_public_state->topo_info;
const struct pal_numa_node_info* numa_node = &topo->numa_nodes[node_id];
if (!numa_node->is_online)
return -ENOENT;

size_t size = 0, max = 256;
char* str = malloc(max);
if (!str)
Expand Down
3 changes: 2 additions & 1 deletion pal/include/arch/x86_64/pal_topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ struct pal_topo_info {
struct pal_numa_node_info* numa_nodes;

/* Has `numa_nodes_cnt * numa_nodes_cnt` elements.
* numa_distance_matrix[i*numa_nodes_cnt + j] is NUMA distance from node i to node j. */
* numa_distance_matrix[i*numa_nodes_cnt + j] is NUMA distance from node i to node j.
* If node i or node j is offline, then NUMA distance is zero. */
size_t* numa_distance_matrix;
};
66 changes: 50 additions & 16 deletions pal/src/host/linux-common/topo_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,11 @@ static int get_hw_resource_value(const char* filename, size_t* out_value) {
return 0;
}

/* Read a space-separated list of numbers.
* The file has to contain at least `count` numbers, otherwise this function returns `-EINVAL`. */
static int read_numbers_from_file(const char* path, size_t* out_arr, size_t count) {
/* Read a space-separated list of numbers in the format used by
* `/sys/devices/system/node/node<i>/distance`, and write the result to the online nodes from
* `numa_nodes`. */
static int read_distances_from_file(const char* path, size_t* out_arr,
struct pal_numa_node_info* numa_nodes, size_t nodes_cnt) {
char buf[PAL_SYSFS_BUF_FILESZ];
int ret = read_file_buffer(path, buf, sizeof(buf) - 1);
if (ret < 0)
Expand All @@ -71,18 +73,26 @@ static int read_numbers_from_file(const char* path, size_t* out_arr, size_t coun

const char* buf_it = buf;
const char* end;
for (size_t i = 0; i < count; i++) {
char last_separator = ' ';
size_t node_i = 0;
for (size_t input_i = 0; /* no condition */; input_i++) {
/* Find next online node (only these are listed in `distance` file). */
while (node_i < nodes_cnt && !numa_nodes[node_i].is_online)
node_i++;
if (node_i == nodes_cnt)
break;
if (last_separator != ' ')
return -EINVAL;

unsigned long val;
ret = str_to_ulong(buf_it, 10, &val, &end);
if (ret < 0)
return -EINVAL;
char expected_separator = (i != count - 1) ? ' ' : '\n';
if (*end != expected_separator)
return -EINVAL;
buf_it = end + 1;
out_arr[i] = (size_t)val;
last_separator = *end;
buf_it = *end ? end + 1 : end; // don't shift past NULL-byte, possible for malformed inputs
out_arr[node_i++] = (size_t)val;
}
return 0;
return last_separator == '\n' ? 0 : -EINVAL;
}

static int iterate_ranges_from_file(const char* path, int (*callback)(size_t index, void* arg),
Expand Down Expand Up @@ -358,6 +368,9 @@ int get_topology_info(struct pal_topo_info* topo_info) {
}

for (size_t i = 0; i < nodes_cnt; i++) {
if (!numa_nodes[i].is_online)
continue;

snprintf(path, sizeof(path), "/sys/devices/system/node/node%zu/cpulist", i);
ret = iterate_ranges_from_file(path, set_node_id, &(struct set_node_id_args){
.threads = threads,
Expand All @@ -367,17 +380,38 @@ int get_topology_info(struct pal_topo_info* topo_info) {
if (ret < 0)
goto fail;

/* Since our sysfs doesn't support writes, set persistent hugepages to their default value
* of zero */
numa_nodes[i].nr_hugepages[HUGEPAGES_2M] = 0;
numa_nodes[i].nr_hugepages[HUGEPAGES_1G] = 0;
}

/*
* Linux kernel reflects only online nodes in the `distances` array. E.g. if a system has node 0
* online, node 1 offline and node 2 online, then distances matrix in Linux will look like this:
*
* [ node 0 -> node 0, node 0 -> node 2
* node 2 -> node 0, node 2 -> node 2 ]
*
* Gramine has a different view of the `distances` array -- it includes both online nodes and
* offline nodes (distances to offline nodes are 0). Thus, the above system will look like this:
*
* [ node 0 -> node 0, 0 , node 0 -> node 2
* 0 , 0 , 0
* node 2 -> node 0, 0 , node 2 -> node 2 ]
*/
memset(distances, 0, nodes_cnt * nodes_cnt * sizeof(*distances));
for (size_t i = 0; i < nodes_cnt; i++) {
if (!numa_nodes[i].is_online)
continue;

/* populate row i of `distances`, setting only online nodes */
ret = snprintf(path, sizeof(path), "/sys/devices/system/node/node%zu/distance", i);
if (ret < 0)
goto fail;
ret = read_numbers_from_file(path, distances + i * nodes_cnt, nodes_cnt);
ret = read_distances_from_file(path, distances + i * nodes_cnt, numa_nodes, nodes_cnt);
if (ret < 0)
goto fail;

/* Since our sysfs doesn't support writes, set persistent hugepages to their default value
* of zero */
numa_nodes[i].nr_hugepages[HUGEPAGES_2M] = 0;
numa_nodes[i].nr_hugepages[HUGEPAGES_1G] = 0;
}

for (size_t i = 0; i < threads_cnt; i++) {
Expand Down
19 changes: 17 additions & 2 deletions pal/src/host/linux-sgx/pal_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,14 +201,29 @@ static int sanitize_topo_info(struct pal_topo_info* topo_info) {
}

for (size_t i = 0; i < topo_info->numa_nodes_cnt; i++) {
/* Note: Linux doesn't guarantee that distance i -> i is 0, so we aren't checking this (it's
* actually non-zero on all machines we have). */
/* Note: distance i -> i is 10 according to the ACPI 2.0 SLIT spec, but to accomodate for
* weird BIOS settings we aren't checking this. */
for (size_t j = 0; j < topo_info->numa_nodes_cnt; j++) {
if ((!topo_info->numa_nodes[i].is_online || !topo_info->numa_nodes[j].is_online) &&
topo_info->numa_distance_matrix[i*topo_info->numa_nodes_cnt + j] != 0)
return -PAL_ERROR_INVAL;
if ( topo_info->numa_distance_matrix[i*topo_info->numa_nodes_cnt + j]
!= topo_info->numa_distance_matrix[j*topo_info->numa_nodes_cnt + i])
return -PAL_ERROR_INVAL;
}
}

/* Verify that online threads belong to online NUMA nodes (at this point, all indices are
* verified to be in-bounds and we can safely use them) */
for (size_t i = 0; i < topo_info->threads_cnt; i++) {
struct pal_cpu_thread_info* thread = &topo_info->threads[i];
if (!thread->is_online)
continue;
size_t node_id = topo_info->cores[thread->core_id].node_id;
if (!topo_info->numa_nodes[node_id].is_online)
return -PAL_ERROR_INVAL;
}

return 0;
}

Expand Down

0 comments on commit d12f96e

Please sign in to comment.