From 01c791efd0e2f167d4723e0ee047df3acac85eac Mon Sep 17 00:00:00 2001
From: Scott Fudally
Date: Mon, 17 Nov 2025 06:58:28 -0800
Subject: [PATCH 1/6] DGX Spark: UMA support

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 114 ++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 7d792e60cf9c5..f8e50d24066ba 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3744,10 +3744,124 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
     return ctx->description.c_str();
 }
 
+#if defined(__linux__)
+// Helper function to get available memory from /proc/meminfo for UMA systems
+static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb, long * freeSwapKb) {
+    FILE * meminfoFile = nullptr;
+    // 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough
+    const size_t BUFFER_SIZE = 2048;
+    auto fileBuffer = std::make_unique<char[]>(BUFFER_SIZE);
+    size_t bytesRead = 0;
+    long hugeTlbTotalPages = -1;
+    long hugeTlbFreePages = -1;
+    long hugeTlbPageSize = -1;
+
+    if (availableMemoryKb == nullptr || freeSwapKb == nullptr) {
+        return false;
+    }
+
+    meminfoFile = fopen("/proc/meminfo", "r");
+    if (meminfoFile == nullptr) {
+        GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__);
+        return false;
+    }
+
+    // Read file into buffer
+    bytesRead = fread(fileBuffer.get(), 1, BUFFER_SIZE - 1, meminfoFile);
+    fclose(meminfoFile);
+
+    if (bytesRead == 0) {
+        GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__);
+        return false;
+    }
+    fileBuffer[bytesRead] = '\0';
+
+    *availableMemoryKb = -1;
+    *freeSwapKb = -1;
+
+    // Parse the file buffer line by line
+    char * line = fileBuffer.get();
+    char * nextLine;
+    while (line < fileBuffer.get() + bytesRead) {
+        // Find the end of the current line
+        nextLine = strchr(line, '\n');
+        if (nextLine != NULL) {
+            *nextLine = '\0';
+            nextLine++;
+        } else {
+            nextLine = fileBuffer.get() + bytesRead;
+        }
+
+        long value;
+        if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
+            *availableMemoryKb = value;
+        } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
+            *freeSwapKb = value;
+        } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
+            hugeTlbTotalPages = value;
+        } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
+            hugeTlbFreePages = value;
+        } else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) {
+            hugeTlbPageSize = value;
+        }
+
+        line = nextLine;
+    }
+
+    GGML_LOG_DEBUG("%s: hugeTlbTotalPages: %ld\n", __func__, hugeTlbTotalPages);
+    GGML_LOG_DEBUG("%s: hugeTlbFreePages: %ld\n", __func__, hugeTlbFreePages);
+    GGML_LOG_DEBUG("%s: hugeTlbPageSize: %ld\n", __func__, hugeTlbPageSize);
+    GGML_LOG_DEBUG("%s: availableMemoryKb: %ld\n", __func__, *availableMemoryKb);
+    GGML_LOG_DEBUG("%s: freeSwapKb: %ld\n", __func__, *freeSwapKb);
+
+    if (hugeTlbTotalPages != 0 && hugeTlbTotalPages != -1) {
+        *availableMemoryKb = hugeTlbFreePages * hugeTlbPageSize;
+
+        // Hugetlbfs pages are not swappable.
+        *freeSwapKb = 0;
+    }
+
+    GGML_LOG_DEBUG("%s: final availableMemoryKb: %ld\n", __func__, *availableMemoryKb);
+    return true;
+}
+#endif // defined(__linux__)
+
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemGetInfo(free, total));
+
+#if defined(__linux__)
+    // Check if this is a UMA (Unified Memory Architecture) system
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+
+    GGML_LOG_DEBUG("%s: prop.name: %s\n", __func__, prop.name);
+    GGML_LOG_DEBUG("%s: prop.unifiedAddressing: %d\n", __func__, prop.unifiedAddressing);
+
+    // Check if UMA is explicitly enabled via environment variable
+    const char * uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY");
+
+    bool is_uma = prop.unifiedAddressing > 0 || (uma_env != nullptr && uma_env[0] == '1');
+
+    GGML_LOG_DEBUG("%s: is_uma: %d\n", __func__, is_uma);
+
+    if (is_uma) {
+        // For UMA systems (like DGX Spark), use system memory info
+        long availableMemoryKb = 0;
+        long freeSwapKb = 0;
+
+        if (ggml_backend_cuda_get_available_uma_memory(&availableMemoryKb, &freeSwapKb) && availableMemoryKb > 0) {
+            *free = (size_t)availableMemoryKb * 1024;
+        } else {
+            GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__);
+        }
+    }
+
+    GGML_LOG_DEBUG("%s: free: %8.2f MB\n", __func__, *free / (1024.0 * 1024.0));
+    GGML_LOG_DEBUG("%s: total: %8.2f MB\n", __func__, *total / (1024.0 * 1024.0));
+#endif // defined(__linux__)
+
 }
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {

From 885d972f03b210f518c708434f59640466b0cbd5 Mon Sep 17 00:00:00 2001
From: Scott Fudally
Date: Wed, 19 Nov 2025 07:48:53 -0800
Subject: [PATCH 2/6] Updates from PR feedback

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 73 +++++++++++++--------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f8e50d24066ba..773e64c6d8d29 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3747,49 +3747,49 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
 #if defined(__linux__)
 // Helper function to get available memory from /proc/meminfo for UMA systems
 static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb, long * freeSwapKb) {
-    FILE * meminfoFile = nullptr;
+    FILE * meminfo_file = nullptr;
     // 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough
     const size_t BUFFER_SIZE = 2048;
-    auto fileBuffer = std::make_unique<char[]>(BUFFER_SIZE);
-    size_t bytesRead = 0;
-    long hugeTlbTotalPages = -1;
-    long hugeTlbFreePages = -1;
-    long hugeTlbPageSize = -1;
+    auto file_buffer = std::make_unique<char[]>(BUFFER_SIZE);
+    size_t bytes_read = 0;
+    long huge_tlb_total_pages = -1;
+    long huge_tlb_free_pages = -1;
+    long huge_tlb_page_size = -1;
 
     if (availableMemoryKb == nullptr || freeSwapKb == nullptr) {
         return false;
     }
 
-    meminfoFile = fopen("/proc/meminfo", "r");
-    if (meminfoFile == nullptr) {
+    meminfo_file = fopen("/proc/meminfo", "r");
+    if (meminfo_file == nullptr) {
         GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__);
         return false;
     }
 
     // Read file into buffer
-    bytesRead = fread(fileBuffer.get(), 1, BUFFER_SIZE - 1, meminfoFile);
-    fclose(meminfoFile);
+    bytes_read = fread(file_buffer.get(), 1, BUFFER_SIZE - 1, meminfo_file);
+    fclose(meminfo_file);
 
-    if (bytesRead == 0) {
+    if (bytes_read == 0) {
         GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__);
         return false;
     }
-    fileBuffer[bytesRead] = '\0';
+    file_buffer[bytes_read] = '\0';
 
     *availableMemoryKb = -1;
     *freeSwapKb = -1;
 
     // Parse the file buffer line by line
-    char * line = fileBuffer.get();
+    char * line = file_buffer.get();
     char * nextLine;
-    while (line < fileBuffer.get() + bytesRead) {
+    while (line < file_buffer.get() + bytes_read) {
         // Find the end of the current line
         nextLine = strchr(line, '\n');
-        if (nextLine != NULL) {
+        if (nextLine != nullptr) {
             *nextLine = '\0';
             nextLine++;
         } else {
-            nextLine = fileBuffer.get() + bytesRead;
+            nextLine = file_buffer.get() + bytes_read;
         }
 
         long value;
@@ -3798,24 +3798,18 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb,
         } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
             *freeSwapKb = value;
         } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
-            hugeTlbTotalPages = value;
+            huge_tlb_total_pages = value;
         } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
-            hugeTlbFreePages = value;
+            huge_tlb_free_pages = value;
         } else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) {
-            hugeTlbPageSize = value;
+            huge_tlb_page_size = value;
         }
 
         line = nextLine;
     }
 
-    GGML_LOG_DEBUG("%s: hugeTlbTotalPages: %ld\n", __func__, hugeTlbTotalPages);
-    GGML_LOG_DEBUG("%s: hugeTlbFreePages: %ld\n", __func__, hugeTlbFreePages);
-    GGML_LOG_DEBUG("%s: hugeTlbPageSize: %ld\n", __func__, hugeTlbPageSize);
-    GGML_LOG_DEBUG("%s: availableMemoryKb: %ld\n", __func__, *availableMemoryKb);
-    GGML_LOG_DEBUG("%s: freeSwapKb: %ld\n", __func__, *freeSwapKb);
-
-    if (hugeTlbTotalPages != 0 && hugeTlbTotalPages != -1) {
-        *availableMemoryKb = hugeTlbFreePages * hugeTlbPageSize;
+    if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) {
+        *availableMemoryKb = huge_tlb_free_pages * huge_tlb_page_size;
 
         // Hugetlbfs pages are not swappable.
         *freeSwapKb = 0;
@@ -3835,31 +3829,22 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
     // Check if this is a UMA (Unified Memory Architecture) system
     cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
-
-    GGML_LOG_DEBUG("%s: prop.name: %s\n", __func__, prop.name);
-    GGML_LOG_DEBUG("%s: prop.unifiedAddressing: %d\n", __func__, prop.unifiedAddressing);
-
+    
     // Check if UMA is explicitly enabled via environment variable
-    const char * uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY");
-
-    bool is_uma = prop.unifiedAddressing > 0 || (uma_env != nullptr && uma_env[0] == '1');
-
-    GGML_LOG_DEBUG("%s: is_uma: %d\n", __func__, is_uma);
-
+    bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
+    bool is_uma = prop.unifiedAddressing > 0 || uma_env;
+    
     if (is_uma) {
         // For UMA systems (like DGX Spark), use system memory info
-        long availableMemoryKb = 0;
-        long freeSwapKb = 0;
+        long available_memory_kb = 0;
+        long free_swap_kb = 0;
 
-        if (ggml_backend_cuda_get_available_uma_memory(&availableMemoryKb, &freeSwapKb) && availableMemoryKb > 0) {
-            *free = (size_t)availableMemoryKb * 1024;
+        if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) {
+            *free = (size_t)available_memory_kb * 1024;
         } else {
             GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__);
         }
     }
-
-    GGML_LOG_DEBUG("%s: free: %8.2f MB\n", __func__, *free / (1024.0 * 1024.0));
-    GGML_LOG_DEBUG("%s: total: %8.2f MB\n", __func__, *total / (1024.0 * 1024.0));
 #endif // defined(__linux__)
 
 }

From 4134bc8a98d4a9faefb9f40b1b6795a5f28beb10 Mon Sep 17 00:00:00 2001
From: Scott Fudally
Date: Wed, 19 Nov 2025 08:27:51 -0800
Subject: [PATCH 3/6] More PR feedback cleanup

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 773e64c6d8d29..be12b7b9c5fb8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3746,7 +3746,7 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
 
 #if defined(__linux__)
 // Helper function to get available memory from /proc/meminfo for UMA systems
-static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb, long * freeSwapKb) {
+static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_kb, long * free_swap_kb) {
     FILE * meminfo_file = nullptr;
     // 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough
     const size_t BUFFER_SIZE = 2048;
@@ -3756,7 +3756,7 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb,
     long huge_tlb_free_pages = -1;
     long huge_tlb_page_size = -1;
 
-    if (availableMemoryKb == nullptr || freeSwapKb == nullptr) {
+    if (available_memory_kb == nullptr || free_swap_kb == nullptr) {
         return false;
     }
 
@@ -3776,8 +3776,8 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb,
     }
     file_buffer[bytes_read] = '\0';
 
-    *availableMemoryKb = -1;
-    *freeSwapKb = -1;
+    *available_memory_kb = -1;
+    *free_swap_kb = -1;
 
     // Parse the file buffer line by line
     char * line = file_buffer.get();
@@ -3794,9 +3794,9 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb,
 
         long value;
         if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) {
-            *availableMemoryKb = value;
+            *available_memory_kb = value;
         } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) {
-            *freeSwapKb = value;
+            *free_swap_kb = value;
         } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) {
             huge_tlb_total_pages = value;
         } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) {
@@ -3809,13 +3809,13 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * availableMemoryKb,
     }
 
     if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) {
-        *availableMemoryKb = huge_tlb_free_pages * huge_tlb_page_size;
+        *available_memory_kb = huge_tlb_free_pages * huge_tlb_page_size;
 
         // Hugetlbfs pages are not swappable.
-        *freeSwapKb = 0;
+        *free_swap_kb = 0;
     }
 
-    GGML_LOG_DEBUG("%s: final availableMemoryKb: %ld\n", __func__, *availableMemoryKb);
+    GGML_LOG_DEBUG("%s: final available_memory_kb: %ld\n", __func__, *available_memory_kb);
     return true;
 }
 #endif // defined(__linux__)

From 6fcd8eccbdb08634bad1f3acad6110538fdb3e12 Mon Sep 17 00:00:00 2001
From: Scott Fudally
Date: Wed, 19 Nov 2025 09:18:44 -0800
Subject: [PATCH 4/6] Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Georgi Gerganov
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index be12b7b9c5fb8..891605a6c45ed 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3825,6 +3825,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemGetInfo(free, total));
 
+// ref: https://github.com/ggml-org/llama.cpp/pull/17368
 #if defined(__linux__)
     // Check if this is a UMA (Unified Memory Architecture) system
     cudaDeviceProp prop;

From 580d030afe07a4017f36d150f12ce45dff74ce4c Mon Sep 17 00:00:00 2001
From: Scott Fudally
Date: Wed, 19 Nov 2025 14:00:19 -0800
Subject: [PATCH 5/6] Remove trailing whitespace

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 891605a6c45ed..5bfac2631a073 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3830,11 +3830,11 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
     // Check if this is a UMA (Unified Memory Architecture) system
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
-    
+
     // Check if UMA is explicitly enabled via environment variable
     bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
     bool is_uma = prop.unifiedAddressing > 0 || uma_env;
-    
+
     if (is_uma) {
         // For UMA systems (like DGX Spark), use system memory info
         long available_memory_kb = 0;

From e9c289faa18fd1b2709d5440d8a641b12e814c29 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 20 Nov 2025 12:31:02 +0200
Subject: [PATCH 6/6] Update ggml/src/ggml-cuda/ggml-cuda.cu

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5bfac2631a073..ca20b8cc3080f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3781,15 +3781,15 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
 
     // Parse the file buffer line by line
     char * line = file_buffer.get();
-    char * nextLine;
+    char * line_next;
     while (line < file_buffer.get() + bytes_read) {
         // Find the end of the current line
-        nextLine = strchr(line, '\n');
-        if (nextLine != nullptr) {
-            *nextLine = '\0';
-            nextLine++;
+        line_next = strchr(line, '\n');
+        if (line_next != nullptr) {
+            *line_next = '\0';
+            line_next++;
         } else {
-            nextLine = file_buffer.get() + bytes_read;
+            line_next = file_buffer.get() + bytes_read;
         }
 
         long value;
@@ -3805,7 +3805,7 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
             huge_tlb_page_size = value;
         }
 
-        line = nextLine;
+        line = line_next;
     }
 
     if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) {
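
Note: the /proc/meminfo logic this series adds is self-contained enough to exercise outside of ggml. Below is a minimal standalone sketch of the same parsing approach (MemAvailable/SwapFree plus the hugetlbfs override); it is illustrative, not part of the patch: it uses std::ifstream and std::getline instead of the patch's fread-into-buffer scheme, and the names read_uma_available_kb and meminfo_probe.cpp are hypothetical.

// meminfo_probe.cpp (hypothetical name): standalone sketch of the parsing
// logic above. Build with: g++ -std=c++17 meminfo_probe.cpp -o meminfo_probe
#include <cstdio>
#include <fstream>
#include <string>

// Returns true if a usable available-memory figure was found.
static bool read_uma_available_kb(long & available_kb, long & free_swap_kb) {
    std::ifstream meminfo("/proc/meminfo");
    if (!meminfo.is_open()) {
        return false;
    }

    available_kb = -1;
    free_swap_kb = -1;
    long hp_total = -1;
    long hp_free  = -1;
    long hp_size  = -1;

    std::string line;
    while (std::getline(meminfo, line)) {
        long value;
        // sscanf returns 1 as soon as %ld is assigned, same as in the patch
        if (sscanf(line.c_str(), "MemAvailable: %ld kB", &value) == 1) {
            available_kb = value;
        } else if (sscanf(line.c_str(), "SwapFree: %ld kB", &value) == 1) {
            free_swap_kb = value;
        } else if (sscanf(line.c_str(), "HugePages_Total: %ld", &value) == 1) {
            hp_total = value;
        } else if (sscanf(line.c_str(), "HugePages_Free: %ld", &value) == 1) {
            hp_free = value;
        } else if (sscanf(line.c_str(), "Hugepagesize: %ld kB", &value) == 1) {
            hp_size = value;
        }
    }

    // When hugetlbfs pages are reserved, only the free huge pages are usable,
    // and they are not swappable (mirrors the override in the patch).
    if (hp_total > 0 && hp_free >= 0 && hp_size >= 0) {
        available_kb = hp_free * hp_size;
        free_swap_kb = 0;
    }

    return available_kb > 0;
}

int main() {
    long available_kb = 0;
    long free_swap_kb = 0;
    if (read_uma_available_kb(available_kb, free_swap_kb)) {
        printf("available: %ld kB, free swap: %ld kB\n", available_kb, free_swap_kb);
        return 0;
    }
    printf("failed to read /proc/meminfo\n");
    return 1;
}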
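Usage note: on a device where prop.unifiedAddressing is not reported, the UMA reporting path can still be forced through the GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable that the patch checks, for example (illustrative invocation; any llama.cpp binary that queries device memory exercises this path):

GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./llama-cli -m model.gguf -ngl 99 -p "hello"

Note that after PATCH 2/6 the variable only needs to be set; its value is no longer required to be "1".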