From f74fbc83093231d48418466484324a8ae6308a9c Mon Sep 17 00:00:00 2001 From: Marzieh Lenjani Date: Fri, 29 Aug 2025 11:46:12 -0700 Subject: [PATCH] DCPerf mini (first version of Feedsim mini): Add graph storage/loading optimization and eliminate per-thread graph building (#201) Summary: This diff enhances the DCPerf Feedsim benchmark by adding graph storage and loading optimization capabilities, eliminating redundant graph building across multiple thread runs, and replacing the fixed sleep time with a check for server readiness. **Key changes:** 1. **Shell script enhancements** (`run-feedsim-multi.sh`, `run.sh`): * Added `-S` flag to store generated graphs to a file for reuse across instances * Added `-L` flag to load pre-generated graphs from a file instead of rebuilding per thread * Added `-I` flag to enable instrumenting graph generation * Enhanced help documentation to explain the new optimization options * Updated command line parsing to handle the new flags and pass them through to the underlying executables 2. **Command line options** (`LeafNodeRankCmdline.ggo`): * Added `store_graph` option to enable saving generated graphs to a specified file * Added `load_graph` option to enable loading graphs from a specified file instead of generating new ones * Added `instrument_graph` option to enable measuring the time for graph generation 3. **Performance optimizations:** * **Eliminates per-thread graph building overhead**: Instead of each parallel instance building its own graph, one instance can build and store the graph while others load the pre-built version. 
This also optimizes memory and CPU usage by avoiding redundant graph generation across parallel threads * Reduces benchmark initialization time by replacing the fixed sleep time with checking for server readiness Differential Revision: D80288337 --- benchpress/config/jobs.yml | 62 ++++++ packages/feedsim/run-feedsim-multi.sh | 36 +++- packages/feedsim/run.sh | 53 ++++- .../src/workloads/ranking/LeafNodeRank.cc | 42 +++- .../workloads/ranking/LeafNodeRankCmdline.ggo | 3 + .../src/workloads/ranking/dwarfs/pagerank.cpp | 182 ++++++++++++++++++ .../src/workloads/ranking/dwarfs/pagerank.h | 8 + 7 files changed, 377 insertions(+), 9 deletions(-) diff --git a/benchpress/config/jobs.yml b/benchpress/config/jobs.yml index 1cbf77592..c5f283e45 100644 --- a/benchpress/config/jobs.yml +++ b/benchpress/config/jobs.yml @@ -461,6 +461,36 @@ - 'benchmarks/feedsim/feedsim-multi-inst-*.log' - 'benchmarks/feedsim/src/perf.data' +- name: feedsim_autoscale_mini + benchmark: feedsim_autoscale + description: > + Aggregator like workload. Latency sensitive. + The feedsim_autoscale mini benchmark jobs + are configured with a fixed QPS. + args: + - '-n {num_instances}' + - '-q {fixed_qps}' + - '-d {fixed_qps_duration}' + - '-w {warmup_time}' + - '-S {graph_store_path}' + - '-L {graph_load_path}' + - '{extra_args}' + vars: + - 'num_instances=-1' + - 'fixed_qps=100' + - 'fixed_qps_duration=300' + - 'warmup_time=120' + - 'graph_store_path=default_do_not_store' + - 'graph_load_path=default_do_not_load' + - 'extra_args=' + hooks: + - hook: copymove + options: + is_move: true + after: + - 'benchmarks/feedsim/feedsim_results*.txt' + - 'benchmarks/feedsim/feedsim-multi-inst-*.log' + - 'benchmarks/feedsim/src/perf.data' - name: feedsim_autoscale_arm benchmark: feedsim_autoscale @@ -492,6 +522,38 @@ - 'benchmarks/feedsim/feedsim-multi-inst-*.log' - 'benchmarks/feedsim/src/perf.data' +- name: feedsim_autoscale_arm_mini + benchmark: feedsim_autoscale + description: > + Aggregator like workload. 
Latency sensitive. + The feedsim_autoscale mini benchmark jobs + are configured with a fixed QPS. + Parameters tuned for arm. + args: + - '-n {num_instances}' + - '-i {icache_iterations}' + - '-q {fixed_qps}' + - '-d {fixed_qps_duration}' + - '-w {warmup_time}' + - '{extra_args}' + vars: + - 'num_instances=-1' + - 'icache_iterations=400000' + - 'fixed_qps=100' + - 'fixed_qps_duration=300' + - 'warmup_time=120' + - 'graph_store_path=default_do_not_store' + - 'graph_load_path=default_do_not_load' + - 'extra_args=' + hooks: + - hook: copymove + options: + is_move: true + after: + - 'benchmarks/feedsim/feedsim_results*.txt' + - 'benchmarks/feedsim/feedsim-multi-inst-*.log' + - 'benchmarks/feedsim/src/perf.data' + - benchmark: spark_standalone name: spark_standalone_local diff --git a/packages/feedsim/run-feedsim-multi.sh b/packages/feedsim/run-feedsim-multi.sh index 24a6fc0e2..800474d27 100755 --- a/packages/feedsim/run-feedsim-multi.sh +++ b/packages/feedsim/run-feedsim-multi.sh @@ -21,9 +21,30 @@ NUM_INSTANCES="$(( ( NCPU + 99 ) / 100 ))" NUM_ICACHE_ITERATIONS="1600000" +show_help() { +cat <&2 exit 1 @@ -43,7 +73,7 @@ while [ $# -ne 0 ]; do esac case $1 in - -n|-i) + -n|-i|-S|-L) if [ -z "$2" ]; then echo "Invalid option: '$1' requires an argument" 1>&2 exit 1 @@ -99,10 +129,10 @@ echo > $BREPS_LFILE # shellcheck disable=SC2086 for i in $(seq 1 ${NUM_INSTANCES}); do CORE_RANGE="$(get_cpu_range "${NUM_INSTANCES}" "$((i - 1))")" - CMD="IS_AUTOSCALE_RUN=${NUM_INSTANCES} taskset --cpu-list ${CORE_RANGE} ${FEEDSIM_ROOT}/run.sh -p ${PORT} -i ${NUM_ICACHE_ITERATIONS} -o feedsim_results_${FIXQPS_SUFFIX}${i}.txt $*" + CMD="IS_AUTOSCALE_RUN=${NUM_INSTANCES} taskset --cpu-list ${CORE_RANGE} ${FEEDSIM_ROOT}/run.sh -p ${PORT} -i ${NUM_ICACHE_ITERATIONS} -o feedsim_results_${FIXQPS_SUFFIX}${i}.txt ${STORE_GRAPH} ${LOAD_GRAPH} ${INSTRUMENT_GRAPH} $*" echo "$CMD" > "${FEEDSIM_LOG_PREFIX}${i}.log" # shellcheck disable=SC2068,SC2069 - IS_AUTOSCALE_RUN=${NUM_INSTANCES} stdbuf -i0 -o0 
-e0 taskset --cpu-list "${CORE_RANGE}" "${FEEDSIM_ROOT}"/run.sh -p "${PORT}" -i "${NUM_ICACHE_ITERATIONS}" -o "feedsim_results_${FIXQPS_SUFFIX}${i}.txt" $@ 2>&1 > "${FEEDSIM_LOG_PREFIX}${i}.log" & + IS_AUTOSCALE_RUN=${NUM_INSTANCES} stdbuf -i0 -o0 -e0 taskset --cpu-list "${CORE_RANGE}" "${FEEDSIM_ROOT}"/run.sh -p "${PORT}" -i "${NUM_ICACHE_ITERATIONS}" -o "feedsim_results_${FIXQPS_SUFFIX}${i}.txt" ${STORE_GRAPH} ${LOAD_GRAPH} ${INSTRUMENT_GRAPH} $@ 2>&1 > "${FEEDSIM_LOG_PREFIX}${i}.log" & PIDS+=("$!") PHY_CORE_ID=$((PHY_CORE_ID + CORES_PER_INST)) SMT_ID=$((SMT_ID + CORES_PER_INST)) diff --git a/packages/feedsim/run.sh b/packages/feedsim/run.sh index 83a1eebbc..db313518a 100644 --- a/packages/feedsim/run.sh +++ b/packages/feedsim/run.sh @@ -65,6 +65,9 @@ Usage: ${0##*/} [OPTION]... -d Duration of each load testing experiment, in seconds. Default: 300 -p Port to use by the LeafNodeRank server and the load drivers. Default: 11222 -o Result output file name. Default: "feedsim_results.txt" + -S Store the generated graph to a file (requires a file path) + -L Load a graph from a file instead of generating one (requires a file path) + -I Enable timing instrumentation for graph operations (build, store, load) EOF } @@ -122,6 +125,17 @@ main() { local icache_iterations icache_iterations="1600000" + # Graph storage and loading options + local store_graph + store_graph="" + + local load_graph + load_graph="" + + local instrument_graph + instrument_graph="" + + if [ -z "$IS_AUTOSCALE_RUN" ]; then echo > $BREPS_LFILE fi @@ -162,6 +176,19 @@ main() { -i) icache_iterations="$2" ;; + -S) + if [ "$2" != "default_do_not_store" ]; then + store_graph="--store_graph=$2" + fi + ;; + -L) + if [ "$2" != "default_do_not_load" ]; then + load_graph="--load_graph=$2" + fi + ;; + -I) + instrument_graph="--instrument_graph" + ;; -h|--help) show_help >&2 exit 1 @@ -172,7 +199,7 @@ main() { esac case $1 in - -t|-c|-s|-d|-p|-q|-o|-w|-i|-l) + -t|-c|-s|-d|-p|-q|-o|-w|-i|-l|-S|-L) if [ -z "$2" ]; then 
echo "Invalid option: '$1' requires an argument" 1>&2 exit 1 @@ -208,13 +235,29 @@ main() { --num_objects=2000 \ --graph_max_iters=1 \ --noaffinity \ - --min_icache_iterations="$icache_iterations" & + --min_icache_iterations="$icache_iterations" \ + "$store_graph" \ + "$load_graph" \ + "$instrument_graph" >> $BREPS_LFILE 2>&1 & LEAF_PID=$! - # FIXME(cltorres) - # Remove sleep, expose an endpoint or print a message to notify service is ready - sleep 30 + # Wait for server to be fully ready using monitoring endpoint + echo "Waiting for LeafNodeRank server to be ready on monitor port $monitor_port..." + max_attempts=30 + attempt=0 + while [ $attempt -lt $max_attempts ]; do + if curl -f -s "http://localhost:$monitor_port/topology" > /dev/null 2>&1; then + echo "LeafNodeRank server is ready (monitor port responding)" + break + fi + attempt=$((attempt + 1)) + if [ $attempt -eq $max_attempts ]; then + echo "ERROR: Server failed to become ready within $max_attempts seconds" + exit 1 + fi + sleep 1 + done # FIXME(cltorres) # Skip ParentNode for now, and talk directly to LeafNode diff --git a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc index c0ccb2352..33143b2c4 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc +++ b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc @@ -75,6 +75,9 @@ struct ThreadData { std::string random_string; }; +// Global graph that will be shared across threads +CSRGraph g_shared_graph; + void ThreadStartup( oldisim::NodeThread& thread, std::vector& thread_data, @@ -85,7 +88,8 @@ void ThreadStartup( const std::shared_ptr& ioThreadPool, const std::shared_ptr& timekeeperPool) { auto& this_thread = thread_data[thread.get_thread_num()]; - auto graph = params.buildGraph(); + // auto graph = params.buildGraph(); + auto graph = params.makeGraphCopy(g_shared_graph); this_thread.cpuThreadPool = cpuThreadPool; 
this_thread.srvCPUThreadPool = srvCPUThreadPool; this_thread.srvIOThreadPool = srvIOThreadPool; @@ -307,6 +311,42 @@ int main(int argc, char** argv) { std::vector thread_data(args.threads_arg); ranking::dwarfs::PageRankParams params{ args.graph_scale_arg, args.graph_degree_arg}; + + // create or load a graph + + if (args.load_graph_given) { + if (args.instrument_graph_given) { + auto start_load = std::chrono::steady_clock::now(); + g_shared_graph = params.loadGraphFromFile(args.load_graph_arg); + auto end_load = std::chrono::steady_clock::now(); + auto load_duration = std::chrono::duration_cast(end_load - start_load).count(); + std::cout << "Graph loading time: " << load_duration << " ms" << std::endl; + } else { + g_shared_graph = params.loadGraphFromFile(args.load_graph_arg); + } + } else { + if (args.instrument_graph_given) { + auto start_build = std::chrono::steady_clock::now(); + g_shared_graph = params.buildGraph(); + auto end_build = std::chrono::steady_clock::now(); + auto build_duration = std::chrono::duration_cast(end_build - start_build).count(); + std::cout << "Graph building time: " << build_duration << " ms" << std::endl; + + if (args.store_graph_given) { + auto start_store = std::chrono::steady_clock::now(); + params.storeGraphToFile(g_shared_graph, args.store_graph_arg); + auto end_store = std::chrono::steady_clock::now(); + auto store_duration = std::chrono::duration_cast(end_store - start_store).count(); + std::cout << "Graph storing time: " << store_duration << " ms" << std::endl; + } + } else { + g_shared_graph = params.buildGraph(); + if (args.store_graph_given) { + params.storeGraphToFile(g_shared_graph, args.store_graph_arg); + } + } + } + oldisim::LeafNodeServer server(args.port_arg); server.SetThreadStartupCallback([&](auto&& thread) { return ThreadStartup( diff --git a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo index 
7a5ddb6c7..d58587cea 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo +++ b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo @@ -28,6 +28,9 @@ option "graph_max_iters" - "Perform at most 'graph_max_iters' iterations during option "graph_subset" - "Perform partial PageRank over these numbers of nodes. 0 indicates all nodes." int default="3145728" option "num_objects" - "Number of objects to serialize." int default="40" option "random_data_size" - "Number of bytes of string random data." int default="3145728" +option "store_graph" - "Enable storing the generated graph to a file." string typestr="filename" optional +option "load_graph" - "Enable loading a graph from a file instead of generating one." string typestr="filename" optional +option "instrument_graph" - "Enable timing instrumentation for graph operations (build, store, load)." option "max_response_size" - "Maximum response size in bytes returned by the leaf server." int default="131072" option "compression_data_size" - "Number of bytes to compress per request." int default="131072" option "rank_trials_per_thread" - "Number of iterations each CPU thread executes of rank work." 
int default="1" diff --git a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp index 6f5f79485..50146dac0 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp +++ b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp @@ -25,9 +25,12 @@ #include "pagerank.h" #include +#include #include #include +#include #include +#include #include #include @@ -72,6 +75,185 @@ PageRankParams::~PageRankParams() = default; CSRGraph PageRankParams::buildGraph() { return pimpl->makeGraph(); } +CSRGraph PageRankParams::makeGraphCopy( + const CSRGraph& original) { + // Create a deep copy of the graph + const int64_t num_nodes = original.num_nodes(); + const int64_t num_edges = original.num_edges(); + + // Allocate memory for the new graph's index and neighbors arrays + int32_t** out_index = new int32_t*[num_nodes + 1]; + int32_t* out_neighbors = new int32_t[num_edges]; + auto is_directed = original.directed(); + + // Copy all outgoing neighbors data efficiently + std::copy( + original.out_neigh(0).begin(), + original.out_neigh(num_nodes - 1).end(), + out_neighbors); + // Set up index pointers for each node + #pragma omp parallel for + for (int64_t n = 0; n < num_nodes; n++) { + //TODO: check this is correct + out_index[n] = out_neighbors + + (original.out_neigh(n).begin() - original.out_neigh(0).begin()); + } + // Set the last index pointer + out_index[num_nodes] = out_neighbors + num_edges; + + // If the graph is directed and has incoming edges, copy those too + if (is_directed) { + int32_t** in_index = new int32_t*[num_nodes + 1]; + int32_t* in_neighbors = new int32_t[num_edges]; + + // Copy all incoming neighbors data efficiently + std::copy( + original.in_neigh(0).begin(), + original.in_neigh(num_nodes - 1).end(), + in_neighbors); + + // Set up index pointers for each node + #pragma omp parallel for + for (int64_t n = 0; n < num_nodes; n++) { 
+ in_index[n] = in_neighbors + + (original.in_neigh(n).begin() - original.in_neigh(0).begin()); + } + // Set the last index pointer + in_index[num_nodes] = in_neighbors + num_edges; + + return CSRGraph( + num_nodes, out_index, out_neighbors, in_index, in_neighbors); + } else { + return CSRGraph(num_nodes, out_index, out_neighbors); + } +} + +void PageRankParams::storeGraphToFile( + const CSRGraph& original, + const std::string& filePath) { + std::ofstream outFile(filePath, std::ios::binary); + if (!outFile.is_open()) { + throw std::runtime_error("Unable to open file for writing: " + filePath); + } + + // Write basic graph properties + bool is_directed = original.directed(); + int64_t num_nodes = original.num_nodes(); + int64_t num_edges = original.num_edges(); + + outFile.write( + reinterpret_cast(&is_directed), sizeof(is_directed)); + outFile.write(reinterpret_cast(&num_nodes), sizeof(num_nodes)); + outFile.write(reinterpret_cast(&num_edges), sizeof(num_edges)); + + // Write out_index array (size: num_nodes + 1 pointers, but we store offsets) + std::vector out_offsets(num_nodes + 1); + for (int64_t n = 0; n <= num_nodes; n++) { + if (n == 0) { + out_offsets[n] = 0; + } else if (n == num_nodes) { + out_offsets[n] = num_edges; + } else { + out_offsets[n] = + original.out_neigh(n).begin() - original.out_neigh(0).begin(); + } + } + outFile.write( + reinterpret_cast(out_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Write out_neighbors array + outFile.write( + reinterpret_cast(original.out_neigh(0).begin()), + num_edges * sizeof(int32_t)); + + // If directed, write incoming edges data + if (is_directed) { + // Write in_index array (as offsets) + std::vector in_offsets(num_nodes + 1); + for (int64_t n = 0; n <= num_nodes; n++) { + if (n == 0) { + in_offsets[n] = 0; + } else if (n == num_nodes) { + in_offsets[n] = num_edges; + } else { + in_offsets[n] = original.in_neigh(n).begin() - original.in_neigh(0).begin(); + } + } + outFile.write( + 
reinterpret_cast(in_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Write in_neighbors array + outFile.write( + reinterpret_cast(original.in_neigh(0).begin()), + num_edges * sizeof(int32_t)); + } + + outFile.close(); +} + +CSRGraph PageRankParams::loadGraphFromFile( + const std::string& filePath) { + std::ifstream inFile(filePath, std::ios::binary); + if (!inFile.is_open()) { + throw std::runtime_error("Unable to open file for reading: " + filePath); + } + + // Read basic graph properties + bool is_directed; + int64_t num_nodes; + int64_t num_edges; + + inFile.read(reinterpret_cast(&is_directed), sizeof(is_directed)); + inFile.read(reinterpret_cast(&num_nodes), sizeof(num_nodes)); + inFile.read(reinterpret_cast(&num_edges), sizeof(num_edges)); + + // Read out_index offsets and convert to pointers + std::vector out_offsets(num_nodes + 1); + inFile.read( + reinterpret_cast(out_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Allocate and read out_neighbors array + int32_t* out_neighbors = new int32_t[num_edges]; + inFile.read( + reinterpret_cast(out_neighbors), num_edges * sizeof(int32_t)); + + // Create out_index pointer array + int32_t** out_index = new int32_t*[num_nodes + 1]; + #pragma omp parallel for + for (int64_t n = 0; n <= num_nodes; n++) { + out_index[n] = out_neighbors + out_offsets[n]; + } + + if (is_directed) { + // Read in_index offsets and convert to pointers + std::vector in_offsets(num_nodes + 1); + inFile.read( + reinterpret_cast(in_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Allocate and read in_neighbors array + int32_t* in_neighbors = new int32_t[num_edges]; + inFile.read( + reinterpret_cast(in_neighbors), num_edges * sizeof(int32_t)); + + // Create in_index pointer array + int32_t** in_index = new int32_t*[num_nodes + 1]; + #pragma omp parallel for + for (int64_t n = 0; n <= num_nodes; n++) { + in_index[n] = in_neighbors + in_offsets[n]; + } + + inFile.close(); + return CSRGraph( + num_nodes, 
out_index, out_neighbors, in_index, in_neighbors); + } else { + inFile.close(); + return CSRGraph(num_nodes, out_index, out_neighbors); + } +} PageRank::PageRank(CSRGraph graph, int num_pvectors_entries) : graph_(std::move(graph)), num_pvectors_entries_(num_pvectors_entries) { diff --git a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h index 20c789f5f..3c3f50dd5 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h +++ b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h @@ -42,6 +42,14 @@ class PageRankParams { ~PageRankParams(); CSRGraph buildGraph(); + CSRGraph makeGraphCopy(const CSRGraph& original); + + void storeGraphToFile( + const CSRGraph& original, + const std::string& filePath); + + CSRGraph loadGraphFromFile( + const std::string& filePath); private: struct Impl;