diff --git a/benchpress/config/jobs.yml b/benchpress/config/jobs.yml index 1cbf77592..c5f283e45 100644 --- a/benchpress/config/jobs.yml +++ b/benchpress/config/jobs.yml @@ -461,6 +461,36 @@ - 'benchmarks/feedsim/feedsim-multi-inst-*.log' - 'benchmarks/feedsim/src/perf.data' +- name: feedsim_autoscale_mini + benchmark: feedsim_autoscale + description: > + Aggregator like workload. Latency sensitive. + The feedsim_autoscale mini benchmark jobs + are configured with a fixed QPS. + args: + - '-n {num_instances}' + - '-q {fixed_qps}' + - '-d {fixed_qps_duration}' + - '-w {warmup_time}' + - '-S {graph_store_path}' + - '-L {graph_load_path}' + - '{extra_args}' + vars: + - 'num_instances=-1' + - 'fixed_qps=100' + - 'fixed_qps_duration=300' + - 'warmup_time=120' + - 'graph_store_path=default_do_not_store' + - 'graph_load_path=default_do_not_load' + - 'extra_args=' + hooks: + - hook: copymove + options: + is_move: true + after: + - 'benchmarks/feedsim/feedsim_results*.txt' + - 'benchmarks/feedsim/feedsim-multi-inst-*.log' + - 'benchmarks/feedsim/src/perf.data' - name: feedsim_autoscale_arm benchmark: feedsim_autoscale @@ -492,6 +522,38 @@ - 'benchmarks/feedsim/feedsim-multi-inst-*.log' - 'benchmarks/feedsim/src/perf.data' +- name: feedsim_autoscale_arm_mini + benchmark: feedsim_autoscale + description: > + Aggregator like workload. Latency sensitive. + The feedsim_autoscale mini benchmark jobs + are configured with a fixed QPS. + Parameters tuned for arm. + args: + - '-n {num_instances}' + - '-i {icache_iterations}' + - '-q {fixed_qps}' + - '-d {fixed_qps_duration}' + - '-w {warmup_time}' + - '{extra_args}' + vars: + - 'num_instances=-1' + - 'icache_iterations=400000' + - 'fixed_qps=100' + - 'fixed_qps_duration=300' + - 'warmup_time=120' + - 'graph_store_path=default_do_not_store' + - 'graph_load_path=default_do_not_load' + - 'extra_args=' + hooks: + - hook: copymove + options: + is_move: true + after: + - 'benchmarks/feedsim/feedsim_results*.txt' + - 'benchmarks/feedsim/feedsim-multi-inst-*.log' + - 'benchmarks/feedsim/src/perf.data' + - benchmark: spark_standalone name: spark_standalone_local diff --git a/packages/feedsim/run-feedsim-multi.sh b/packages/feedsim/run-feedsim-multi.sh index 24a6fc0e2..800474d27 100755 --- a/packages/feedsim/run-feedsim-multi.sh +++ b/packages/feedsim/run-feedsim-multi.sh @@ -21,9 +21,30 @@ NUM_INSTANCES="$(( ( NCPU + 99 ) / 100 ))" NUM_ICACHE_ITERATIONS="1600000" +show_help() { +cat <&2 exit 1 @@ -43,7 +73,7 @@ while [ $# -ne 0 ]; do esac case $1 in - -n|-i) + -n|-i|-S|-L) if [ -z "$2" ]; then echo "Invalid option: '$1' requires an argument" 1>&2 exit 1 @@ -99,10 +129,10 @@ echo > $BREPS_LFILE # shellcheck disable=SC2086 for i in $(seq 1 ${NUM_INSTANCES}); do CORE_RANGE="$(get_cpu_range "${NUM_INSTANCES}" "$((i - 1))")" - CMD="IS_AUTOSCALE_RUN=${NUM_INSTANCES} taskset --cpu-list ${CORE_RANGE} ${FEEDSIM_ROOT}/run.sh -p ${PORT} -i ${NUM_ICACHE_ITERATIONS} -o feedsim_results_${FIXQPS_SUFFIX}${i}.txt $*" + CMD="IS_AUTOSCALE_RUN=${NUM_INSTANCES} taskset --cpu-list ${CORE_RANGE} ${FEEDSIM_ROOT}/run.sh -p ${PORT} -i ${NUM_ICACHE_ITERATIONS} -o feedsim_results_${FIXQPS_SUFFIX}${i}.txt ${STORE_GRAPH} ${LOAD_GRAPH} ${INSTRUMENT_GRAPH} $*" echo "$CMD" > "${FEEDSIM_LOG_PREFIX}${i}.log" # shellcheck disable=SC2068,SC2069 - IS_AUTOSCALE_RUN=${NUM_INSTANCES} stdbuf -i0 -o0 -e0 taskset --cpu-list "${CORE_RANGE}" "${FEEDSIM_ROOT}"/run.sh -p "${PORT}" -i "${NUM_ICACHE_ITERATIONS}" -o "feedsim_results_${FIXQPS_SUFFIX}${i}.txt" $@ 2>&1 > "${FEEDSIM_LOG_PREFIX}${i}.log" & + IS_AUTOSCALE_RUN=${NUM_INSTANCES} stdbuf -i0 -o0 -e0 taskset --cpu-list "${CORE_RANGE}" "${FEEDSIM_ROOT}"/run.sh -p "${PORT}" -i "${NUM_ICACHE_ITERATIONS}" -o "feedsim_results_${FIXQPS_SUFFIX}${i}.txt" ${STORE_GRAPH} ${LOAD_GRAPH} ${INSTRUMENT_GRAPH} $@ 2>&1 > "${FEEDSIM_LOG_PREFIX}${i}.log" & PIDS+=("$!") PHY_CORE_ID=$((PHY_CORE_ID + CORES_PER_INST)) SMT_ID=$((SMT_ID + CORES_PER_INST)) diff --git a/packages/feedsim/run.sh b/packages/feedsim/run.sh index 83a1eebbc..db313518a 100644 --- a/packages/feedsim/run.sh +++ b/packages/feedsim/run.sh @@ -65,6 +65,9 @@ Usage: ${0##*/} [OPTION]... -d Duration of each load testing experiment, in seconds. Default: 300 -p Port to use by the LeafNodeRank server and the load drivers. Default: 11222 -o Result output file name. Default: "feedsim_results.txt" + -S Store the generated graph to a file (requires a file path) + -L Load a graph from a file instead of generating one (requires a file path) + -I Enable timing instrumentation for graph operations (build, store, load) EOF } @@ -122,6 +125,17 @@ main() { local icache_iterations icache_iterations="1600000" + # Graph storage and loading options + local store_graph + store_graph="" + + local load_graph + load_graph="" + + local instrument_graph + instrument_graph="" + + if [ -z "$IS_AUTOSCALE_RUN" ]; then echo > $BREPS_LFILE fi @@ -162,6 +176,19 @@ main() { -i) icache_iterations="$2" ;; + -S) + if [ "$2" != "default_do_not_store" ]; then + store_graph="--store_graph=$2" + fi + ;; + -L) + if [ "$2" != "default_do_not_load" ]; then + load_graph="--load_graph=$2" + fi + ;; + -I) + instrument_graph="--instrument_graph" + ;; -h|--help) show_help >&2 exit 1 @@ -172,7 +199,7 @@ main() { esac case $1 in - -t|-c|-s|-d|-p|-q|-o|-w|-i|-l) + -t|-c|-s|-d|-p|-q|-o|-w|-i|-l|-S|-L) if [ -z "$2" ]; then echo "Invalid option: '$1' requires an argument" 1>&2 exit 1 @@ -208,13 +235,29 @@ main() { --num_objects=2000 \ --graph_max_iters=1 \ --noaffinity \ - --min_icache_iterations="$icache_iterations" & + --min_icache_iterations="$icache_iterations" \ + "$store_graph" \ + "$load_graph" \ + "$instrument_graph" >> $BREPS_LFILE 2>&1 & LEAF_PID=$! - # FIXME(cltorres) - # Remove sleep, expose an endpoint or print a message to notify service is ready - sleep 30 + # Wait for server to be fully ready using monitoring endpoint + echo "Waiting for LeafNodeRank server to be ready on monitor port $monitor_port..." + max_attempts=30 + attempt=0 + while [ $attempt -lt $max_attempts ]; do + if curl -f -s "http://localhost:$monitor_port/topology" > /dev/null 2>&1; then + echo "LeafNodeRank server is ready (monitor port responding)" + break + fi + attempt=$((attempt + 1)) + if [ $attempt -eq $max_attempts ]; then + echo "ERROR: Server failed to become ready within $max_attempts seconds" + exit 1 + fi + sleep 1 + done # FIXME(cltorres) # Skip ParentNode for now, and talk directly to LeafNode diff --git a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc index c0ccb2352..33143b2c4 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc +++ b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRank.cc @@ -75,6 +75,9 @@ struct ThreadData { std::string random_string; }; +// Global graph that will be shared across threads +CSRGraph g_shared_graph; + void ThreadStartup( oldisim::NodeThread& thread, std::vector& thread_data, @@ -85,7 +88,8 @@ void ThreadStartup( const std::shared_ptr& ioThreadPool, const std::shared_ptr& timekeeperPool) { auto& this_thread = thread_data[thread.get_thread_num()]; - auto graph = params.buildGraph(); + // auto graph = params.buildGraph(); + auto graph = params.makeGraphCopy(g_shared_graph); this_thread.cpuThreadPool = cpuThreadPool; this_thread.srvCPUThreadPool = srvCPUThreadPool; this_thread.srvIOThreadPool = srvIOThreadPool; @@ -307,6 +311,42 @@ int main(int argc, char** argv) { std::vector thread_data(args.threads_arg); ranking::dwarfs::PageRankParams params{ args.graph_scale_arg, args.graph_degree_arg}; + + // create or load a graph + + if (args.load_graph_given) { + if (args.instrument_graph_given) { + auto start_load = std::chrono::steady_clock::now(); + g_shared_graph = params.loadGraphFromFile(args.load_graph_arg); + auto end_load = std::chrono::steady_clock::now(); + auto load_duration = std::chrono::duration_cast(end_load - start_load).count(); + std::cout << "Graph loading time: " << load_duration << " ms" << std::endl; + } else { + g_shared_graph = params.loadGraphFromFile(args.load_graph_arg); + } + } else { + if (args.instrument_graph_given) { + auto start_build = std::chrono::steady_clock::now(); + g_shared_graph = params.buildGraph(); + auto end_build = std::chrono::steady_clock::now(); + auto build_duration = std::chrono::duration_cast(end_build - start_build).count(); + std::cout << "Graph building time: " << build_duration << " ms" << std::endl; + + if (args.store_graph_given) { + auto start_store = std::chrono::steady_clock::now(); + params.storeGraphToFile(g_shared_graph, args.store_graph_arg); + auto end_store = std::chrono::steady_clock::now(); + auto store_duration = std::chrono::duration_cast(end_store - start_store).count(); + std::cout << "Graph storing time: " << store_duration << " ms" << std::endl; + } + } else { + g_shared_graph = params.buildGraph(); + if (args.store_graph_given) { + params.storeGraphToFile(g_shared_graph, args.store_graph_arg); + } + } + } + oldisim::LeafNodeServer server(args.port_arg); server.SetThreadStartupCallback([&](auto&& thread) { return ThreadStartup( diff --git a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo index 7a5ddb6c7..d58587cea 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo +++ b/packages/feedsim/third_party/src/workloads/ranking/LeafNodeRankCmdline.ggo @@ -28,6 +28,9 @@ option "graph_max_iters" - "Perform at most 'graph_max_iters' iterations during option "graph_subset" - "Perform partial PageRank over these numbers of nodes. 0 indicates all nodes." int default="3145728" option "num_objects" - "Number of objects to serialize." int default="40" option "random_data_size" - "Number of bytes of string random data." int default="3145728" +option "store_graph" - "Enable storing the generated graph to a file." string typestr="filename" optional +option "load_graph" - "Enable loading a graph from a file instead of generating one." string typestr="filename" optional +option "instrument_graph" - "Enable timing instrumentation for graph operations (build, store, load)." option "max_response_size" - "Maximum response size in bytes returned by the leaf server." int default="131072" option "compression_data_size" - "Number of bytes to compress per request." int default="131072" option "rank_trials_per_thread" - "Number of iterations each CPU thread executes of rank work." int default="1" diff --git a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp index 6f5f79485..50146dac0 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp +++ b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.cpp @@ -25,9 +25,12 @@ #include "pagerank.h" #include +#include #include #include +#include #include +#include #include #include @@ -72,6 +75,185 @@ PageRankParams::~PageRankParams() = default; CSRGraph PageRankParams::buildGraph() { return pimpl->makeGraph(); } +CSRGraph PageRankParams::makeGraphCopy( + const CSRGraph& original) { + // Create a deep copy of the graph + const int64_t num_nodes = original.num_nodes(); + const int64_t num_edges = original.num_edges(); + + // Allocate memory for the new graph's index and neighbors arrays + int32_t** out_index = new int32_t*[num_nodes + 1]; + int32_t* out_neighbors = new int32_t[num_edges]; + auto is_directed = original.directed(); + + // Copy all outgoing neighbors data efficiently + std::copy( + original.out_neigh(0).begin(), + original.out_neigh(num_nodes - 1).end(), + out_neighbors); + // Set up index pointers for each node + #pragma omp parallel for + for (int64_t n = 0; n < num_nodes; n++) { + //TODO: check this is correct + out_index[n] = out_neighbors + + (original.out_neigh(n).begin() - original.out_neigh(0).begin()); + } + // Set the last index pointer + out_index[num_nodes] = out_neighbors + num_edges; + + // If the graph is directed and has incoming edges, copy those too + if (is_directed) { + int32_t** in_index = new int32_t*[num_nodes + 1]; + int32_t* in_neighbors = new int32_t[num_edges]; + + // Copy all incoming neighbors data efficiently + std::copy( + original.in_neigh(0).begin(), + original.in_neigh(num_nodes - 1).end(), + in_neighbors); + + // Set up index pointers for each node + #pragma omp parallel for + for (int64_t n = 0; n < num_nodes; n++) { + in_index[n] = in_neighbors + + (original.in_neigh(n).begin() - original.in_neigh(0).begin()); + } + // Set the last index pointer + in_index[num_nodes] = in_neighbors + num_edges; + + return CSRGraph( + num_nodes, out_index, out_neighbors, in_index, in_neighbors); + } else { + return CSRGraph(num_nodes, out_index, out_neighbors); + } +} + +void PageRankParams::storeGraphToFile( + const CSRGraph& original, + const std::string& filePath) { + std::ofstream outFile(filePath, std::ios::binary); + if (!outFile.is_open()) { + throw std::runtime_error("Unable to open file for writing: " + filePath); + } + + // Write basic graph properties + bool is_directed = original.directed(); + int64_t num_nodes = original.num_nodes(); + int64_t num_edges = original.num_edges(); + + outFile.write( + reinterpret_cast(&is_directed), sizeof(is_directed)); + outFile.write(reinterpret_cast(&num_nodes), sizeof(num_nodes)); + outFile.write(reinterpret_cast(&num_edges), sizeof(num_edges)); + + // Write out_index array (size: num_nodes + 1 pointers, but we store offsets) + std::vector out_offsets(num_nodes + 1); + for (int64_t n = 0; n <= num_nodes; n++) { + if (n == 0) { + out_offsets[n] = 0; + } else if (n == num_nodes) { + out_offsets[n] = num_edges; + } else { + out_offsets[n] = + original.out_neigh(n).begin() - original.out_neigh(0).begin(); + } + } + outFile.write( + reinterpret_cast(out_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Write out_neighbors array + outFile.write( + reinterpret_cast(original.out_neigh(0).begin()), + num_edges * sizeof(int32_t)); + + // If directed, write incoming edges data + if (is_directed) { + // Write in_index array (as offsets) + std::vector in_offsets(num_nodes + 1); + for (int64_t n = 0; n <= num_nodes; n++) { + if (n == 0) { + in_offsets[n] = 0; + } else if (n == num_nodes) { + in_offsets[n] = num_edges; + } else { + in_offsets[n] = original.in_neigh(n).begin() - original.in_neigh(0).begin(); + } + } + outFile.write( + reinterpret_cast(in_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Write in_neighbors array + outFile.write( + reinterpret_cast(original.in_neigh(0).begin()), + num_edges * sizeof(int32_t)); + } + + outFile.close(); +} + +CSRGraph PageRankParams::loadGraphFromFile( + const std::string& filePath) { + std::ifstream inFile(filePath, std::ios::binary); + if (!inFile.is_open()) { + throw std::runtime_error("Unable to open file for reading: " + filePath); + } + + // Read basic graph properties + bool is_directed; + int64_t num_nodes; + int64_t num_edges; + + inFile.read(reinterpret_cast(&is_directed), sizeof(is_directed)); + inFile.read(reinterpret_cast(&num_nodes), sizeof(num_nodes)); + inFile.read(reinterpret_cast(&num_edges), sizeof(num_edges)); + + // Read out_index offsets and convert to pointers + std::vector out_offsets(num_nodes + 1); + inFile.read( + reinterpret_cast(out_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Allocate and read out_neighbors array + int32_t* out_neighbors = new int32_t[num_edges]; + inFile.read( + reinterpret_cast(out_neighbors), num_edges * sizeof(int32_t)); + + // Create out_index pointer array + int32_t** out_index = new int32_t*[num_nodes + 1]; + #pragma omp parallel for + for (int64_t n = 0; n <= num_nodes; n++) { + out_index[n] = out_neighbors + out_offsets[n]; + } + + if (is_directed) { + // Read in_index offsets and convert to pointers + std::vector in_offsets(num_nodes + 1); + inFile.read( + reinterpret_cast(in_offsets.data()), + (num_nodes + 1) * sizeof(int64_t)); + + // Allocate and read in_neighbors array + int32_t* in_neighbors = new int32_t[num_edges]; + inFile.read( + reinterpret_cast(in_neighbors), num_edges * sizeof(int32_t)); + + // Create in_index pointer array + int32_t** in_index = new int32_t*[num_nodes + 1]; + #pragma omp parallel for + for (int64_t n = 0; n <= num_nodes; n++) { + in_index[n] = in_neighbors + in_offsets[n]; + } + + inFile.close(); + return CSRGraph( + num_nodes, out_index, out_neighbors, in_index, in_neighbors); + } else { + inFile.close(); + return CSRGraph(num_nodes, out_index, out_neighbors); + } +} PageRank::PageRank(CSRGraph graph, int num_pvectors_entries) : graph_(std::move(graph)), num_pvectors_entries_(num_pvectors_entries) { diff --git a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h index 20c789f5f..3c3f50dd5 100644 --- a/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h +++ b/packages/feedsim/third_party/src/workloads/ranking/dwarfs/pagerank.h @@ -42,6 +42,14 @@ class PageRankParams { ~PageRankParams(); CSRGraph buildGraph(); + CSRGraph makeGraphCopy(const CSRGraph& original); + + void storeGraphToFile( + const CSRGraph& original, + const std::string& filePath); + + CSRGraph loadGraphFromFile( + const std::string& filePath); private: struct Impl;