/* * Copyright (c) 2014-2020 Embedded Systems and Applications, TU Darmstadt. * * This file is part of TaPaSCo * (see https://github.com/esa-tu-darmstadt/tapasco). * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #define RUNS 10 #define PAGE_SIZE 4096 typedef int32_t element_type; typedef int64_t numblock_type; // this function is used to init the array static void init_array(element_type *arr, const numblock_type num_blocks) { for (numblock_type i = 0; i < num_blocks; ++i){ arr[i] = (element_type)i; } } // this function check whether the input and result match with each other static uint64_t check_interface(element_type *input, element_type *result, const numblock_type num_blocks) { unsigned int errs = 0; for (numblock_type i = 0; i < num_blocks; ++i) { if (input[i] != result[i] ) { std::cerr << "ERROR: Value at " << i << " is " << input[i] <<" vs "<< result[i] << std::endl; ++errs; } } return errs; } int run_p2p(tapasco::Tapasco *dev0, tapasco::Tapasco *dev1, tapasco::PEId interface_test_id, numblock_type num_blocks) { uint64_t errs = 0; std::cout<<"testing on demand P2P"<(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); auto *output0 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // init the array init_array(input0,num_blocks); // allocate array for second job auto *output1 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // allocate array for third job auto *output2 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // insert the timer auto start = std::chrono::high_resolution_clock::now(); // the first job is copying the value from input0 into output0 // it use on-demand to migrate data auto job = dev0->launch(interface_test_id, tapasco::makeVirtualAddress(input0), tapasco::makeVirtualAddress(output0), num_blocks); job(); // end the timer auto stop = std::chrono::high_resolution_clock::now(); // record the latency is micro second auto duration = std::chrono::duration_cast(stop - start); // start the timer of the second job, this involve the copying of output 0 // from one FPGA into another FPGA and it should use P2P. // the output1 is coming from host auto start2 = std::chrono::high_resolution_clock::now(); // it also use on-demand auto job2 = dev1->launch(interface_test_id, tapasco::makeVirtualAddress( output0),tapasco::makeVirtualAddress(output1), num_blocks); job2(); // time the code auto stop2 = std::chrono::high_resolution_clock::now(); // record the latency auto duration2 = std::chrono::duration_cast(stop2 - start2); // now the third job start auto start3 = std::chrono::high_resolution_clock::now(); // the third job is going to copy the output 1 from the second FPGA // back to the first FPGA. It should use P2P. // output2 is from host // it also use on demand auto job3 = dev0->launch(interface_test_id, tapasco::makeVirtualAddress( output1),tapasco::makeVirtualAddress(output2), num_blocks); job3(); // stop the timer auto stop3 = std::chrono::high_resolution_clock::now(); // record the latency auto duration3 = std::chrono::duration_cast(stop3 - start3); // std::cerr<<(num_blocks * sizeof(element_type))/PAGE_SIZE<<","<(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); auto *output0 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // init the array init_array(input0,num_blocks); // allocate array for second job auto *output1 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // allocate array for third job auto *output2 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); // insert the timer auto start = std::chrono::high_resolution_clock::now(); // the first job is copying the value from input0 into output0 // Here is the launch for user-managed auto job = dev0->launch(interface_test_id, tapasco::makeWrappedPointer( output0, num_blocks* sizeof(element_type)),tapasco::makeWrappedPointer( output1, num_blocks* sizeof(element_type)), num_blocks); job(); // end the timer auto stop = std::chrono::high_resolution_clock::now(); // record the latency is micro second auto duration = std::chrono::duration_cast(stop - start); // start the timer of the second job, this involve the copying of output 0 // from one FPGA into another FPGA and it should use P2P. // the output1 is coming from host auto start2 = std::chrono::high_resolution_clock::now(); // user-managed auto job2 = dev1->launch(interface_test_id, tapasco::makeWrappedPointer( output0, num_blocks* sizeof(element_type)),tapasco::makeWrappedPointer( output1, num_blocks* sizeof(element_type)), num_blocks); job2(); // time the code auto stop2 = std::chrono::high_resolution_clock::now(); // record the latency auto duration2 = std::chrono::duration_cast(stop2 - start2); // now the third job start auto start3 = std::chrono::high_resolution_clock::now(); // the third job is going to copy the output 1 from the second FPGA // back to the first FPGA. It should use P2P. // output2 is from host // it use user managed auto job3 = dev0->launch(interface_test_id, tapasco::makeWrappedPointer( output1, num_blocks* sizeof(element_type)),tapasco::makeWrappedPointer( output2, num_blocks* sizeof(element_type)), num_blocks); job3(); // stop the timer auto stop3 = std::chrono::high_resolution_clock::now(); // record the latency auto duration3 = std::chrono::duration_cast(stop3 - start3); // std::cerr<<(num_blocks * sizeof(element_type))/PAGE_SIZE<<","<(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); auto *output0 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); init_array(input0,num_blocks); // time the code auto start = std::chrono::high_resolution_clock::now(); // use on demand to copy array auto job = tapasco->launch(interface_test_id,tapasco::makeVirtualAddress(input0), tapasco::makeVirtualAddress(output0),num_blocks); job(); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); // std::cerr<<(num_blocks * sizeof(element_type))/PAGE_SIZE<<","<(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); element_type *output0 = static_cast(aligned_alloc(PAGE_SIZE, num_blocks * sizeof(element_type))); init_array(input0,num_blocks); // time the code auto start = std::chrono::high_resolution_clock::now(); // use user managed auto job = tapasco->launch(interface_test_id, tapasco::makeWrappedPointer( input0, num_blocks* sizeof(element_type)),tapasco::makeWrappedPointer( output0, num_blocks* sizeof(element_type)), num_blocks); job(); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); // std::cerr<<(num_blocks * sizeof(element_type))/PAGE_SIZE<<","< v) { return std::all_of(v.begin(), v.end(), [](int i) {return i == 0;}); } int main(int argc, char **argv) { // Use C API to get number of TaPaSCo devices if(argc != 2) { std::cout << "Usage: ./prog num_block (i.e. number of element)" << std::endl; return 1; } tapasco::TLKM *tlkm = tapasco::tapasco_tlkm_new(); int num_devices = tapasco::tapasco_tlkm_device_len(tlkm); std::cout << "Found " << num_devices << " TaPaSCo devices." << std::endl; const numblock_type num_blocks = std::atoll(argv[1]); std::cout << "The num_blocks is: " << num_blocks << std::endl; // initialize TaPaSCo devices std::vector devices; for (int i = 0; i < num_devices; ++i) { // pass access types and device ID to constructor (not required if only one device is used) devices.push_back(new tapasco::Tapasco(tapasco::tlkm_access::TlkmAccessExclusive, i)); } tapasco::PEId interface_test_id; std::vector interface_test_pe_count; // num_devices=1; for (auto d : devices) { try { auto id = d->get_pe_id("esa.cs.tu-darmstadt.de:hls:interface_test_2:1.0"); interface_test_pe_count.push_back(d->kernel_pe_count(id)); interface_test_id = id; } catch (...) { interface_test_pe_count.push_back(0); } } // No PE found to run any tests if (vector_all_zero(interface_test_pe_count)) { std::cout << "ERROR: Need at least one interface_test instance to run." << std::endl; exit(1); } // Run tests on all devices if PEs are available std::cout << "Run single-FPGA host example..." << std::endl; // std::cerr << "# of pages, number of run, latency" << std::endl; for (int i = 0; i < num_devices; ++i) { if (interface_test_pe_count[i]) { if (run_interface_test(devices[i], interface_test_id, num_blocks)) { std::cout << "An error occurred while running the interface_test example, exiting..." << std::endl; exit(1); } else { std::cout << "Completed interface_test example successfully!" << std::endl; } } } std::cout << "Run p2p example..." << std::endl; // std::cerr << "# of pages, number of run, dev0 latency, dev1 latency, dev 0 run latency" << std::endl; if (run_p2p(devices[0], devices[1], interface_test_id,num_blocks)) { std::cout << "An error occurred while running the pipeline example, exiting..." << std::endl; exit(1); } else { std::cout << "Completed the p2p example successfully!" << std::endl; } return 0; }