From 7ee82392c96c38ff8025d319b509b0cfc973ff2d Mon Sep 17 00:00:00 2001 From: Ali Marafi Date: Tue, 22 Mar 2022 17:04:03 +0100 Subject: [PATCH] Machine_MPI_Machine_CUDA_GPU --- .../CUDACore/test/mpiCudaGeneric.cu | 1980 ++++++++--------- 1 file changed, 967 insertions(+), 1013 deletions(-) diff --git a/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu b/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu index 88b2703a34cfc..c34256e2986be 100644 --- a/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu +++ b/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu @@ -42,80 +42,82 @@ bool help = false; /////////////////////////////////////////// Data Structure ///////////////////////////////////// //Data For User's Choises Only -struct UserChoises{ - // unsigned int sizeVector; - unsigned int sizeVectorBytes; //Defualt vectors element float - // unsigned int average; - unsigned int extra; - // unsigned int task; - // unsigned int partsToRun; - int root; - // int numberProcess; - int averageVectorSend; - std::vector partsToRunVector; //vector for user's choice of part. +struct UserChoises { + // unsigned int sizeVector; + unsigned int sizeVectorBytes; //Defualt vectors element float + // unsigned int average; + unsigned int extra; + // unsigned int task; + // unsigned int partsToRun; + int root; + // int numberProcess; + int averageVectorSend; + std::vector partsToRunVector; //vector for user's choice of part. }; //Data For MPI Only struct MPIData { - int num_procs; - int rank; - - std::pair workSplit; - float* mVect1; //declare vector 1. - float* mVect2; //declare vector 2. - float* mVect3; //declare vector fulled only by root to get result from workers. - float* mVectChecking; //declare vector to verify the results from each process. - float* mVectWorker1; //declare vector 1 for workers only. - float* mVectWorker2; //declare vector 2 for workers only. - float* mVectWorker3; //declare vector 2 for workers only. - std::vector displacement; //declare vector for selecting location of each element to be sent. - std::vector numberToSend; - }; + int num_procs; + int rank; + + std::pair workSplit; + float *mVect1; //declare vector 1. + float *mVect2; //declare vector 2. + float *mVect3; //declare vector fulled only by root to get result from workers. + float *mVectChecking; //declare vector to verify the results from each process. + float *mVectWorker1; //declare vector 1 for workers only. + float *mVectWorker2; //declare vector 2 for workers only. + float *mVectWorker3; //declare vector 2 for workers only. + std::vector displacement; //declare vector for selecting location of each element to be sent. + std::vector numberToSend; +}; //Data For Cuda Only struct Pointers { - float *vect1; //pointers only for Host - float *vect2; - float *vect3; - - float *dVect1; //pointers only for device - float *dVect2; - float *dVect3; - - float *dVect1Extra;//pointers only for device - float *dVect2Extra; - float *dVect3Extra; - }; + float *vect1; //pointers only for Host + float *vect2; + float *vect3; + + float *dVect1; //pointers only for device + float *dVect2; + float *dVect3; + + float *dVect1Extra; //pointers only for device + float *dVect2Extra; + float *dVect3Extra; +}; //Data for Time Measurements Only struct Timing { - int partChosen; - int unitChoice; - double inputPreparationRoot[2]; // get time points from start and end on Root Side - double inputPreparationHost[2]; // get time points from start and end on Host Side. - double operationOnDeviceByHost[2]; //get time duration in Device with Host perspective. - - double outputPreparationRoot[2]; - double outputPreparationHost[2]; - - std::vector timeInputPreparationRoot; //Save the Duration time. - std::vector timeInputPreparationHost; - std::vector timeOperationOnDeviceByRootHost; - std::vector timeOutputPreparationRoot; - std::vector timeOutputPreparationHost; - - cudaEvent_t start, stop; //get time points in Device. - float operationOnDeviceByDevice = 0; //get time duration in Device with device perspective. - std::vector operationOnDeviceByDeviceAcc; //get accumulating time duration in Device with device perspective. - std::vector averageResults; ///declare vector for getting average calcualtion for Hosts and device to Root. - }; + int partChosen; + int unitChoice; + double inputPreparationRoot[2]; // get time points from start and end on Root Side + double inputPreparationHost[2]; // get time points from start and end on Host Side. + double operationOnDeviceByHost[2]; //get time duration in Device with Host perspective. + + double outputPreparationRoot[2]; + double outputPreparationHost[2]; + + std::vector timeInputPreparationRoot; //Save the Duration time. + std::vector timeInputPreparationHost; + std::vector timeOperationOnDeviceByRootHost; + std::vector timeOutputPreparationRoot; + std::vector timeOutputPreparationHost; + + cudaEvent_t start, stop; //get time points in Device. + float operationOnDeviceByDevice = 0; //get time duration in Device with device perspective. + std::vector operationOnDeviceByDeviceAcc; //get accumulating time duration in Device with device perspective. + std::vector averageResults; ///declare vector for getting average calcualtion for Hosts and device to Root. +}; ////////////////////////////////////////////////////////////////////////////////////////////////// const std::vector chooseFunction(int toInteger); std::pair splitProcess(int works, int numberOfProcess); const std::vector numberDataSend(int numberOfProcess, std::pair splitWorks); -void setupMPIAndVectors(MPIData &mpiData, UserChoises &user); //initialize communicator environment for MPI and Resize Vectors with Generating Random numbers. -void setupTime(Timing &timing, UserChoises &user); //Resizing Vectors of Time. +void setupMPIAndVectors( + MPIData &mpiData, + UserChoises &user); //initialize communicator environment for MPI and Resize Vectors with Generating Random numbers. +void setupTime(Timing &timing, UserChoises &user); //Resizing Vectors of Time. void calculateTimeDuration(Timing &timing, int i, int &root); void addVectorsHost(float *vect1, float *vect2, float *vect3); void cleanBuffer(float *vect); @@ -123,9 +125,9 @@ bool checkingResultsPrintout(float *vectCpu, float *vectGpu); void calculateAverageDeviation(Timing &timing, int averg, int &root); bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank); -Timing blockSendPart1(MPIData& mpidata, Timing &timing, Pointers &pointer, UserChoises &user); -Timing blockSendPart2(MPIData& mpiData, Timing &timing, Pointers &pointer, UserChoises &user); -Timing blockSendPart3(MPIData& mpiData, Timing &timing, Pointers &pointer, UserChoises &user); +Timing blockSendPart1(MPIData &mpidata, Timing &timing, Pointers &pointer, UserChoises &user); +Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user); +Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user); void printTable(std::vector &timing, bool standerDeviationPrint); int getNumberofDigits(double number); @@ -134,170 +136,149 @@ void printResultEach(std::vector &timing, int type, bool standerDeviatio bool saveToFile(const std::string &name, const Timing &timing); void printHelp(void); -int main(int argc, char *argv[]) -{ - cms::cudatest::requireDevices(); - int c; //to get parameters from user. - - UserChoises user; //Setup Uuser's input variables - user.extra = 2; - user.root = 0; - user.averageVectorSend = 8; - - while ((c = getopt(argc, argv, "s:a:t:p:qfh")) != -1) { - switch (c) { - case 's': - try { - sizeVector = std::stoll(optarg, nullptr, 0); - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 'a': - try { - average = std::stoll(optarg, nullptr, 0); - - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 't': - try { - task = std::stoll(optarg, nullptr, 0); - //std::cout << "\nNumber of repeated Task is " << task << std::endl; - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 'p': - try { - partsToRun = std::stoll(optarg, nullptr, 0); - user.partsToRunVector = chooseFunction(partsToRun); - //std::cout << "\nyou have chosen Part "; - for (unsigned int j = 0; j < user.partsToRunVector.size(); ++j) { - std::cout << user.partsToRunVector[j] << " ,"; - } - std::cout << "\n"; - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 'q': - try { - printStander = true; - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 'f': - try { - saveFile = true; - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - case 'h': - try { - help = true; - } catch (std::exception &err) { - std::cout << "\n\tError Must be integer Argument!"; - std::cout << "\n\t" << err.what() << std::endl; - return 0; - } - break; - - default: - abort(); +int main(int argc, char *argv[]) { + cms::cudatest::requireDevices(); + int c; //to get parameters from user. + + UserChoises user; //Setup Uuser's input variables + user.extra = 2; + user.root = 0; + user.averageVectorSend = 8; + + while ((c = getopt(argc, argv, "s:a:t:p:qfh")) != -1) { + switch (c) { + case 's': + try { + sizeVector = std::stoll(optarg, nullptr, 0); + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 'a': + try { + average = std::stoll(optarg, nullptr, 0); + + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 't': + try { + task = std::stoll(optarg, nullptr, 0); + //std::cout << "\nNumber of repeated Task is " << task << std::endl; + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 'p': + try { + partsToRun = std::stoll(optarg, nullptr, 0); + user.partsToRunVector = chooseFunction(partsToRun); + //std::cout << "\nyou have chosen Part "; + for (unsigned int j = 0; j < user.partsToRunVector.size(); ++j) { + std::cout << user.partsToRunVector[j] << " ,"; + } + std::cout << "\n"; + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 'q': + try { + printStander = true; + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 'f': + try { + saveFile = true; + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; + } + break; + case 'h': + try { + help = true; + } catch (std::exception &err) { + std::cout << "\n\tError Must be integer Argument!"; + std::cout << "\n\t" << err.what() << std::endl; + return 0; } - } - - - MPIData mpiData; - Timing timing; - Timing resetTime; - Pointers pointer; - timing.unitChoice = 1000000; //1M - resetTime.unitChoice = 1000000; //1M - - std::vector allTiming; - allTiming.resize(user.partsToRunVector.size()); - - MPI_Init(&argc, &argv); //initialize communicator environment. - - if(help) - { - printHelp(); - MPI::Finalize(); - exit(0); - } - setupMPIAndVectors(mpiData, user); - - setupTime(timing, user); - setupTime(resetTime, user); - - - - for (long unsigned int i = 0; i < user.partsToRunVector.size(); ++i) { - if (user.partsToRunVector[i] == 1) { - //setupTime(allTiming[i], user); - //blockSendPart1(mpiData, allTiming[i], pointer, user); - allTiming[i] = blockSendPart1(mpiData, timing, pointer, user); - timing = resetTime; - - } else if (user.partsToRunVector[i] == 2) { - //setupTime(allTiming[i], user); - //blockSendPart2(mpiData, allTiming[i], pointer, user); - allTiming[i] = blockSendPart2(mpiData, timing, pointer, user); - timing = resetTime; - - } else if (user.partsToRunVector[i] == 3) { - allTiming[i] = blockSendPart3(mpiData, timing, pointer, user); - timing = resetTime; - // } else if (user.partsToRunVector[i] == 4) { - // allTiming[i] = cudaTimePart4(timing, vect, dvect, size); - - // } else if (user.partsToRunVector[i] == 5) { - // allTiming[i] = cudaTimePart5(timing, vect, dvect, size); - - } else { - std::cout << "\n\n\tError the User has not chose any number of Function!\n"; break; - } - } - - if(!mpiData.rank) - printTable(allTiming, printStander); - - - - - - - - + default: + abort(); + } + } + MPIData mpiData; + Timing timing; + Timing resetTime; + Pointers pointer; + timing.unitChoice = 1000000; //1M + resetTime.unitChoice = 1000000; //1M + std::vector allTiming; + allTiming.resize(user.partsToRunVector.size()); + MPI_Init(&argc, &argv); //initialize communicator environment. + if (help) { + printHelp(); + MPI::Finalize(); + exit(0); + } + setupMPIAndVectors(mpiData, user); + + setupTime(timing, user); + setupTime(resetTime, user); + + for (long unsigned int i = 0; i < user.partsToRunVector.size(); ++i) { + if (user.partsToRunVector[i] == 1) { + //setupTime(allTiming[i], user); + //blockSendPart1(mpiData, allTiming[i], pointer, user); + allTiming[i] = blockSendPart1(mpiData, timing, pointer, user); + timing = resetTime; + + } else if (user.partsToRunVector[i] == 2) { + //setupTime(allTiming[i], user); + //blockSendPart2(mpiData, allTiming[i], pointer, user); + allTiming[i] = blockSendPart2(mpiData, timing, pointer, user); + timing = resetTime; + + } else if (user.partsToRunVector[i] == 3) { + allTiming[i] = blockSendPart3(mpiData, timing, pointer, user); + timing = resetTime; + // } else if (user.partsToRunVector[i] == 4) { + // allTiming[i] = cudaTimePart4(timing, vect, dvect, size); + // } else if (user.partsToRunVector[i] == 5) { + // allTiming[i] = cudaTimePart5(timing, vect, dvect, size); + } else { + std::cout << "\n\n\tError the User has not chose any number of Function!\n"; + break; + } + } + if (!mpiData.rank) + printTable(allTiming, printStander); - MPI::Finalize(); - return 0; + MPI::Finalize(); + return 0; } const std::vector chooseFunction(int toInteger) { std::vector digits(0, 0); @@ -319,16 +300,15 @@ const std::vector chooseFunction(int toInteger) { } std::pair splitProcess(int works, int numberOfProcess) { - std::pair Return{0, 0}; - if (numberOfProcess > 1 && numberOfProcess <= works) { - Return.first = works / (numberOfProcess - 1); //number of cycle for each process. - Return.second = works % (numberOfProcess - 1); //extra cycle for process. - } else { - std::cout << "\tError Either No worker are found OR Number Processes Larger than Length!!!\n"; - } - - return Return; - + std::pair Return{0, 0}; + if (numberOfProcess > 1 && numberOfProcess <= works) { + Return.first = works / (numberOfProcess - 1); //number of cycle for each process. + Return.second = works % (numberOfProcess - 1); //extra cycle for process. + } else { + std::cout << "\tError Either No worker are found OR Number Processes Larger than Length!!!\n"; + } + + return Return; } const std::vector numberDataSend(int numberOfProcess, std::pair splitWorks) { std::vector dataSend(numberOfProcess, splitWorks.first); @@ -341,7 +321,7 @@ const std::vector numberDataSend(int numberOfProcess, std::pair s } const std::vector displacmentData(int numberOfProcess, std::pair splitWorks, - const std::vector& numberDataSend) { + const std::vector &numberDataSend) { std::vector displacment(numberOfProcess, splitWorks.first); displacment[0] = 0; @@ -354,31 +334,27 @@ const std::vector displacmentData(int numberOfProcess, return displacment; } void randomGenerator(float *vect) { - std::random_device rand; - std::default_random_engine gener(rand()); - std::uniform_real_distribution<> dis(0., 1.); - for (unsigned int i = 0; i < sizeVector; ++i) { - vect[i] = dis(gener); - } - + std::random_device rand; + std::default_random_engine gener(rand()); + std::uniform_real_distribution<> dis(0., 1.); + for (unsigned int i = 0; i < sizeVector; ++i) { + vect[i] = dis(gener); + } } -void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) -{ - +void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) { mpiData.num_procs = MPI::COMM_WORLD.Get_size(); //get total size of processes. mpiData.rank = MPI::COMM_WORLD.Get_rank(); //get each process number. user.sizeVectorBytes = sizeVector * sizeof(float); //get size in byte for vectors. - mpiData.mVect1 = (float*) malloc(user.sizeVectorBytes); //initialize size. - mpiData.mVect2 = (float*) malloc(user.sizeVectorBytes); - mpiData.mVect3 = (float*) malloc(user.sizeVectorBytes); - mpiData.mVectChecking = (float*) malloc(user.sizeVectorBytes); - - //mpiData.mVectWorker1 = (float*) malloc(user.sizeVectorBytes); - //mpiData.mVectWorker2 = (float*) malloc(user.sizeVectorBytes); - mpiData.mVectWorker3 = (float*) malloc(user.sizeVectorBytes); + mpiData.mVect1 = (float *)malloc(user.sizeVectorBytes); //initialize size. + mpiData.mVect2 = (float *)malloc(user.sizeVectorBytes); + mpiData.mVect3 = (float *)malloc(user.sizeVectorBytes); + mpiData.mVectChecking = (float *)malloc(user.sizeVectorBytes); + //mpiData.mVectWorker1 = (float*) malloc(user.sizeVectorBytes); + //mpiData.mVectWorker2 = (float*) malloc(user.sizeVectorBytes); + mpiData.mVectWorker3 = (float *)malloc(user.sizeVectorBytes); mpiData.workSplit = splitProcess(sizeVector, mpiData.num_procs); @@ -392,9 +368,8 @@ void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) // mpiData.mVectWorker1.resize(mpiData.numberToSend[mpiData.rank]); //Resizing each process with appropriate Receiving Data. // mpiData.mVectWorker2.resize(mpiData.numberToSend[mpiData.rank]); - mpiData.mVectWorker1 = (float*) malloc(mpiData.numberToSend[mpiData.rank]*sizeof(float)); - mpiData.mVectWorker2 = (float*) malloc(mpiData.numberToSend[mpiData.rank]*sizeof(float)); - + mpiData.mVectWorker1 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float)); + mpiData.mVectWorker2 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float)); if (!mpiData.rank) //Only for root { @@ -403,45 +378,42 @@ void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) std::cout << "\n\tNumber of Processes " << mpiData.num_procs << std::endl; std::cout << "\tNumber of workSplit First " << mpiData.workSplit.first << std::endl; std::cout << "\tNumber of workSplit Second " << mpiData.workSplit.second << std::endl; - std::cout << "\tTotal size of a Buffer " << user.sizeVectorBytes << " B"<< std::endl; + std::cout << "\tTotal size of a Buffer " << user.sizeVectorBytes << " B" << std::endl; } - } -void setupTime(Timing &timing, UserChoises &user) -{ -//Setup Verctors for Taking Average and Standard deviation +void setupTime(Timing &timing, UserChoises &user) { + //Setup Verctors for Taking Average and Standard deviation timing.timeInputPreparationRoot.resize(average + user.extra); //extra for saving the average. - timing.timeInputPreparationHost.resize(average + user.extra); + timing.timeInputPreparationHost.resize(average + user.extra); timing.timeOperationOnDeviceByRootHost.resize(average + user.extra); timing.timeOutputPreparationRoot.resize(average + user.extra); timing.timeOutputPreparationHost.resize(average + user.extra); timing.operationOnDeviceByDeviceAcc.resize(average + user.extra); timing.averageResults.resize(user.averageVectorSend); - } void calculateTimeDuration(Timing &timing, int i, int &root) { - - if(!root) - { - timing.timeInputPreparationRoot[i] = (timing.inputPreparationRoot[1] - timing.inputPreparationRoot[0]); //getting the time in microseconds - timing.timeOperationOnDeviceByRootHost[i] =(timing.outputPreparationRoot[0] - timing.inputPreparationRoot[1]); + if (!root) { + timing.timeInputPreparationRoot[i] = + (timing.inputPreparationRoot[1] - timing.inputPreparationRoot[0]); //getting the time in microseconds + timing.timeOperationOnDeviceByRootHost[i] = (timing.outputPreparationRoot[0] - timing.inputPreparationRoot[1]); timing.timeOutputPreparationRoot[i] = (timing.outputPreparationRoot[1] - timing.outputPreparationRoot[0]); - } - else - { + } else { timing.timeInputPreparationHost[i] = (timing.inputPreparationHost[1] - timing.inputPreparationHost[0]); - timing.timeOperationOnDeviceByRootHost[i] = (timing.operationOnDeviceByHost[1]- timing.operationOnDeviceByHost[0]); //time taking for Device operation with respect of Host. - cudaEventElapsedTime(&timing.operationOnDeviceByDevice,timing.start,timing.stop); //get the time elapse in Device operation with device perspective. + timing.timeOperationOnDeviceByRootHost[i] = + (timing.operationOnDeviceByHost[1] - + timing.operationOnDeviceByHost[0]); //time taking for Device operation with respect of Host. + cudaEventElapsedTime(&timing.operationOnDeviceByDevice, + timing.start, + timing.stop); //get the time elapse in Device operation with device perspective. timing.operationOnDeviceByDeviceAcc[i] = (timing.operationOnDeviceByDevice * 1000); timing.timeOutputPreparationHost[i] = (timing.outputPreparationHost[1] - timing.outputPreparationHost[0]); } - } void addVectorsHost(float *vect1, float *vect2, float *vect3) { - for (unsigned int i = 0; i < sizeVector; ++i) { - vect3[i] = vect2[i] + vect1[i]; - } + for (unsigned int i = 0; i < sizeVector; ++i) { + vect3[i] = vect2[i] + vect1[i]; + } } void cleanBuffer(float *vect) { for (unsigned int i = 0; i < sizeVector; ++i) { @@ -451,7 +423,7 @@ void cleanBuffer(float *vect) { bool checkingResultsPrintout(float *vectCpu, float *vectGpu) { float percent{0.0}; float totalError{0.0}; - + for (unsigned int j = 0; j < sizeVector; j++) { percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100; totalError += percent; @@ -475,52 +447,42 @@ bool checkingResultsPrintout(float *vectCpu, float *vectGpu) { } void calculateAverageDeviation(Timing &timing, int averg, int &root) { //Average - for (int i = 0; i < averg; ++i) - { - if(!root) - { + for (int i = 0; i < averg; ++i) { + if (!root) { timing.timeInputPreparationRoot[averg] += timing.timeInputPreparationRoot[i]; timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i]; timing.timeOutputPreparationRoot[averg] += timing.timeOutputPreparationRoot[i]; - } - else - { + } else { timing.timeInputPreparationHost[averg] += timing.timeInputPreparationHost[i]; timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i]; timing.timeOutputPreparationHost[averg] += timing.timeOutputPreparationHost[i]; timing.operationOnDeviceByDeviceAcc[averg] += timing.operationOnDeviceByDeviceAcc[i]; } - } - if(!root) - { + if (!root) { timing.timeInputPreparationRoot[averg] = timing.timeInputPreparationRoot[averg] / averg; timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg; - + timing.timeOutputPreparationRoot[averg] = timing.timeOutputPreparationRoot[averg] / averg; - - } - else - { + + } else { timing.timeInputPreparationHost[averg] = timing.timeInputPreparationHost[averg] / averg; - + timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg; - + timing.timeOutputPreparationHost[averg] = timing.timeOutputPreparationHost[averg] / averg; - + timing.operationOnDeviceByDeviceAcc[averg] = (double)timing.operationOnDeviceByDeviceAcc[averg] / averg; - } - //Standard deviation - for (int i = 0; i < averg; ++i) - { - if(!root) - { - timing.timeInputPreparationRoot[i] -= timing.timeInputPreparationRoot[averg]; //Take the different. - timing.timeInputPreparationRoot[i] = timing.timeInputPreparationRoot[i] * timing.timeInputPreparationRoot[i]; // Square it. - timing.timeInputPreparationRoot[averg + 1] += timing.timeInputPreparationRoot[i]; //add them togather. averg+1 is location of the Deviation + for (int i = 0; i < averg; ++i) { + if (!root) { + timing.timeInputPreparationRoot[i] -= timing.timeInputPreparationRoot[averg]; //Take the different. + timing.timeInputPreparationRoot[i] = + timing.timeInputPreparationRoot[i] * timing.timeInputPreparationRoot[i]; // Square it. + timing.timeInputPreparationRoot[averg + 1] += + timing.timeInputPreparationRoot[i]; //add them togather. averg+1 is location of the Deviation timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg]; timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i]; @@ -529,12 +491,12 @@ void calculateAverageDeviation(Timing &timing, int averg, int &root) { timing.timeOutputPreparationRoot[i] -= timing.timeOutputPreparationRoot[averg]; timing.timeOutputPreparationRoot[i] *= timing.timeOutputPreparationRoot[i]; timing.timeOutputPreparationRoot[averg + 1] += timing.timeOutputPreparationRoot[i]; - } - else - { - timing.timeInputPreparationHost[i] -= timing.timeInputPreparationHost[averg]; //Take the different. - timing.timeInputPreparationHost[i] = timing.timeInputPreparationHost[i] * timing.timeInputPreparationHost[i]; // Square it. - timing.timeInputPreparationHost[averg + 1] += timing.timeInputPreparationHost[i]; //add them togather. averg+1 is location of the Deviation + } else { + timing.timeInputPreparationHost[i] -= timing.timeInputPreparationHost[averg]; //Take the different. + timing.timeInputPreparationHost[i] = + timing.timeInputPreparationHost[i] * timing.timeInputPreparationHost[i]; // Square it. + timing.timeInputPreparationHost[averg + 1] += + timing.timeInputPreparationHost[i]; //add them togather. averg+1 is location of the Deviation timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg]; timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i]; @@ -550,70 +512,53 @@ void calculateAverageDeviation(Timing &timing, int averg, int &root) { } } - if(!root) - { + if (!root) { timing.timeInputPreparationRoot[averg + 1] = timing.timeInputPreparationRoot[averg + 1] / averg; timing.timeInputPreparationRoot[averg + 1] = sqrt(timing.timeInputPreparationRoot[averg + 1]); - timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg; timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]); - timing.timeOutputPreparationRoot[averg + 1] = timing.timeOutputPreparationRoot[averg + 1] / averg; timing.timeOutputPreparationRoot[averg + 1] = sqrt(timing.timeOutputPreparationRoot[averg + 1]); - - } - else - { - timing.timeInputPreparationHost[averg + 1] = timing.timeInputPreparationHost[averg + 1] / averg; //*1000000 + + } else { + timing.timeInputPreparationHost[averg + 1] = timing.timeInputPreparationHost[averg + 1] / averg; //*1000000 timing.timeInputPreparationHost[averg + 1] = sqrt(timing.timeInputPreparationHost[averg + 1]); - timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg; timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]); - timing.timeOutputPreparationHost[averg + 1] = timing.timeOutputPreparationHost[averg + 1] / averg; timing.timeOutputPreparationHost[averg + 1] = sqrt(timing.timeOutputPreparationHost[averg + 1]); - timing.operationOnDeviceByDeviceAcc[averg + 1] = (double)timing.operationOnDeviceByDeviceAcc[averg + 1] / averg; timing.operationOnDeviceByDeviceAcc[averg + 1] = sqrt(timing.operationOnDeviceByDeviceAcc[averg + 1]); - } - if(!root) - { - timing.timeInputPreparationRoot[averg]*= timing.unitChoice; - timing.timeOperationOnDeviceByRootHost[averg]*= timing.unitChoice; - timing.timeOutputPreparationRoot[averg]*= timing.unitChoice; + if (!root) { + timing.timeInputPreparationRoot[averg] *= timing.unitChoice; + timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice; + timing.timeOutputPreparationRoot[averg] *= timing.unitChoice; timing.timeInputPreparationRoot[averg + 1] *= timing.unitChoice; timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice; timing.timeOutputPreparationRoot[averg + 1] *= timing.unitChoice; - } - else - { - timing.timeInputPreparationHost[averg]*= timing.unitChoice; - timing.timeOperationOnDeviceByRootHost[averg]*= timing.unitChoice; - timing.timeOutputPreparationHost[averg]*= timing.unitChoice; + } else { + timing.timeInputPreparationHost[averg] *= timing.unitChoice; + timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice; + timing.timeOutputPreparationHost[averg] *= timing.unitChoice; timing.timeInputPreparationHost[averg + 1] *= timing.unitChoice; - timing.timeOperationOnDeviceByRootHost[averg + 1]*= timing.unitChoice; + timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice; timing.timeOutputPreparationHost[averg + 1] *= timing.unitChoice; } - - } -bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) -{ - if(rank) - { - +bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) { + if (rank) { timing.averageResults[0] = timing.timeInputPreparationHost[average]; - timing.averageResults[1] = timing.timeInputPreparationHost[average + 1]; //Stander Deviation + timing.averageResults[1] = timing.timeInputPreparationHost[average + 1]; //Stander Deviation timing.averageResults[2] = timing.timeOperationOnDeviceByRootHost[average]; timing.averageResults[3] = timing.timeOperationOnDeviceByRootHost[average + 1]; @@ -626,756 +571,765 @@ bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) MPI_Send(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, user.root, 0, MPI_COMM_WORLD); - - } - else if(!rank) - { + } else if (!rank) { MPI_Recv(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } return true; } -Timing blockSendPart1(MPIData& mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { +Timing blockSendPart1(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements timing.partChosen = 1; - if(mpiData.rank) //Only for Workers + if (mpiData.rank) //Only for Workers { - cudaCheck(cudaMalloc((void **)&pointer.dVect1, user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. + cudaCheck(cudaMalloc((void **)&pointer.dVect1, + user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes)); cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes)); } - ///////////////////////////// Start of Average //////////////////////// - for (int a = 0; a <= average; ++a) + ///////////////////////////// Start of Average //////////////////////// + for (int a = 0; a <= average; ++a) { + if (!mpiData.rank) //Only for root { - - if (!mpiData.rank) //Only for root - { - ////////////////////////////////// Input Prepation for Root ////////////////////////////////// - timing.inputPreparationRoot[0] = MPI_Wtime(); - for (int i = 1; i < mpiData.num_procs; ++i) { - MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD); //Tag is 0 - MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); - } - timing.inputPreparationRoot[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (mpiData.rank) //Only for Workers - { - ////////////////////////////////// Input Prepation for Host ////////////////////////////////// - MPI_Probe(user.root, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.inputPreparationHost[0] = MPI_Wtime(); - MPI_Recv(&mpiData.mVectWorker1[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(&mpiData.mVectWorker2[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - - cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice)); //copy random vector from host to device. - cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); - - timing.inputPreparationHost[1] = MPI_Wtime(); - /////////////////////////////////////////////////////////////////////////////////////// - - cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. - cudaCheck(cudaEventCreate(&timing.stop)); - - ///////////////////////////// Operation on Device with respect of Host ////////////////// - - int threads = 512; //arbitrary number. - int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. - blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. - - ////////////////////////// CAll Device Kernel ////////////////////////////////// - cudaCheck(cudaEventRecord(timing.start)); - timing.operationOnDeviceByHost[0] = MPI_Wtime(); - - addVectorsGpu<<>>(pointer.dVect1, pointer.dVect2, pointer.dVect3, sizeVector, task); //call device function to add two vectors and save into vect3Gpu. - - cudaCheck(cudaGetLastError()); - cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaEventRecord(timing.stop)); - - timing.operationOnDeviceByHost[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// - timing.outputPreparationHost[0] = MPI_Wtime(); - cudaCheck(cudaMemcpy(mpiData.mVectWorker3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this - - MPI_Send(&mpiData.mVectWorker3[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD); //Tag is 0 - timing.outputPreparationHost[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (!mpiData.rank) //Only for root - { - /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// - MPI_Probe(MPI_ANY_SOURCE,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.outputPreparationRoot[0] = MPI_Wtime(); - //MPI probe - for (int i = 1; i < mpiData.num_procs; i++) { - - MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } - timing.outputPreparationRoot[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (a>0) - calculateTimeDuration(timing, a-1, mpiData.rank); - - if(mpiData.rank) - { - cudaCheck(cudaEventDestroy(timing.start)); - cudaCheck(cudaEventDestroy(timing.stop)); + ////////////////////////////////// Input Prepation for Root ////////////////////////////////// + timing.inputPreparationRoot[0] = MPI_Wtime(); + for (int i = 1; i < mpiData.num_procs; ++i) { + MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD); //Tag is 0 + MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); } + timing.inputPreparationRoot[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////////// } - ///////////////////////////// End of Average //////////////////////// - if(mpiData.rank) + + if (mpiData.rank) //Only for Workers { - cudaCheck(cudaFree(pointer.dVect1)); - cudaCheck(cudaFree(pointer.dVect2)); - cudaCheck(cudaFree(pointer.dVect3)); + ////////////////////////////////// Input Prepation for Host ////////////////////////////////// + MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.inputPreparationHost[0] = MPI_Wtime(); + MPI_Recv(&mpiData.mVectWorker1[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Recv(&mpiData.mVectWorker2[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + + cudaCheck(cudaMemcpy(pointer.dVect1, + mpiData.mVectWorker1, + user.sizeVectorBytes, + cudaMemcpyHostToDevice)); //copy random vector from host to device. + cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); + + timing.inputPreparationHost[1] = MPI_Wtime(); + /////////////////////////////////////////////////////////////////////////////////////// + + cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. + cudaCheck(cudaEventCreate(&timing.stop)); + + ///////////////////////////// Operation on Device with respect of Host ////////////////// + + int threads = 512; //arbitrary number. + int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. + blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. + + ////////////////////////// CAll Device Kernel ////////////////////////////////// + cudaCheck(cudaEventRecord(timing.start)); + timing.operationOnDeviceByHost[0] = MPI_Wtime(); + + addVectorsGpu<<>>(pointer.dVect1, + pointer.dVect2, + pointer.dVect3, + sizeVector, + task); //call device function to add two vectors and save into vect3Gpu. + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaEventRecord(timing.stop)); + + timing.operationOnDeviceByHost[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// + timing.outputPreparationHost[0] = MPI_Wtime(); + cudaCheck(cudaMemcpy( + mpiData.mVectWorker3, + pointer.dVect3, + user.sizeVectorBytes, + cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this + + MPI_Send(&mpiData.mVectWorker3[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD); //Tag is 0 + timing.outputPreparationHost[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// } - /// - bool test = 0; + if (!mpiData.rank) //Only for root { - addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. - test = checkingResultsPrintout(mpiData.mVectChecking, mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. - if(!test) - exit(-1); - } - - calculateAverageDeviation(timing,average, mpiData.rank); - test = sendAverageToRoot(timing, user, mpiData.rank); - if(test && !mpiData.rank){ - if(saveFile) - { - test = saveToFile("dataPart1", timing); - - if(test) - std::cout <<"Done Part " << timing.partChosen << " And File saved" << std::endl; - else - std::cout <<"Error Saving File!!" << std::endl; + /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// + MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.outputPreparationRoot[0] = MPI_Wtime(); + //MPI probe + for (int i = 1; i < mpiData.num_procs; i++) { + MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); } - std::cout <<"Done Part " << timing.partChosen << std::endl; + timing.outputPreparationRoot[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// } - return timing; + + if (a > 0) + calculateTimeDuration(timing, a - 1, mpiData.rank); + + if (mpiData.rank) { + cudaCheck(cudaEventDestroy(timing.start)); + cudaCheck(cudaEventDestroy(timing.stop)); + } + } + ///////////////////////////// End of Average //////////////////////// + if (mpiData.rank) { + cudaCheck(cudaFree(pointer.dVect1)); + cudaCheck(cudaFree(pointer.dVect2)); + cudaCheck(cudaFree(pointer.dVect3)); + } + /// + bool test = 0; + if (!mpiData.rank) //Only for root + { + addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. + test = checkingResultsPrintout(mpiData.mVectChecking, + mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. + if (!test) + exit(-1); } - Timing blockSendPart2(MPIData& mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { - cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements - timing.partChosen = 2; - - if(mpiData.rank) //Only for Workers - { - cudaCheck(cudaMallocHost((void **)&pointer.vect1, user.sizeVectorBytes)); //allocate Pinned memory on the Host. - cudaCheck(cudaMallocHost((void **)&pointer.vect2, user.sizeVectorBytes)); - cudaCheck(cudaMallocHost((void **)&pointer.vect3, user.sizeVectorBytes)); - cudaCheck(cudaMalloc((void **)&pointer.dVect1, user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. - cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes)); - cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes)); + calculateAverageDeviation(timing, average, mpiData.rank); + test = sendAverageToRoot(timing, user, mpiData.rank); + if (test && !mpiData.rank) { + if (saveFile) { + test = saveToFile("dataPart1", timing); + + if (test) + std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl; + else + std::cout << "Error Saving File!!" << std::endl; } - ///////////////////////////// Start of Average //////////////////////// - for (int a = 0; a <= average; ++a) - { - - if (!mpiData.rank) //Only for root - { - ////////////////////////////////// Input Prepation for Root ////////////////////////////////// - timing.inputPreparationRoot[0] = MPI_Wtime(); - for (int i = 1; i < mpiData.num_procs; ++i) { - MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD); //Tag is 0 - MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); - } - timing.inputPreparationRoot[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (mpiData.rank) //Only for Workers - { - ////////////////////////////////// Input Prepation for Host ////////////////////////////////// - MPI_Probe(user.root, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.inputPreparationHost[0] = MPI_Wtime(); - - MPI_Recv(&pointer.vect1[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(&pointer.vect2[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - - cudaCheck(cudaMemcpy(pointer.dVect1, pointer.vect1, user.sizeVectorBytes, cudaMemcpyHostToDevice)); //copy random vector from host to device. - cudaCheck(cudaMemcpy(pointer.dVect2, pointer.vect2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); - - timing.inputPreparationHost[1] = MPI_Wtime(); - /////////////////////////////////////////////////////////////////////////////////////// - - cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. - cudaCheck(cudaEventCreate(&timing.stop)); - - ///////////////////////////// Operation on Device with respect of Host ////////////////// - - int threads = 512; //arbitrary number. - int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. - blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. - - ////////////////////////// CAll Device Kernel ////////////////////////////////// - cudaCheck(cudaEventRecord(timing.start)); - timing.operationOnDeviceByHost[0] = MPI_Wtime(); - - addVectorsGpu<<>>(pointer.dVect1, pointer.dVect2, pointer.dVect3, sizeVector, task); //call device function to add two vectors and save into vect3Gpu. - - cudaCheck(cudaGetLastError()); - cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaEventRecord(timing.stop)); - - timing.operationOnDeviceByHost[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// - timing.outputPreparationHost[0] = MPI_Wtime(); - - cudaCheck(cudaMemcpy(pointer.vect3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this - - MPI_Send(&pointer.vect3[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD); //Tag is 0 - - timing.outputPreparationHost[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (!mpiData.rank) //Only for root - { - /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// - MPI_Probe(MPI_ANY_SOURCE,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.outputPreparationRoot[0] = MPI_Wtime(); - //MPI probe - for (int i = 1; i < mpiData.num_procs; i++) { - - MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } - timing.outputPreparationRoot[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (a>0) - calculateTimeDuration(timing, a-1, mpiData.rank); - - if(mpiData.rank) - { - - cudaCheck(cudaEventDestroy(timing.start)); - cudaCheck(cudaEventDestroy(timing.stop)); - } + std::cout << "Done Part " << timing.partChosen << std::endl; + } + return timing; +} - } - ///////////////////////////// End of Average //////////////////////// - if(mpiData.rank) - { - cudaCheck(cudaFreeHost(pointer.vect1)); - cudaCheck(cudaFreeHost(pointer.vect2)); - cudaCheck(cudaFreeHost(pointer.vect3)); - cudaCheck(cudaFree(pointer.dVect1)); - cudaCheck(cudaFree(pointer.dVect2)); - cudaCheck(cudaFree(pointer.dVect3)); +Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { + cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements + timing.partChosen = 2; + if (mpiData.rank) //Only for Workers + { + cudaCheck(cudaMallocHost((void **)&pointer.vect1, user.sizeVectorBytes)); //allocate Pinned memory on the Host. + cudaCheck(cudaMallocHost((void **)&pointer.vect2, user.sizeVectorBytes)); + cudaCheck(cudaMallocHost((void **)&pointer.vect3, user.sizeVectorBytes)); + cudaCheck(cudaMalloc((void **)&pointer.dVect1, + user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. + cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes)); + cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes)); + } + ///////////////////////////// Start of Average //////////////////////// + for (int a = 0; a <= average; ++a) { + if (!mpiData.rank) //Only for root + { + ////////////////////////////////// Input Prepation for Root ////////////////////////////////// + timing.inputPreparationRoot[0] = MPI_Wtime(); + for (int i = 1; i < mpiData.num_procs; ++i) { + MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD); //Tag is 0 + MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); } - - bool test = 0; - if (!mpiData.rank) //Only for root - { - addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. - test = checkingResultsPrintout(mpiData.mVectChecking, mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. - if(!test) - exit(-1); - } - - calculateAverageDeviation(timing,average, mpiData.rank); - test = sendAverageToRoot(timing, user, mpiData.rank); - if(test && !mpiData.rank){ - if(saveFile) - { - test = saveToFile("dataPart2", timing); - - if(test) - std::cout <<"Done Part " << timing.partChosen << " And File saved" << std::endl; - else - std::cout <<"Error Saving File!!" << std::endl; - } - std::cout <<"Done Part " << timing.partChosen << std::endl; + timing.inputPreparationRoot[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////////// + } + + if (mpiData.rank) //Only for Workers + { + ////////////////////////////////// Input Prepation for Host ////////////////////////////////// + MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.inputPreparationHost[0] = MPI_Wtime(); + + MPI_Recv(&pointer.vect1[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Recv(&pointer.vect2[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + + cudaCheck(cudaMemcpy(pointer.dVect1, + pointer.vect1, + user.sizeVectorBytes, + cudaMemcpyHostToDevice)); //copy random vector from host to device. + cudaCheck(cudaMemcpy(pointer.dVect2, pointer.vect2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); + + timing.inputPreparationHost[1] = MPI_Wtime(); + /////////////////////////////////////////////////////////////////////////////////////// + + cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. + cudaCheck(cudaEventCreate(&timing.stop)); + + ///////////////////////////// Operation on Device with respect of Host ////////////////// + + int threads = 512; //arbitrary number. + int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. + blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. + + ////////////////////////// CAll Device Kernel ////////////////////////////////// + cudaCheck(cudaEventRecord(timing.start)); + timing.operationOnDeviceByHost[0] = MPI_Wtime(); + + addVectorsGpu<<>>(pointer.dVect1, + pointer.dVect2, + pointer.dVect3, + sizeVector, + task); //call device function to add two vectors and save into vect3Gpu. + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaEventRecord(timing.stop)); + + timing.operationOnDeviceByHost[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// + timing.outputPreparationHost[0] = MPI_Wtime(); + + cudaCheck(cudaMemcpy( + pointer.vect3, + pointer.dVect3, + user.sizeVectorBytes, + cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this + + MPI_Send(&pointer.vect3[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD); //Tag is 0 + + timing.outputPreparationHost[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// + } + + if (!mpiData.rank) //Only for root + { + /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// + MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.outputPreparationRoot[0] = MPI_Wtime(); + //MPI probe + for (int i = 1; i < mpiData.num_procs; i++) { + MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); } - return timing; + timing.outputPreparationRoot[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// + } + + if (a > 0) + calculateTimeDuration(timing, a - 1, mpiData.rank); + + if (mpiData.rank) { + cudaCheck(cudaEventDestroy(timing.start)); + cudaCheck(cudaEventDestroy(timing.stop)); + } + } + ///////////////////////////// End of Average //////////////////////// + if (mpiData.rank) { + cudaCheck(cudaFreeHost(pointer.vect1)); + cudaCheck(cudaFreeHost(pointer.vect2)); + cudaCheck(cudaFreeHost(pointer.vect3)); + cudaCheck(cudaFree(pointer.dVect1)); + cudaCheck(cudaFree(pointer.dVect2)); + cudaCheck(cudaFree(pointer.dVect3)); + } + + bool test = 0; + if (!mpiData.rank) //Only for root + { + addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. + test = checkingResultsPrintout(mpiData.mVectChecking, + mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. + if (!test) + exit(-1); + } + + calculateAverageDeviation(timing, average, mpiData.rank); + test = sendAverageToRoot(timing, user, mpiData.rank); + if (test && !mpiData.rank) { + if (saveFile) { + test = saveToFile("dataPart2", timing); + + if (test) + std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl; + else + std::cout << "Error Saving File!!" << std::endl; + } + std::cout << "Done Part " << timing.partChosen << std::endl; } + return timing; +} - Timing blockSendPart3(MPIData& mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { +Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) { cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements timing.partChosen = 3; - if(mpiData.rank) //Only for Workers + if (mpiData.rank) //Only for Workers { - cudaCheck(cudaMalloc((void **)&pointer.dVect1, user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. + cudaCheck(cudaMalloc((void **)&pointer.dVect1, + user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device. cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes)); cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes)); } - ///////////////////////////// Start of Average //////////////////////// - for (int a = 0; a <= average; ++a) + ///////////////////////////// Start of Average //////////////////////// + for (int a = 0; a <= average; ++a) { + if (!mpiData.rank) //Only for root { - - if (!mpiData.rank) //Only for root - { - ////////////////////////////////// Input Prepation for Root ////////////////////////////////// - timing.inputPreparationRoot[0] = MPI_Wtime(); - for (int i = 1; i < mpiData.num_procs; ++i) { - MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD); //Tag is 0 - MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); - } - timing.inputPreparationRoot[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (mpiData.rank) //Only for Workers - { - ////////////////////////////////// Input Prepation for Host ////////////////////////////////// - MPI_Probe(user.root, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.inputPreparationHost[0] = MPI_Wtime(); - MPI_Recv(&pointer.dVect1[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Recv(&pointer.dVect2[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - - // cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice)); //copy random vector from host to device. - // cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); - - timing.inputPreparationHost[1] = MPI_Wtime(); - /////////////////////////////////////////////////////////////////////////////////////// - - cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. - cudaCheck(cudaEventCreate(&timing.stop)); - - ///////////////////////////// Operation on Device with respect of Host ////////////////// - - int threads = 512; //arbitrary number. - int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. - blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. - - ////////////////////////// CAll Device Kernel ////////////////////////////////// - cudaCheck(cudaEventRecord(timing.start)); - timing.operationOnDeviceByHost[0] = MPI_Wtime(); - - addVectorsGpu<<>>(pointer.dVect1, pointer.dVect2, pointer.dVect3, sizeVector, task); //call device function to add two vectors and save into vect3Gpu. - - cudaCheck(cudaGetLastError()); - cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaEventRecord(timing.stop)); - - timing.operationOnDeviceByHost[1] = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// - timing.outputPreparationHost[0] = MPI_Wtime(); - //cudaCheck(cudaMemcpy(mpiData.mVectWorker3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this - - MPI_Send(&pointer.dVect3[0], - mpiData.numberToSend[mpiData.rank], - MPI_FLOAT, - user.root, - 0, - MPI_COMM_WORLD); //Tag is 0 - timing.outputPreparationHost[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (!mpiData.rank) //Only for root - { - /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// - MPI_Probe(MPI_ANY_SOURCE,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE); - timing.outputPreparationRoot[0] = MPI_Wtime(); - //MPI probe - for (int i = 1; i < mpiData.num_procs; i++) { - - MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], - mpiData.numberToSend[i], - MPI_FLOAT, - i, - 0, - MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } - timing.outputPreparationRoot[1] = MPI_Wtime(); - //////////////////////////////////////////////////////////////////////////////////////////////// - } - - if (a>0) - calculateTimeDuration(timing, a-1, mpiData.rank); - - if(mpiData.rank) - { - cudaCheck(cudaEventDestroy(timing.start)); - cudaCheck(cudaEventDestroy(timing.stop)); + ////////////////////////////////// Input Prepation for Root ////////////////////////////////// + timing.inputPreparationRoot[0] = MPI_Wtime(); + for (int i = 1; i < mpiData.num_procs; ++i) { + MPI_Send(&mpiData.mVect1[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD); //Tag is 0 + MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD); } + timing.inputPreparationRoot[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////////// } - ///////////////////////////// End of Average //////////////////////// - if(mpiData.rank) + + if (mpiData.rank) //Only for Workers { - cudaCheck(cudaFree(pointer.dVect1)); - cudaCheck(cudaFree(pointer.dVect2)); - cudaCheck(cudaFree(pointer.dVect3)); + ////////////////////////////////// Input Prepation for Host ////////////////////////////////// + MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.inputPreparationHost[0] = MPI_Wtime(); + MPI_Recv(&pointer.dVect1[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Recv(&pointer.dVect2[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + + // cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice)); //copy random vector from host to device. + // cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice)); + + timing.inputPreparationHost[1] = MPI_Wtime(); + /////////////////////////////////////////////////////////////////////////////////////// + + cudaCheck(cudaEventCreate(&timing.start)); //inialize Event. + cudaCheck(cudaEventCreate(&timing.stop)); + + ///////////////////////////// Operation on Device with respect of Host ////////////////// + + int threads = 512; //arbitrary number. + int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks. + blocks = std::min(blocks, 8); // Number 8 is least number can be got from lowest Nevedia GPUs. + + ////////////////////////// CAll Device Kernel ////////////////////////////////// + cudaCheck(cudaEventRecord(timing.start)); + timing.operationOnDeviceByHost[0] = MPI_Wtime(); + + addVectorsGpu<<>>(pointer.dVect1, + pointer.dVect2, + pointer.dVect3, + sizeVector, + task); //call device function to add two vectors and save into vect3Gpu. + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaEventRecord(timing.stop)); + + timing.operationOnDeviceByHost[1] = MPI_Wtime(); + ///////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////// Output Prepation for the Host ////////////////////////////////////// + timing.outputPreparationHost[0] = MPI_Wtime(); + //cudaCheck(cudaMemcpy(mpiData.mVectWorker3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this + + MPI_Send(&pointer.dVect3[0], + mpiData.numberToSend[mpiData.rank], + MPI_FLOAT, + user.root, + 0, + MPI_COMM_WORLD); //Tag is 0 + timing.outputPreparationHost[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// } - /// - bool test = 0; + if (!mpiData.rank) //Only for root { - addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. - test = checkingResultsPrintout(mpiData.mVectChecking, mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. - if(!test) - exit(-1); - } - - calculateAverageDeviation(timing,average, mpiData.rank); - test = sendAverageToRoot(timing, user, mpiData.rank); - if(test && !mpiData.rank){ - if(saveFile) - { - test = saveToFile("dataPart3", timing); - - if(test) - std::cout <<"Done Part " << timing.partChosen << " And File saved" << std::endl; - else - std::cout <<"Error Saving File!!" << std::endl; + /////////////////////////////////// Output Prepation for the Root ////////////////////////////////////// + MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + timing.outputPreparationRoot[0] = MPI_Wtime(); + //MPI probe + for (int i = 1; i < mpiData.num_procs; i++) { + MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]], + mpiData.numberToSend[i], + MPI_FLOAT, + i, + 0, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); } - std::cout <<"Done Part " << timing.partChosen << std::endl; + timing.outputPreparationRoot[1] = MPI_Wtime(); + //////////////////////////////////////////////////////////////////////////////////////////////// } - return timing; - } - - - - void printTable(std::vector &timing, bool standerDeviationPrint) { - - const std::string inPrepatRoot = " Duration Time Read Input Prepations On Root "; - const std::string inPrepatHost = " Duration Time Read Input Prepations On Host "; - const std::string timeCpuR = " Duration Time operation on Root point View "; - const std::string timeCpu = " Duration Time operation on Host point View "; - const std::string timeGpu = " Duration Time operation on Device point View"; - const std::string outPrepatRoot = " Duration Time Read Output Prepations On Root"; - const std::string outPrepatHost = " Duration Time Read Output Prepations On Host"; - - const std::string averageTime = " AverTime "; - const std::string standerDeviation = " StDeviation "; - const std::string nameTiming = " Name Timing "; - const std::string partsNumberall = "Part "; - - int totalFix = 0; - - if (standerDeviationPrint) { - totalFix = timeGpu.size() + timing.size() * (averageTime.size() + standerDeviation.size() + 3); - } else { - totalFix = timeGpu.size() + timing.size() * (averageTime.size() + 3); + + if (a > 0) + calculateTimeDuration(timing, a - 1, mpiData.rank); + + if (mpiData.rank) { + cudaCheck(cudaEventDestroy(timing.start)); + cudaCheck(cudaEventDestroy(timing.stop)); } - - - std::cout.flags(std::ios::fixed | std::ios::showpoint); - std::cout.precision(4); - - std::cout << '\n'; - std::cout.width(totalFix); - std::cout.fill('-'); - std::cout << '-' << '\n'; - std::cout.fill(' '); - - std::cout << "|"; - std::cout.width((timeGpu.size() - nameTiming.size()) / 2); - std::cout.fill(' '); - std::cout << " "; - std::cout << nameTiming; - std::cout.width((timeGpu.size() - nameTiming.size()) / 2); - std::cout.fill(' '); + } + ///////////////////////////// End of Average //////////////////////// + if (mpiData.rank) { + cudaCheck(cudaFree(pointer.dVect1)); + cudaCheck(cudaFree(pointer.dVect2)); + cudaCheck(cudaFree(pointer.dVect3)); + } + /// + bool test = 0; + if (!mpiData.rank) //Only for root + { + addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too. + test = checkingResultsPrintout(mpiData.mVectChecking, + mpiData.mVectWorker3); //Checking the results, if error then Print out to the user. + if (!test) + exit(-1); + } + + calculateAverageDeviation(timing, average, mpiData.rank); + test = sendAverageToRoot(timing, user, mpiData.rank); + if (test && !mpiData.rank) { + if (saveFile) { + test = saveToFile("dataPart3", timing); + + if (test) + std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl; + else + std::cout << "Error Saving File!!" << std::endl; + } + std::cout << "Done Part " << timing.partChosen << std::endl; + } + return timing; +} + +void printTable(std::vector &timing, bool standerDeviationPrint) { + const std::string inPrepatRoot = " Duration Time Read Input Prepations On Root "; + const std::string inPrepatHost = " Duration Time Read Input Prepations On Host "; + const std::string timeCpuR = " Duration Time operation on Root point View "; + const std::string timeCpu = " Duration Time operation on Host point View "; + const std::string timeGpu = " Duration Time operation on Device point View"; + const std::string outPrepatRoot = " Duration Time Read Output Prepations On Root"; + const std::string outPrepatHost = " Duration Time Read Output Prepations On Host"; + + const std::string averageTime = " AverTime "; + const std::string standerDeviation = " StDeviation "; + const std::string nameTiming = " Name Timing "; + const std::string partsNumberall = "Part "; + + int totalFix = 0; + + if (standerDeviationPrint) { + totalFix = timeGpu.size() + timing.size() * (averageTime.size() + standerDeviation.size() + 3); + } else { + totalFix = timeGpu.size() + timing.size() * (averageTime.size() + 3); + } + + std::cout.flags(std::ios::fixed | std::ios::showpoint); + std::cout.precision(4); + + std::cout << '\n'; + std::cout.width(totalFix); + std::cout.fill('-'); + std::cout << '-' << '\n'; + std::cout.fill(' '); + + std::cout << "|"; + std::cout.width((timeGpu.size() - nameTiming.size()) / 2); + std::cout.fill(' '); + std::cout << " "; + std::cout << nameTiming; + std::cout.width((timeGpu.size() - nameTiming.size()) / 2); + std::cout.fill(' '); + std::cout << " "; + std::cout << " |"; + + for (unsigned int i = 0; i < timing.size(); ++i) { + if (standerDeviationPrint) { + std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2); + } //9 + else { + std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2); + } //2 + std::cout << " "; - std::cout << " |"; - - for (unsigned int i = 0; i < timing.size(); ++i) { - if (standerDeviationPrint) { - std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2); - } //9 - else { - std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2); - } //2 - - std::cout << " "; - std::cout << partsNumberall << timing[i].partChosen; - - if (standerDeviationPrint) { - std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2); - } //9 - else { - std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2); - } - //2 - std::cout << " "; - std::cout << "|"; + std::cout << partsNumberall << timing[i].partChosen; + + if (standerDeviationPrint) { + std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2); + } //9 + else { + std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2); } - - std::cout << '\n'; + //2 + std::cout << " "; std::cout << "|"; - std::cout.width(inPrepatHost.size() + 3); - std::cout.fill(' '); + } + + std::cout << '\n'; + std::cout << "|"; + std::cout.width(inPrepatHost.size() + 3); + std::cout.fill(' '); + std::cout << "|"; + + for (unsigned int i = 0; i < timing.size(); ++i) { + std::cout << averageTime; std::cout << "|"; - - for (unsigned int i = 0; i < timing.size(); ++i) { - std::cout << averageTime; + if (standerDeviationPrint) { + std::cout << standerDeviation; std::cout << "|"; - if (standerDeviationPrint) { - std::cout << standerDeviation; - std::cout << "|"; - } } - - newLineTitle(totalFix, inPrepatRoot); - printResultEach(timing, 1, standerDeviationPrint); - - newLineTitle(totalFix, inPrepatHost); - printResultEach(timing, 2, standerDeviationPrint); - - newLineTitle(totalFix, timeCpuR); - printResultEach(timing, 3, standerDeviationPrint); - - newLineTitle(totalFix, timeCpu); - printResultEach(timing, 4, standerDeviationPrint); - - newLineTitle(totalFix, timeGpu); - printResultEach(timing, 5, standerDeviationPrint); - - newLineTitle(totalFix, outPrepatRoot); - printResultEach(timing, 6, standerDeviationPrint); - - newLineTitle(totalFix, outPrepatHost); - printResultEach(timing, 7, standerDeviationPrint); - - std::cout << '\n'; - std::cout.width(totalFix); - std::cout.fill('-'); - std::cout << '-' << '\n'; - std::cout.fill(' '); - } - int getNumberofDigits(double number) { return ((int)log10(number) + 1) + 4; } - void newLineTitle(int line, const std::string &title) { - std::cout << '\n'; - std::cout.width(line); - std::cout.fill('-'); - std::cout << '-' << '\n'; - std::cout.fill(' '); - - std::cout << "| "; - std::cout << title; - std::cout << " |"; - } - void printResultEach(std::vector &timing, int type, bool standerDeviationPrint) - { - int averageTimeWidth = 10; - int standerDeviationWidth = 13; - - for (unsigned int i = 0; i < timing.size(); ++i) { - if (type == 1 ) { - std::cout.width(averageTimeWidth); + + newLineTitle(totalFix, inPrepatRoot); + printResultEach(timing, 1, standerDeviationPrint); + + newLineTitle(totalFix, inPrepatHost); + printResultEach(timing, 2, standerDeviationPrint); + + newLineTitle(totalFix, timeCpuR); + printResultEach(timing, 3, standerDeviationPrint); + + newLineTitle(totalFix, timeCpu); + printResultEach(timing, 4, standerDeviationPrint); + + newLineTitle(totalFix, timeGpu); + printResultEach(timing, 5, standerDeviationPrint); + + newLineTitle(totalFix, outPrepatRoot); + printResultEach(timing, 6, standerDeviationPrint); + + newLineTitle(totalFix, outPrepatHost); + printResultEach(timing, 7, standerDeviationPrint); + + std::cout << '\n'; + std::cout.width(totalFix); + std::cout.fill('-'); + std::cout << '-' << '\n'; + std::cout.fill(' '); +} +int getNumberofDigits(double number) { return ((int)log10(number) + 1) + 4; } +void newLineTitle(int line, const std::string &title) { + std::cout << '\n'; + std::cout.width(line); + std::cout.fill('-'); + std::cout << '-' << '\n'; + std::cout.fill(' '); + + std::cout << "| "; + std::cout << title; + std::cout << " |"; +} +void printResultEach(std::vector &timing, int type, bool standerDeviationPrint) { + int averageTimeWidth = 10; + int standerDeviationWidth = 13; + + for (unsigned int i = 0; i < timing.size(); ++i) { + if (type == 1) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].timeInputPreparationRoot[average]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].timeInputPreparationRoot[average]; + std::cout << timing[i].timeInputPreparationRoot[average + 1]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].timeInputPreparationRoot[average + 1]; - std::cout << "|"; - } - }else if (type == 2) { - std::cout.width(averageTimeWidth); + } + } else if (type == 2) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].averageResults[0]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].averageResults[0]; + std::cout << timing[i].averageResults[1]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].averageResults[1]; - std::cout << "|"; - } - } else if (type == 3) { - std::cout.width(averageTimeWidth); + } + } else if (type == 3) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].timeOperationOnDeviceByRootHost[average]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].timeOperationOnDeviceByRootHost[average]; + std::cout << timing[i].timeOperationOnDeviceByRootHost[average + 1]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].timeOperationOnDeviceByRootHost[average + 1]; - std::cout << "|"; - } - } else if (type == 4) { - std::cout.width(averageTimeWidth); + } + } else if (type == 4) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].averageResults[2]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].averageResults[2]; + std::cout << timing[i].averageResults[3]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].averageResults[3]; - std::cout << "|"; - } - } else if (type == 5) { - std::cout.width(averageTimeWidth); + } + } else if (type == 5) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].averageResults[6]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].averageResults[6]; + std::cout << timing[i].averageResults[7]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].averageResults[7]; - std::cout << "|"; - } - } else if (type == 6) { - std::cout.width(averageTimeWidth); + } + } else if (type == 6) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].timeOutputPreparationRoot[average]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].timeOutputPreparationRoot[average]; + std::cout << timing[i].timeOutputPreparationRoot[average + 1]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].timeOutputPreparationRoot[average + 1]; - std::cout << "|"; - } - } else if (type == 7) { - std::cout.width(averageTimeWidth); + } + } else if (type == 7) { + std::cout.width(averageTimeWidth); + std::cout.fill(' '); + std::cout << timing[i].averageResults[4]; + std::cout << "|"; + if (standerDeviationPrint) { + std::cout.width(standerDeviationWidth); std::cout.fill(' '); - std::cout << timing[i].averageResults[4]; + std::cout << timing[i].averageResults[5]; std::cout << "|"; - if (standerDeviationPrint) { - std::cout.width(standerDeviationWidth); - std::cout.fill(' '); - std::cout << timing[i].averageResults[5]; - std::cout << "|"; - } } } } - bool saveToFile(const std::string &name, const Timing &timing) { - std::ofstream file(name + ".txt", std::ios::out | std::ios::app); - - if (!file.is_open()) { - std::cout << "\nCannot open File nor Create File!" << std::endl; - return 0; - } - - file << sizeVector << std::endl; - file << average << std::endl; - file << task << std::endl; - file << timing.timeInputPreparationRoot[average] << " " << timing.timeInputPreparationRoot[average + 1]<< std::endl; - file << timing.averageResults[0] << " "<< timing.averageResults[1] << std::endl; - file << timing.timeOperationOnDeviceByRootHost[average] << " " << timing.timeOperationOnDeviceByRootHost[average + 1] << std::endl; - file << timing.averageResults[2] << " " << timing.averageResults[3] << std::endl; - file << timing.averageResults[6] << " " << timing.averageResults[7] << std::endl; - file << timing.timeOutputPreparationRoot[average] << " " << timing.timeOutputPreparationRoot[average + 1] << std::endl; - file << timing.averageResults[4] << " " << timing.averageResults[5] << std::endl; - - file.close(); - if (!file.good()) { - std::cout << "\n*ERROR While Writing The " + name + " file!!" << std::endl; - return 0; - } - return 1; +} +bool saveToFile(const std::string &name, const Timing &timing) { + std::ofstream file(name + ".txt", std::ios::out | std::ios::app); + + if (!file.is_open()) { + std::cout << "\nCannot open File nor Create File!" << std::endl; + return 0; } -void printHelp(void) -{ + + file << sizeVector << std::endl; + file << average << std::endl; + file << task << std::endl; + file << timing.timeInputPreparationRoot[average] << " " << timing.timeInputPreparationRoot[average + 1] << std::endl; + file << timing.averageResults[0] << " " << timing.averageResults[1] << std::endl; + file << timing.timeOperationOnDeviceByRootHost[average] << " " << timing.timeOperationOnDeviceByRootHost[average + 1] + << std::endl; + file << timing.averageResults[2] << " " << timing.averageResults[3] << std::endl; + file << timing.averageResults[6] << " " << timing.averageResults[7] << std::endl; + file << timing.timeOutputPreparationRoot[average] << " " << timing.timeOutputPreparationRoot[average + 1] + << std::endl; + file << timing.averageResults[4] << " " << timing.averageResults[5] << std::endl; + + file.close(); + if (!file.good()) { + std::cout << "\n*ERROR While Writing The " + name + " file!!" << std::endl; + return 0; + } + return 1; +} +void printHelp(void) { int rank = MPI::COMM_WORLD.Get_rank(); - if(!rank){ - std::cout << "\n\n\t**************************************\n"; - std::cout << "\t* This is a Help for Command Opitions*"; - std::cout << "\n\t**************************************\n"; - std::cout << "\n\tYou as a user, can choose two ways to run the program:\n"; - std::cout << "\n\t1) mpirun -np -s -t -a -p \n"; - std::cout << "\n\t2) cmsenv_mpirun -np -s -t -a -p \n"; - std::cout << "\n\t[-np] is for number of processes or processors that you would like to run."; - std::cout << "\n\t[-s] is the size of vector that you would like to send, the type is float and there are two vectors."; - std::cout << "\n\t[-t] is the number of repeating of task on the Device(GPU) side."; - std::cout << "\n\t[-a] is the number of repeating the part that user has chosen."; - std::cout << "\n\t[-p] is the choice of what part to run in the program."; - std::cout << "\n\t[-q] is to print Stander Deviation."; - std::cout << "\n\t[-f] is to save the results into a file for each part."; - std::cout << "\n\n\tExample for only local Machine: "; - std::cout << "\n\tcmsenv_mpirun -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1\n"; - std::cout << "\n\tExample for two Machines connected: "; - std::cout << "\n\tcmsenv_mpirun -H , -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1"; - std::cout << "\n\tExample for two Machines connected Using ucx: "; - std::cout << "\n\tcmsenv_mpirun -H , -np 2 -mca pml ucx -- mpiCudaGeneric -p1 -s200 -t1 -a1"; - std::cout << "\n\n\tFor the Parts, we have in this program 4 Parts:"; - std::cout << "\n\t1)The Root, who does not have a GPU, using MPI Blocking send and receive to Host, The Host is who have a GPU, then Host:"; - std::cout << "\n\t uses cudaMalloc and copies the receiving values to GPU side. Next, the GPU does the compuation"; - std::cout << "\n\t Finaly, the Host copies the results from GPU, sends them back to The Root using MPI Blocking Send.\n\n"; + if (!rank) { + std::cout << "\n\n\t**************************************\n"; + std::cout << "\t* This is a Help for Command Opitions*"; + std::cout << "\n\t**************************************\n"; + std::cout << "\n\tYou as a user, can choose two ways to run the program:\n"; + std::cout << "\n\t1) mpirun -np -s -t -a " + "-p \n"; + std::cout << "\n\t2) cmsenv_mpirun -np -s -t -a -p \n"; + std::cout << "\n\t[-np] is for number of processes or processors that you would like to run."; + std::cout + << "\n\t[-s] is the size of vector that you would like to send, the type is float and there are two vectors."; + std::cout << "\n\t[-t] is the number of repeating of task on the Device(GPU) side."; + std::cout << "\n\t[-a] is the number of repeating the part that user has chosen."; + std::cout << "\n\t[-p] is the choice of what part to run in the program."; + std::cout << "\n\t[-q] is to print Stander Deviation."; + std::cout << "\n\t[-f] is to save the results into a file for each part."; + std::cout << "\n\n\tExample for only local Machine: "; + std::cout << "\n\tcmsenv_mpirun -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1\n"; + std::cout << "\n\tExample for two Machines connected: "; + std::cout + << "\n\tcmsenv_mpirun -H , -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1"; + std::cout << "\n\tExample for two Machines connected Using ucx: "; + std::cout << "\n\tcmsenv_mpirun -H , -np 2 -mca pml ucx -- " + "mpiCudaGeneric -p1 -s200 -t1 -a1"; + std::cout << "\n\n\tFor the Parts, we have in this program 4 Parts:"; + std::cout << "\n\t1)The Root, who does not have a GPU, using MPI Blocking send and receive to Host, The Host is " + "who have a GPU, then Host:"; + std::cout << "\n\t uses cudaMalloc and copies the receiving values to GPU side. Next, the GPU does the compuation"; + std::cout << "\n\t Finaly, the Host copies the results from GPU, sends them back to The Root using MPI Blocking " + "Send.\n\n"; } - } \ No newline at end of file