OpenCL: thread interleaving #2100

Merged 1 commit on Nov 28, 2018
116 changes: 116 additions & 0 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -18,6 +18,7 @@
#include "xmrstak/picosha2/picosha2.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/version.hpp"
#include "xmrstak/net/msgstruct.hpp"

#include <stdio.h>
#include <string.h>
@@ -34,6 +35,7 @@
#include <vector>
#include <string>
#include <iostream>
#include <thread>

#if defined _MSC_VER
#include <direct.h>
@@ -730,11 +732,21 @@ std::vector<GpuContext> getAMDDevices(int index)
continue;
}

std::vector<char> openCLDriverVer(1024);
if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
continue;
}

bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos;

// if the environment variable GPU_SINGLE_ALLOC_PERCENT is not set we cannot allocate the full memory
ctx.deviceIdx = k;
ctx.freeMem = std::min(ctx.freeMem, maxMem);
ctx.name = std::string(devNameVec.data());
ctx.DeviceID = device_list[k];
ctx.interleave = 40;
printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
ctxVec.push_back(ctx);
}
@@ -936,8 +948,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// create a directory for the OpenCL compile cache
create_directory(get_home() + "/.openclcache");

std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);

for(int i = 0; i < num_gpus; ++i)
{
const size_t devIdx = ctx[i].deviceIdx;
if(interleaveData.size() <= devIdx)
{
interleaveData.resize(devIdx + 1u, nullptr);
}
if(!interleaveData[devIdx])
{
interleaveData[devIdx].reset(new InterleaveData{});
interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();

}
ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU;
++interleaveData[devIdx]->numThreadsOnGPU;
ctx[i].interleaveData = interleaveData[devIdx];
ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;

const std::string backendName = xmrstak::params::inst().openCLVendor;
if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
{
@@ -1126,6 +1157,91 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
return ERR_SUCCESS;
}

void updateTimings(GpuContext* ctx, const uint64_t t)
{
// averagingBias = 1.0 - only the last delta time is taken into account
// averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined
// averagingBias = 0.1 - the last delta time contributes 10%, all the previous ones combined contribute 90%
const double averagingBias = 0.1;

{
int64_t t2 = get_timestamp_ms();
std::lock_guard<std::mutex> g(ctx->interleaveData->mutex);
// a value above 20000 means that something went wrong and we reset the average
if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
ctx->interleaveData->avgKernelRuntime = (t2 - t);
else
ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (t2 - t) * averagingBias;
}
}

uint64_t interleaveAdjustDelay(GpuContext* ctx)
{
uint64_t t0 = get_timestamp_ms();

if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0)
{
t0 = get_timestamp_ms();
std::unique_lock<std::mutex> g(ctx->interleaveData->mutex);

int64_t delay = 0;
double dt = 0.0;

if(t0 > ctx->interleaveData->lastRunTimeStamp)
dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp);

const double avgRuntime = ctx->interleaveData->avgKernelRuntime;
const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold;

// threshold where the auto adjustment is disabled
constexpr uint32_t maxDelay = 10;
constexpr double maxAutoAdjust = 0.05;

if((dt > 0) && (dt < optimalTimeOffset))
{
delay = static_cast<int64_t>((optimalTimeOffset - dt));
if(ctx->lastDelay == delay && delay > maxDelay)
ctx->interleaveData->adjustThreshold -= 0.001;
// if the delay doubled then increase the adjustThreshold
else if(delay > 1 && ctx->lastDelay * 2 < delay)
ctx->interleaveData->adjustThreshold += 0.001;
ctx->lastDelay = delay;

// clamp to [startAdjustThreshold - maxAutoAdjust, startAdjustThreshold + maxAutoAdjust] (std::clamp is only available in C++17)
ctx->interleaveData->adjustThreshold = std::max(
std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust),
ctx->interleaveData->startAdjustThreshold - maxAutoAdjust
);
// avoid that the auto adjustment disables interleaving completely
ctx->interleaveData->adjustThreshold = std::max(
ctx->interleaveData->adjustThreshold,
0.001
);
}
delay = std::max(int64_t(0), delay);

ctx->interleaveData->lastRunTimeStamp = t0 + delay;

g.unlock();
if(delay > 0)
{
// do not notify the user anymore if we reach a good delay
if(delay > maxDelay)
printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
ctx->deviceIdx,
ctx->idWorkerOnDevice,
static_cast<uint32_t>(delay),
avgRuntime,
ctx->interleaveData->adjustThreshold * 100.
);

std::this_thread::sleep_for(std::chrono::milliseconds(delay));
}
}

return t0;
}

size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
{
// switch to the kernel storage
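The core of the feature is the pair updateTimings / interleaveAdjustDelay above: each worker feeds its measured kernel runtime into an exponential moving average shared by all threads on the same device, and before the next launch it waits until the configured interleave percentage of that average has elapsed since the sibling thread started. A minimal standalone sketch of that arithmetic (illustrative stand-in types and numbers, not the PR code):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for the per-device state the PR keeps in InterleaveData.
struct SharedTiming
{
    double avgKernelRuntime = 0.0; // exponential moving average [ms]
    double adjustThreshold = 0.4;  // interleave / 100
    int64_t lastRunTimeStamp = 0;  // start time of the previous thread [ms]
};

// Same averaging rule as updateTimings(): bias 0.1 keeps 90% of the history,
// and an implausibly large average (> 20000 ms) is treated as a reset.
void updateAverage(SharedTiming& s, double kernelRuntimeMs)
{
    const double averagingBias = 0.1;
    if(s.avgKernelRuntime == 0.0 || s.avgKernelRuntime > 20000.0)
        s.avgKernelRuntime = kernelRuntimeMs;
    else
        s.avgKernelRuntime = s.avgKernelRuntime * (1.0 - averagingBias) + kernelRuntimeMs * averagingBias;
}

// Delay needed so this thread starts adjustThreshold * avgKernelRuntime
// after the previously started thread on the same device.
int64_t computeDelay(const SharedTiming& s, int64_t nowMs)
{
    const double dt = static_cast<double>(nowMs - s.lastRunTimeStamp);
    const double optimalTimeOffset = s.avgKernelRuntime * s.adjustThreshold;
    return std::max<int64_t>(0, static_cast<int64_t>(optimalTimeOffset - dt));
}

int main()
{
    SharedTiming s;
    updateAverage(s, 100.0);   // first measured kernel runtime: 100 ms
    s.lastRunTimeStamp = 1000; // the other thread started at t = 1000 ms

    // With interleave = 40 the second thread should start 40 ms after the
    // first one; if it asks at t = 1010 ms it still has to wait 30 ms.
    printf("delay = %lld ms\n", static_cast<long long>(computeDelay(s, 1010)));
    return 0;
}
```

On top of this, the PR nudges adjustThreshold up or down by 0.001 per iteration when the computed delay stalls or doubles, which is what the auto-adjust block above implements.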
19 changes: 17 additions & 2 deletions xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -12,12 +12,23 @@
#include <stdint.h>
#include <string>
#include <vector>
#include <mutex>
#include <memory>

#define ERR_SUCCESS (0)
#define ERR_OCL_API (2)
#define ERR_STUPID_PARAMS (1)

struct InterleaveData
{
std::mutex mutex;

double adjustThreshold = 0.4;
double startAdjustThreshold = 0.4;
double avgKernelRuntime = 0.0;
uint64_t lastRunTimeStamp = 0;
uint32_t numThreadsOnGPU = 0;
};

struct GpuContext
{
@@ -42,6 +53,10 @@ struct GpuContext
size_t freeMem;
int computeUnits;
std::string name;
std::shared_ptr<InterleaveData> interleaveData;
uint32_t idWorkerOnDevice = 0u;
int interleave = 40;
uint64_t lastDelay = 0;

uint32_t Nonce;

@@ -54,5 +69,5 @@ std::vector<GpuContext> getAMDDevices(int index);
size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);


uint64_t interleaveAdjustDelay(GpuContext* ctx);
void updateTimings(GpuContext* ctx, const uint64_t t);
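Note how the new members are wired together: every worker thread owns its own GpuContext, but all contexts that drive the same physical device hold a shared_ptr to one common InterleaveData, so the mutex-protected timing state is shared between them. A simplified sketch of that ownership pattern (hypothetical names, not the InitOpenCL code):

```cpp
#include <cstdint>
#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

// Reduced versions of the structures declared in gpu.hpp.
struct SharedState
{
    std::mutex mutex;             // guards the timing fields in the real code
    uint32_t numThreadsOnGPU = 0;
};

struct Worker
{
    size_t deviceIdx = 0;
    uint32_t idWorkerOnDevice = 0;
    std::shared_ptr<SharedState> shared; // common to all workers on one device
};

int main()
{
    // Two workers on device 0 and one on device 1, as InitOpenCL sets them up.
    std::vector<Worker> workers(3);
    workers[0].deviceIdx = 0;
    workers[1].deviceIdx = 0;
    workers[2].deviceIdx = 1;

    std::vector<std::shared_ptr<SharedState>> perDevice;
    for(Worker& w : workers)
    {
        if(perDevice.size() <= w.deviceIdx)
            perDevice.resize(w.deviceIdx + 1u);
        if(!perDevice[w.deviceIdx])
            perDevice[w.deviceIdx] = std::make_shared<SharedState>();

        w.idWorkerOnDevice = perDevice[w.deviceIdx]->numThreadsOnGPU++;
        w.shared = perDevice[w.deviceIdx];
    }

    printf("threads on device 0: %u\n", workers[0].shared->numThreadsOnGPU); // prints 2
    return 0;
}
```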
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/autoAdjust.hpp
@@ -181,7 +181,7 @@ class autoAdjust
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
" \"unroll\" : 8, \"comp_mode\" : true\n" +
" \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
" },\n";
}
else
10 changes: 8 additions & 2 deletions xmrstak/backend/amd/config.tpl
@@ -22,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
* to use an intensity which is not a multiple of the worksize.
* If you set false and the intensity is not a multiple of the worksize the miner can crash:
* in this case set the intensity to a multiple of the worksize or activate comp_mode.
* interleave - Controls the starting point in time of a thread on a GPU device relative to the previously started thread on the same device.
* This option only has an effect if two or more compute threads use the same GPU device; valid range: [0;100]
* 0 = disable thread interleaving
* 40 = each worker thread waits until 40% of the hash calculation of the previously started thread is finished
* "gpu_threads_conf" :
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
* "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
* "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
* "interleave" : 40
* },
* ],
* If you do not wish to mine with your AMD GPU(s) then use:
* "gpu_threads_conf" :
21 changes: 20 additions & 1 deletion xmrstak/backend/amd/jconf.cpp
@@ -106,7 +106,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;

const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +115,30 @@
memChunk = GetObjectMember(oThdConf, "mem_chunk");
unroll = GetObjectMember(oThdConf, "unroll");
compMode = GetObjectMember(oThdConf, "comp_mode");
interleave = GetObjectMember(oThdConf, "interleave");

if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
return false;

// interleave is optional
if(interleave == nullptr)
cfg.interleave = 50;
else if(!interleave->IsUint64())
{
printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
return false;
}
else if((int)interleave->GetInt64() < 0 || (int)interleave->GetInt64() > 100)
{
printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
return false;
}
else
{
cfg.interleave = (int)interleave->GetInt64();
}

if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
return false;

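The new "interleave" field is optional and range-checked; a missing value falls back to a default (50 in this parser, while the struct default elsewhere in the PR is 40). A minimal sketch of the same validation written directly against rapidjson (the include path and helper name are assumptions, not the xmr-stak wrapper):

```cpp
#include <cstdio>
#include "rapidjson/document.h" // assumed include path; xmr-stak bundles rapidjson

// Read an optional "interleave" member from one thread-config object:
// missing -> default, wrong type or outside [0;100] -> error.
bool readInterleave(const rapidjson::Value& thdConf, int& out)
{
    if(!thdConf.HasMember("interleave"))
    {
        out = 50; // same fallback as GetThreadConfig() above
        return true;
    }
    const rapidjson::Value& v = thdConf["interleave"];
    if(!v.IsUint64() || v.GetUint64() > 100)
    {
        printf("ERROR: interleave must be a number in range [0;100]\n");
        return false;
    }
    out = static_cast<int>(v.GetUint64());
    return true;
}
```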
1 change: 1 addition & 0 deletions xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
size_t w_size;
long long cpu_aff;
int stridedIndex;
int interleave = 40;
int memChunk;
int unroll;
bool compMode;
5 changes: 5 additions & 0 deletions xmrstak/backend/amd/minethd.cpp
@@ -100,6 +100,7 @@ bool minethd::init_gpus()
vGpuData[i].memChunk = cfg.memChunk;
vGpuData[i].compMode = cfg.compMode;
vGpuData[i].unroll = cfg.unroll;
vGpuData[i].interleave = cfg.interleave;
}

return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -242,6 +243,7 @@ void minethd::work_main()
break;
}

uint64_t t0 = interleaveAdjustDelay(pGpuCtx);

cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100));
@@ -269,6 +271,9 @@
uint64_t iStamp = get_timestamp_ms();
iHashCount.store(iCount, std::memory_order_relaxed);
iTimestamp.store(iStamp, std::memory_order_relaxed);

updateTimings(pGpuCtx, t0);

std::this_thread::yield();
}

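In work_main the two new calls bracket each kernel invocation: the thread first waits for its interleave delay, then runs the job, and finally feeds the measured runtime back into the shared average. A reduced sketch of that call order with stub functions (not the actual minethd code, which passes a GpuContext*):

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

static uint64_t nowMs()
{
    using namespace std::chrono;
    return duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
}

// Stubs standing in for the real calls made in the worker loop.
static uint64_t interleaveDelayStub() { return nowMs(); } // the real call may sleep first
static void runJobStub() { std::this_thread::sleep_for(std::chrono::milliseconds(5)); }
static void updateTimingsStub(uint64_t t0)
{
    printf("kernel took ~%llu ms\n", static_cast<unsigned long long>(nowMs() - t0));
}

int main()
{
    for(int i = 0; i < 3; ++i)
    {
        const uint64_t t0 = interleaveDelayStub(); // 1. wait for the interleave offset
        runJobStub();                              // 2. one batch of hashes (XMRRunJob)
        updateTimingsStub(t0);                     // 3. fold (now - t0) into the average
        std::this_thread::yield();
    }
    return 0;
}
```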