Merged. Changes from all commits.
ggml-sycl.cpp: 154 changes (60 additions, 94 deletions)

@@ -3559,31 +3559,12 @@ class sycl_gpu_mgr {
int work_group_size = 0;
std::string gpus_list = "";

/*
Use all GPU with same top max compute units
*/
sycl_gpu_mgr() {
detect_sycl_gpu_list_with_max_cu();
get_allow_gpus();
create_context_with_gpus();
}

/*
Use the assigned GPU as only one
*/
sycl_gpu_mgr(int main_gpu_id) {
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
dpct::device_info prop;
dpct::get_device_info(prop, device);
gpus.push_back(main_gpu_id);
devices.push_back(device);
work_group_size = prop.get_max_work_group_size();
max_compute_units = prop.get_max_compute_units();

get_allow_gpus();
create_context_with_gpus();
}

void create_context_with_gpus() {
sycl::context ctx = sycl::context(devices);
assert(gpus.size() > 0);
@@ -3599,7 +3580,7 @@ class sycl_gpu_mgr {
gpus_list += std::to_string(gpus[i]);
gpus_list += ",";
}
if (gpus_list.length() > 1) {
if (gpus_list.length() > 2) {
gpus_list.pop_back();
}
}
@@ -3648,8 +3629,8 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
printf("miss to get device index by id=%d\n", id);
GGML_ASSERT(false);
assert(false);
return -1;
}

int get_next_index(int id) {
@@ -3658,7 +3639,8 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
GGML_ASSERT(false);
assert(false);
return -1;
}
};
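
Both lookups above now degrade to a -1 sentinel instead of hard-asserting through GGML_ASSERT. A minimal standalone sketch of the pattern (hypothetical caller, not code from this PR): debug builds still abort on a miss via assert(), while NDEBUG builds fall through to -1 and leave recovery to the caller.

#include <cassert>
#include <cstdio>
#include <vector>

// Sketch of the lookup-with-sentinel pattern used by get_index()/get_next_index():
// debug builds abort via assert(); release (NDEBUG) builds return -1.
static int find_gpu_index(const std::vector<int> &gpus, int id) {
    for (size_t i = 0; i < gpus.size(); ++i)
        if (gpus[i] == id)
            return (int) i;
    assert(false && "device id not in detected GPU list");
    return -1;
}

int main() {
    const std::vector<int> gpus = {0, 2, 3};   // hypothetical detected device ids
    int idx = find_gpu_index(gpus, 5);         // miss: returns -1 in release builds
    if (idx < 0)
        std::fprintf(stderr, "unknown device id, falling back to index 0\n");
    return 0;
}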

@@ -3667,7 +3649,6 @@ static int g_device_count = -1;
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
static int g_main_device_id = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};

@@ -13244,7 +13225,7 @@ void ggml_backend_sycl_print_sycl_devices() {
}

void print_gpu_device_list() {
fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(),
g_sycl_gpu_mgr->max_compute_units);
@@ -13283,15 +13264,6 @@ void ggml_init_sycl() try {
#else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif

/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
#else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/

if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
@@ -13300,61 +13272,68 @@ void ggml_init_sycl() try {
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
print_gpu_device_list();
initialized = true;
g_sycl_loaded = true;
}

if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();

g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;

g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
print_gpu_device_list();

int64_t total_vram = 0;
int64_t total_vram = 0;

/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
#else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
g_device_caps[id].cc = 0;
g_tensor_split[id] = 0;
g_default_tensor_split[id] = 0;
}

for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
g_device_caps[id].cc = 0;
g_tensor_split[id] = 0;
g_default_tensor_split[id] = 0;
}
for (int i = 0; i < g_device_count; ++i) {
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;

for (int i = 0; i < g_device_count; ++i) {
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));

dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));
g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();

g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();
g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}

g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}
for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}

for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}
for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));

for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));
// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
}

// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}

const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
initialized = true;
g_sycl_loaded = true;
}
}
catch (sycl::exception const &exc) {
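
The reordered initialization above first records, per device, the VRAM of all preceding devices, then divides by the grand total, so g_default_tensor_split[i] ends up as the fractional start offset of device i's shard. A minimal sketch of that arithmetic (hypothetical helper, not part of the PR):

#include <cstdint>
#include <vector>

// Sketch of the default tensor-split math in ggml_init_sycl(): entry i is the
// cumulative VRAM of devices 0..i-1 as a fraction of the total, i.e. where
// device i's shard begins.
static std::vector<float> default_tensor_split(const std::vector<int64_t> &vram) {
    std::vector<float> split(vram.size(), 0.0f);
    int64_t total = 0;
    for (size_t i = 0; i < vram.size(); ++i) {
        split[i] = (float) total;   // VRAM preceding device i
        total += vram[i];
    }
    for (float &s : split)
        s /= (float) total;         // normalize to [0, 1)
    return split;
}
// e.g. VRAM of {8 GiB, 8 GiB, 16 GiB} -> split of {0.0f, 0.25f, 0.5f}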
@@ -16753,24 +16732,22 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .is_host = */ nullptr,
};

ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
GGML_ASSERT(device_index<g_device_count);
}
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

if (!g_ggml_backend_sycl_buffer_type_initialized) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < g_device_count; i++) {
ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
};
}
g_ggml_backend_sycl_buffer_type_initialized = true;
ggml_backend_sycl_buffer_type_initialized = true;
}
return &ggml_backend_sycl_buffer_types[device_index];

return &ggml_backend_sycl_buffer_types[device];
}

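This hunk folds the initialization guard back into the function as a local static (the file-scope g_ggml_backend_sycl_buffer_type_initialized is deleted further up). A minimal sketch of the same lazy-initialization shape, using hypothetical stand-in types rather than the real ggml structs:

#include <string>

struct buffer_type { std::string name; };   // stand-in for ggml_backend_buffer_type

static buffer_type *get_buffer_type(int device, int device_count) {
    static buffer_type types[16];           // stand-in for GGML_SYCL_MAX_DEVICES slots
    static bool initialized = false;        // function-local flag, no global needed
    if (!initialized) {
        for (int i = 0; i < device_count; i++)
            types[i].name = "SYCL" + std::to_string(i);
        initialized = true;
    }
    return &types[device];
}

As in the original, the flag is not synchronized, so the first call is not thread-safe; note also that the new version drops the old out-of-range check, so callers must pass a valid device index.
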
// sycl split buffer type
@@ -17519,17 +17496,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
return g_sycl_gpu_mgr->get_index(device_id);
}

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
ggml_init_sycl();
g_ggml_backend_sycl_buffer_type_initialized = false;
}

extern "C" int ggml_backend_sycl_reg_devices();

int ggml_backend_sycl_reg_devices() {
ggml-sycl.h: 1 change (0 additions, 1 deletion)

@@ -28,7 +28,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);

#ifdef __cplusplus
}
llama.cpp: 16 changes (4 additions, 12 deletions)

@@ -3750,14 +3750,6 @@ static bool llm_load_tensors(
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;

#ifdef GGML_USE_SYCL
if (split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_sycl_set_single_device(main_gpu);
//SYCL use device index (0, 1, 2), instead if device id.
main_gpu = ggml_backend_sycl_get_device_index(main_gpu);
}
#endif

const int64_t n_layer = hparams.n_layer;
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);

@@ -12268,13 +12260,13 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU

int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
int device_id = id_list[i];
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, id_list[i], i);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
llama_free(ctx);
return nullptr;
}
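
In the loop above, backends are created by contiguous index i while id_list translates each index back to the underlying SYCL device id; the hunk hoists id_list out of the error branch so the failure log can report both. A small sketch of that index-to-id bookkeeping (made-up ids and counts, not the real API):

#include <cstdio>

int main() {
    const int max_devices = 16;            // stand-in for GGML_SYCL_MAX_DEVICES
    int id_list[max_devices] = {0, 2, 3};  // hypothetical ids from ggml_sycl_get_gpu_list()
    const int device_count = 3;            // hypothetical ggml_backend_sycl_get_device_count()
    for (int i = 0; i < device_count; ++i) {
        int device_id = id_list[i];        // backend index i -> SYCL device id
        std::printf("backend index %d -> SYCL device id %d\n", i, device_id);
    }
    return 0;
}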