Merge pull request #20156 from Flamefire/20240319165505_new_pr_PyTorch212
Fix test issues in PyTorch-2.1.2-foss-2023a-CUDA-12.1.1
Showing 5 changed files with 202 additions and 1 deletion.
106 changes: 106 additions & 0 deletions
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch
There is a bug in CUDA 11.7 through at least CUDA 12.4 that changes the current thread locale
when calling nvrtcCompileProgram.
See e.g. https://stackoverflow.com/questions/74044994
This also changes the encoding Python uses by default, e.g. for subsequent `subprocess` calls.
When the user environment is set to e.g. UTF-8 and CUDA changes it (to ASCII/ANSI_X3.4-1968), Python fails
to decode UTF-8 output from invoked programs.
This happens e.g. in `test_torch`, which calls `from scipy import stats`, which runs `lscpu` and errors
with something like
> /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1253, in <module>
> _SUPPORTS_SVE = check_support_sve()
> /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1247, in check_support_sve
> output = subprocess.run(cmd, capture_output=True, text=True)
> /software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 2113, in _communicate
> stdout = self._translate_newlines(stdout,
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 96: ordinal not in range(128)

Fix by wrapping nvrtcCompileProgram, saving and restoring the thread locale around the call.

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
index 1b85e7776e2..1c13a3b1168 100644
--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@@ -2,6 +2,7 @@

 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
 #include <ATen/DynamicLibrary.h>
+#include <locale.h>
 #include <stdexcept>

 namespace at {
@@ -143,6 +144,29 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
   return fn(prog, src, name, numHeaders, headers, includeNames);
 }

+nvrtcResult nvrtcCompileProgram_wrapped(nvrtcProgram prog,
+                                        int numOptions,
+                                        const char * const *options) {
+  // Save & restore current thread locale which can get modified by nvrtcCompileProgram
+  locale_t oldLocale = uselocale((locale_t) 0);
+  auto result = lazyNVRTC.nvrtcCompileProgram_real(prog, numOptions, options);
+  if (oldLocale != (locale_t) 0)
+    uselocale(oldLocale);
+  return result;
+}
+
+nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
+                                int numOptions,
+                                const char * const *options) {
+  auto fn = reinterpret_cast<decltype(&nvrtcCompileProgram)>(getNVRTCLibrary().sym(__func__));
+  if (!fn)
+    throw std::runtime_error("Can't get nvrtcCompileProgram");
+  lazyNVRTC.nvrtcCompileProgram_real = fn;
+  fn = &nvrtcCompileProgram_wrapped;
+  lazyNVRTC.nvrtcCompileProgram = fn;
+  return fn(prog, numOptions, options);
+}
+
 NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
 NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
 NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
@@ -150,7 +174,6 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
 NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
 #endif
-NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
 _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
 NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
 NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
index 574b2c41c26..4ddc5316dad 100644
--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
@@ -120,6 +120,8 @@ extern "C" typedef struct NVRTC {
 #define CREATE_MEMBER(name) decltype(&name) name;
   AT_FORALL_NVRTC(CREATE_MEMBER)
 #undef CREATE_MEMBER
+  // Must be at end!
+  decltype(nvrtcCompileProgram) nvrtcCompileProgram_real;
 } NVRTC;

 extern "C" TORCH_CUDA_CPP_API NVRTC* load_nvrtc();
diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h
index 9d9582d34b6..562a653a67a 100644
--- a/caffe2/cuda_rtc/common_rtc.h
+++ b/caffe2/cuda_rtc/common_rtc.h
@@ -1,6 +1,7 @@
 #ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_
 #define CAFFE2_CUDA_RTC_COMMON_RTC_H_

+#include <locale.h>
 #include <sstream>
 #include <string>

@@ -46,7 +47,10 @@ class CudaRTCFunction {
     // coding it?
     const char* nvrtc_opts[] = {
         "--gpu-architecture=compute_35", "--use_fast_math"};
+    locale_t oldLocale = uselocale((locale_t) 0);
     nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
+    if (oldLocale != (locale_t) 0)
+      uselocale(oldLocale);
     if (compile_result != NVRTC_SUCCESS) {
       size_t log_size;
       NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
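To make the failure mode concrete, here is a minimal Python sketch (illustrative only, not part of the patch; `broken_call` is a made-up placeholder): with `text=True`, `subprocess.run` decodes output using the locale's preferred encoding, so once the locale has been reset to ASCII, UTF-8 bytes in a program's output raise exactly the `UnicodeDecodeError` quoted above. The save-and-restore idea of the C fix looks like this with Python's `locale` module:

```python
import locale

# subprocess.run(..., text=True) decodes output with this encoding, which is
# why a locale reset to ASCII breaks decoding of UTF-8 bytes:
print(locale.getpreferredencoding(False))  # e.g. 'UTF-8' (or 'ANSI_X3.4-1968' after the bug)

def broken_call():
    # Placeholder for a call that clobbers the locale, like nvrtcCompileProgram.
    locale.setlocale(locale.LC_ALL, "C")

# Same save-and-restore pattern the patch applies in C via uselocale():
saved = locale.setlocale(locale.LC_ALL)     # query the current locale without changing it
try:
    broken_call()
finally:
    locale.setlocale(locale.LC_ALL, saved)  # restore so later output decodes correctly
```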
19 changes: 19 additions & 0 deletions
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-with_temp_dir-decorator.patch
The decorator fails to pass the argument list to the wrapped function.
This then fails e.g. test/distributed/checkpoint/test_fsdp_optim_state.py with
> TypeError: FsdpOptimStateCheckpoint.test_load_sharded_optimizer_state_dict() missing 1 required positional argument: 'pass_planner'

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py
index 1a6e43a038c..52f79b37bfd 100644
--- a/torch/testing/_internal/distributed/checkpoint_utils.py
+++ b/torch/testing/_internal/distributed/checkpoint_utils.py
@@ -31,7 +31,7 @@ def with_temp_dir(
         self.temp_dir = object_list[0]

         try:
-            func(self)
+            func(self, *args, **kwargs)
         finally:
             if dist.get_rank() == 0:
                 shutil.rmtree(self.temp_dir, ignore_errors=True)
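For context, a minimal sketch of the corrected decorator pattern (a hypothetical `with_setup` helper, not the actual `checkpoint_utils` code): a wrapper that accepts `*args, **kwargs` but does not forward them silently breaks every decorated test method that takes extra parameters, such as `pass_planner` above.

```python
import functools

def with_setup(func):
    # Hypothetical decorator mirroring the fix in with_temp_dir.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # ... set up shared state here, e.g. self.temp_dir ...
        try:
            return func(self, *args, **kwargs)  # forward all arguments, as in the patch
        finally:
            pass  # ... tear down shared state here ...
    return wrapper
```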
28 changes: 28 additions & 0 deletions
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch
From 1d1308649298caf8884970fc57ed82a2d8ea6079 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Tue, 26 Dec 2023 17:48:11 -0800
Subject: [PATCH] [BE] force DTensorTestBase.build_device_mesh to use
 world_size rather than NUM_DEVICES constant (#116439)

**Test**:
`python test/distributed/fsdp/test_shard_utils.py -k test_create_chunk_dtensor`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116439
Approved by: https://github.com/wanchaol
---
 torch/testing/_internal/distributed/_tensor/common_dtensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
index ab86ecd1616a74f..05a3c0872878965 100644
--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -192,7 +192,7 @@ def backend(self) -> str:
         return PG_BACKEND

     def build_device_mesh(self) -> DeviceMesh:
-        return DeviceMesh(DEVICE_TYPE, list(range(NUM_DEVICES)))
+        return DeviceMesh(DEVICE_TYPE, list(range(self.world_size)))

     def init_pg(self) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
33 changes: 33 additions & 0 deletions
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch
test_fsdp_tp_checkpoint_integration in distributed/fsdp/test_fsdp_tp_integration.py
fails due to a regression. See https://github.com/pytorch/pytorch/issues/101162

> RuntimeError: Error(s) in loading state_dict for FullyShardedDataParallel:
> size mismatch for _fsdp_wrapped_module.net1.weight: copying a param with shape torch.Size([4, 5]) from checkpoint, the shape in current model is torch.Size([8, 5]).
> size mismatch for _fsdp_wrapped_module.net1.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([8]).
> size mismatch for _fsdp_wrapped_module.net2.weight: copying a param with shape torch.Size([4, 4]) from checkpoint, the shape in current model is torch.Size([4, 8]).

Skip the test. This should already be fixed in 2.2.x.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
index bc7a4aef4a3..aea16a1f1fb 100644
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
@@ -3,6 +3,7 @@ import copy
 import sys
 from collections import OrderedDict
 from typing import Any, Dict, List, Optional, Tuple
+import unittest

 import torch
 from torch import distributed as dist
@@ -306,7 +307,7 @@ class TestTPFSDPIntegration(FSDPTest):
         tp_fsdp_out = tp_fsdp_model(inp)
         self.assertEqual(fsdp_out, tp_fsdp_out)

-    @skip_if_lt_x_gpu(4)
+    @unittest.skip("Known failure: #101162")
     def test_fsdp_tp_checkpoint_integration(self):
         """Tests checkpointing for TP + FSDP integration."""
         self.assertTrue(