From b39b5a7dd2ec6828aaf96d8ff5778cf48446dbb0 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 29 May 2026 15:41:13 +0000 Subject: [PATCH] Handle repo patch with non-UTF8 sequences Git text diff is not actually text in the usual sense -- it may contain any nonprintable bytes. But since we send patches as files (multipart/form-data), not as JSON strings, we don't need to decode them at all. In addition, the missing --binary flag was added. Without this flag, modified binary files were effectively excluded from the patch with "Binary files a and b differ". Fixes: https://github.com/dstackai/dstack/issues/3880 --- .../_internal/core/models/repos/remote.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/dstack/_internal/core/models/repos/remote.py b/src/dstack/_internal/core/models/repos/remote.py index d7ffdea7d9..f613e18221 100644 --- a/src/dstack/_internal/core/models/repos/remote.py +++ b/src/dstack/_internal/core/models/repos/remote.py @@ -58,7 +58,7 @@ class RemoteRepoInfo( class RemoteRunRepoData(RemoteRepoInfo): repo_branch: Optional[str] = None repo_hash: Optional[str] = None - repo_diff: Annotated[Optional[str], Field(exclude=True)] = None + repo_diff: Annotated[Optional[bytes], Field(exclude=True)] = None repo_config_name: Optional[str] = None repo_config_email: Optional[str] = None @@ -183,13 +183,15 @@ def __init__( def has_code_to_write(self) -> bool: # repo_diff is: # * None for RemoteRepo.from_url() - # * an empty string for RemoteRepo.from_dir() if there are no changes ("clean" state) - # * a non-empty string for RemoteRepo.from_dir() if there are changes ("dirty" state) + # * empty bytes for RemoteRepo.from_dir() if there are no changes ("clean" state) + # and no untracked files + # * non-empty bytes for RemoteRepo.from_dir() if there are changes ("dirty" state) + # and/or untracked files return bool(self.run_repo_data.repo_diff) def write_code_file(self, fp: BinaryIO) -> str: if 
self.run_repo_data.repo_diff is not None: - fp.write(self.run_repo_data.repo_diff.encode()) + fp.write(self.run_repo_data.repo_diff) return get_sha256(fp) def get_repo_info(self) -> RemoteRepoInfo: @@ -238,7 +240,7 @@ def __init__(self, warning_time: float, delay: float = 5): self.delay = delay self.warned = False self.start_time = time.monotonic() - self.buffer = io.StringIO() + self.buffer = io.BytesIO() def timeout(self): now = time.monotonic() @@ -256,9 +258,9 @@ def timeout(self): ) def write(self, v: bytes): - self.buffer.write(v.decode()) + self.buffer.write(v) - def get(self) -> str: + def get(self) -> bytes: if self.warned: print() return self.buffer.getvalue() @@ -366,10 +368,10 @@ def _interactive_git_proc( continue -def _repo_diff_verbose(repo: git.Repo, repo_hash: str, warning_time: float = 5) -> str: +def _repo_diff_verbose(repo: git.Repo, repo_hash: str, warning_time: float = 5) -> bytes: collector = _DiffCollector(warning_time) try: - _interactive_git_proc(repo.git.diff(repo_hash, as_process=True), collector) + _interactive_git_proc(repo.git.diff(repo_hash, binary=True, as_process=True), collector) for filename in repo.untracked_files: _interactive_git_proc( repo.git.diff("/dev/null", filename, no_index=True, binary=True, as_process=True),