From 11a3205c1caabd50ae33563140fada6ad7ae4e4c Mon Sep 17 00:00:00 2001 From: Matthew Grange Date: Tue, 24 Mar 2026 14:14:57 -0700 Subject: [PATCH] Pin tree-sitter to 0.20.4 for OSS and internal (#117) Summary: ## Problem The tree-sitter 0.25.0 API changed `Language.__init__` to require a `name` parameter, breaking our code that passes only a capsule pointer. ## Solution - Pin tree-sitter to 0.20.4 in both PACKAGE files and pyproject.toml - Add `_language_from_capsule()` compatibility shim that extracts the raw pointer from the PyCapsule returned by tree-sitter-python/cpp language packages and constructs a Language object compatible with 0.20.4 - Switch from `Parser(language)` to `Parser()` + `parser.set_language()` (0.20.4 API) Differential Revision: D98013757 --- .../code_similarity/py_tree_sitter_attack.py | 30 +++++++++++++++++-- pyproject.toml | 6 ++-- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/privacy_guard/attacks/code_similarity/py_tree_sitter_attack.py b/privacy_guard/attacks/code_similarity/py_tree_sitter_attack.py index fe67498..267d5c2 100644 --- a/privacy_guard/attacks/code_similarity/py_tree_sitter_attack.py +++ b/privacy_guard/attacks/code_similarity/py_tree_sitter_attack.py @@ -14,6 +14,7 @@ # pyre-strict +import ctypes import logging from types import ModuleType from typing import Any @@ -25,6 +26,9 @@ CodeSimilarityAnalysisInput, ) from privacy_guard.attacks.base_attack import BaseAttack + +# pyre-ignore[21]: Parser is re-exported from tree_sitter.binding (C extension) +# but Pyre cannot resolve the binding module's .pyi stub in this Buck config. from tree_sitter import ( # @manual=fbsource//third-party/pypi/tree-sitter:tree-sitter Language, Parser, @@ -43,7 +47,26 @@ } -def _get_parser(language: str) -> Parser: +def _language_from_capsule(ts_module: ModuleType) -> Language: + """Create a tree-sitter Language from a language module's capsule. + + tree-sitter 0.20.4 expects ``Language(library_path, name)`` but the + modern language packages (tree-sitter-python, tree-sitter-cpp) expose + a ``language()`` function returning a PyCapsule. We extract the raw + pointer from the capsule and construct a Language-compatible object. + """ + capsule = ts_module.language() # type: ignore[attr-defined] + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] + language_id: ctypes.c_void_p = ctypes.pythonapi.PyCapsule_GetPointer( + capsule, b"tree_sitter.Language" + ) + lang = Language.__new__(Language) + lang.language_id = language_id # type: ignore[attr-defined] + return lang + + +def _get_parser(language: str) -> Parser: # pyre-ignore[11] """Create a tree-sitter Parser for the given language. Args: @@ -63,8 +86,9 @@ def _get_parser(language: str) -> Parser: f"Supported: {sorted(_LANGUAGE_REGISTRY.keys())}" ) - ts_language = Language(ts_module.language()) # type: ignore[attr-defined] - parser = Parser(ts_language) + ts_language = _language_from_capsule(ts_module) + parser = Parser() # pyre-ignore[16] + parser.set_language(ts_language) return parser diff --git a/pyproject.toml b/pyproject.toml index 9c64fe6..5b7fff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,9 +41,9 @@ dependencies = [ 'later', 'torchvision', 'matplotlib', - 'tree-sitter', - 'tree-sitter-python', - 'tree-sitter-cpp', + 'tree-sitter==0.20.4', + 'tree-sitter-python<=0.23.2', + 'tree-sitter-cpp<=0.23.4', 'zss', ]