Skip to content

Commit 28198d6

Browse files
committed
feat: add optional --include-submodules flag to CLI and ingestion
- Adds a --include-submodules CLI flag to control submodule analysis
- Propagates include_submodules through ingestion, schemas, and clone logic
- Updates tests to cover submodule inclusion
- Adds a helper function (_checkout_partial_clone) to avoid repetition
- Web UI for this option is not implemented for now (see the comment on #313)
1 parent f8d397e commit 28198d6

File tree

7 files changed

+96
-8
lines changed

7 files changed

+96
-8
lines changed

src/gitingest/cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class _CLIArgs(TypedDict):
2020
include_pattern: tuple[str, ...]
2121
branch: str | None
2222
include_gitignored: bool
23+
include_submodules: bool
2324
token: str | None
2425
output: str | None
2526

@@ -47,6 +48,12 @@ class _CLIArgs(TypedDict):
4748
default=False,
4849
help="Include files matched by .gitignore and .gitingestignore",
4950
)
51+
@click.option(
52+
"--include-submodules",
53+
is_flag=True,
54+
help="Include repository's submodules in the analysis",
55+
default=False,
56+
)
5057
@click.option(
5158
"--token",
5259
"-t",
@@ -106,6 +113,7 @@ async def _async_main(
106113
include_pattern: tuple[str, ...] | None = None,
107114
branch: str | None = None,
108115
include_gitignored: bool = False,
116+
include_submodules: bool = False,
109117
token: str | None = None,
110118
output: str | None = None,
111119
) -> None:
@@ -129,6 +137,9 @@ async def _async_main(
129137
Git branch to ingest. If ``None``, the repository's default branch is used.
130138
include_gitignored : bool
131139
If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``).
140+
include_submodules : bool
141+
If ``True``, recursively include and analyze all Git submodules within the repository.
142+
Set to ``False`` to ignore submodules during analysis (default is ``False``).
132143
token : str | None
133144
GitHub personal access token (PAT) for accessing private repositories.
134145
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -160,6 +171,7 @@ async def _async_main(
160171
include_patterns=include_patterns,
161172
exclude_patterns=exclude_patterns,
162173
branch=branch,
174+
include_submodules=include_submodules,
163175
output=output_target,
164176
include_gitignored=include_gitignored,
165177
token=token,

src/gitingest/clone.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4848
commit: str | None = config.commit
4949
branch: str | None = config.branch
5050
partial_clone: bool = config.subpath != "/"
51+
include_submodules: bool = config.include_submodules
5152

5253
# Create parent directory if it doesn't exist
5354
await ensure_directory(Path(local_path).parent)
@@ -62,7 +63,8 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
6263
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6364

6465
clone_cmd += ["clone", "--single-branch"]
65-
# TODO: Re-enable --recurse-submodules when submodule support is needed
66+
if include_submodules:
67+
clone_cmd += ["--recurse-submodules"]
6668

6769
if partial_clone:
6870
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -80,15 +82,40 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
8082

8183
# Checkout the subpath if it is a partial clone
8284
if partial_clone:
83-
subpath = config.subpath.lstrip("/")
84-
if config.blob:
85-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
86-
subpath = str(Path(subpath).parent.as_posix())
87-
88-
checkout_cmd = create_git_command(["git"], local_path, url, token)
89-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
85+
await _checkout_partial_clone(config, local_path, url, token)
9086

9187
# Checkout the commit if it is provided
9288
if commit:
9389
checkout_cmd = create_git_command(["git"], local_path, url, token)
9490
await run_command(*checkout_cmd, "checkout", commit)
91+
92+
93+
async def _checkout_partial_clone(config: CloneConfig, local_path: str, url: str, token: str | None) -> None:
    """Configure sparse-checkout for a partial clone.

    Derives the directory to check out from ``config.subpath`` and runs
    ``git sparse-checkout set <subpath>`` through ``run_command``. When the query
    points at a single file (``config.blob`` is true), the file name is dropped so
    that the containing directory is checked out instead.

    This is a coroutine so that the declared ``None`` return type is accurate and
    the git command is awaited inside the helper; callers must ``await`` it.

    Parameters
    ----------
    config : CloneConfig
        The configuration for cloning the repository; ``subpath`` and ``blob`` are read.
    local_path : str
        The local path where the repository has been cloned.
    url : str
        The URL of the repository.
    token : str | None
        Access token forwarded to ``create_git_command``, or ``None`` if no token is used.

    """
    # Sparse-checkout paths are given relative to the repository root.
    subpath = config.subpath.lstrip("/")
    if config.blob:
        # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
        subpath = Path(subpath).parent.as_posix()

    checkout_cmd = create_git_command(["git"], local_path, url, token)
    await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)

src/gitingest/entrypoint.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ async def ingest_async(
2424
exclude_patterns: str | set[str] | None = None,
2525
branch: str | None = None,
2626
include_gitignored: bool = False,
27+
include_submodules: bool = False,
2728
token: str | None = None,
2829
output: str | None = None,
2930
) -> tuple[str, str, str]:
@@ -47,6 +48,8 @@ async def ingest_async(
4748
The branch to clone and ingest (default: the default branch).
4849
include_gitignored : bool
4950
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
51+
include_submodules : bool
52+
If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``).
5053
token : str | None
5154
GitHub personal access token (PAT) for accessing private repositories.
5255
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -80,6 +83,8 @@ async def ingest_async(
8083

8184
if branch:
8285
query.branch = branch
86+
if include_submodules is not None:
87+
query.include_submodules = include_submodules
8388

8489
repo_cloned = False
8590
try:
@@ -104,6 +109,7 @@ def ingest(
104109
exclude_patterns: str | set[str] | None = None,
105110
branch: str | None = None,
106111
include_gitignored: bool = False,
112+
include_submodules: bool = False,
107113
token: str | None = None,
108114
output: str | None = None,
109115
) -> tuple[str, str, str]:
@@ -127,6 +133,8 @@ def ingest(
127133
The branch to clone and ingest (default: the default branch).
128134
include_gitignored : bool
129135
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
136+
include_submodules : bool
137+
If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``).
130138
token : str | None
131139
GitHub personal access token (PAT) for accessing private repositories.
132140
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -156,6 +164,7 @@ def ingest(
156164
exclude_patterns=exclude_patterns,
157165
branch=branch,
158166
include_gitignored=include_gitignored,
167+
include_submodules=include_submodules,
159168
token=token,
160169
output=output,
161170
),

src/gitingest/schemas/ingestion.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class CloneConfig:
3131
The subpath to clone from the repository (default: ``"/"``).
3232
blob: bool
3333
Whether the repository is a blob (default: ``False``).
34+
include_submodules: bool
35+
Whether to clone submodules (default: ``False``).
3436
3537
"""
3638

@@ -40,6 +42,7 @@ class CloneConfig:
4042
branch: str | None = None
4143
subpath: str = "/"
4244
blob: bool = False
45+
include_submodules: bool = False
4346

4447

4548
class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
@@ -73,6 +76,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
7376
The patterns to ignore (default: ``set()``).
7477
include_patterns : set[str] | None
7578
The patterns to include.
79+
include_submodules : bool
80+
The flag whether to include Git submodules in the analysis. (default: ``False``)
7681
7782
"""
7883

@@ -89,6 +94,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
8994
max_file_size: int = Field(default=MAX_FILE_SIZE)
9095
ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type
9196
include_patterns: set[str] | None = None
97+
include_submodules: bool = False
9298

9399
def extract_clone_config(self) -> CloneConfig:
94100
"""Extract the relevant fields for the CloneConfig object.
@@ -115,6 +121,7 @@ def extract_clone_config(self) -> CloneConfig:
115121
branch=self.branch,
116122
subpath=self.subpath,
117123
blob=self.type == "blob",
124+
include_submodules=self.include_submodules,
118125
)
119126

120127
def ensure_url(self) -> None:

tests/query_parser/test_git_host_agnostic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ async def test_parse_query_without_host(
6767
"commit": None,
6868
"max_file_size": 50,
6969
"include_patterns": None,
70+
"include_submodules": False,
7071
}
7172

7273
assert actual == expected

tests/test_cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@
3131
True,
3232
id="custom-options",
3333
),
34+
pytest.param(
35+
[
36+
"./",
37+
"--output",
38+
str(OUTPUT_FILE_NAME),
39+
"--include-submodules",
40+
],
41+
True,
42+
id="with-include-submodules",
43+
),
3444
],
3545
)
3646
def test_cli_writes_file(

tests/test_clone.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,3 +414,25 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non
414414
)
415415

416416
assert run_command_mock.call_count == expected_call_count
417+
418+
419+
@pytest.mark.asyncio
async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None:
    """Verify that ``include_submodules=True`` adds ``--recurse-submodules`` to the clone command.

    Given a valid URL and include_submodules=True:
    When `clone_repo` is called,
    Then at least one recorded ``run_command`` call should contain both ``clone``
    and ``--recurse-submodules`` among its positional arguments.
    """
    clone_config = CloneConfig(
        url=DEMO_URL,
        local_path=LOCAL_REPO_PATH,
        branch="main",
        include_submodules=True,
    )

    await clone_repo(clone_config)

    # Look through the positional arguments of every recorded call for a clone
    # invocation that also carries the submodule flag.
    recorded_positionals = (recorded[0] for recorded in run_command_mock.call_args_list)
    found = any(
        "clone" in positional and "--recurse-submodules" in positional
        for positional in recorded_positionals
    )
    assert found, "--recurse-submodules not found in git clone command when include_submodules=True"

0 commit comments

Comments
 (0)