Skip to content

Commit 7958427

Browse files
committed
feat: add optional --include-submodules flag to CLI and ingestion
- Adds --include-submodules CLI flag to control submodule analysis - Propagates include_submodules through ingestion, schemas, and clone logic - Updates tests to cover submodule inclusion - Adds a helper function (_checkout_partial_clone) to avoid repetition - Web UI for this option is not implemented for now (#313 (comment))
1 parent 0fcf8a9 commit 7958427

File tree

7 files changed

+93
-9
lines changed

7 files changed

+93
-9
lines changed

src/gitingest/cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class _CLIArgs(TypedDict):
2020
include_pattern: tuple[str, ...]
2121
branch: str | None
2222
include_gitignored: bool
23+
include_submodules: bool
2324
token: str | None
2425
output: str | None
2526

@@ -47,6 +48,12 @@ class _CLIArgs(TypedDict):
4748
default=False,
4849
help="Include files matched by .gitignore and .gitingestignore",
4950
)
51+
@click.option(
52+
"--include-submodules",
53+
is_flag=True,
54+
help="Include repository's submodules in the analysis",
55+
default=False,
56+
)
5057
@click.option(
5158
"--token",
5259
"-t",
@@ -106,6 +113,7 @@ async def _async_main(
106113
include_pattern: tuple[str, ...] | None = None,
107114
branch: str | None = None,
108115
include_gitignored: bool = False,
116+
include_submodules: bool = False,
109117
token: str | None = None,
110118
output: str | None = None,
111119
) -> None:
@@ -129,6 +137,9 @@ async def _async_main(
129137
Git branch to ingest. If ``None``, the repository's default branch is used.
130138
include_gitignored : bool
131139
If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``).
140+
include_submodules : bool
141+
If ``True``, recursively include and analyze all Git submodules within the repository.
142+
Set to ``False`` to ignore submodules during analysis (default is ``False``).
132143
token : str | None
133144
GitHub personal access token (PAT) for accessing private repositories.
134145
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -160,6 +171,7 @@ async def _async_main(
160171
include_patterns=include_patterns,
161172
exclude_patterns=exclude_patterns,
162173
branch=branch,
174+
include_submodules=include_submodules,
163175
output=output_target,
164176
include_gitignored=include_gitignored,
165177
token=token,

src/gitingest/clone.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4949
branch: str | None = config.branch
5050
tag: str | None = config.tag
5151
partial_clone: bool = config.subpath != "/"
52+
include_submodules: bool = config.include_submodules
5253

5354
# Create parent directory if it doesn't exist
5455
await ensure_directory(Path(local_path).parent)
@@ -63,7 +64,8 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
6364
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6465

6566
clone_cmd += ["clone", "--single-branch"]
66-
# TODO: Re-enable --recurse-submodules when submodule support is needed
67+
if include_submodules:
68+
clone_cmd += ["--recurse-submodules"]
6769

6870
if partial_clone:
6971
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -86,15 +88,36 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
8688

8789
# Checkout the subpath if it is a partial clone
8890
if partial_clone:
89-
subpath = config.subpath.lstrip("/")
90-
if config.blob:
91-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
92-
subpath = str(Path(subpath).parent.as_posix())
93-
94-
checkout_cmd = create_git_command(["git"], local_path, url, token)
95-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
91+
await _checkout_partial_clone(config, token)
9692

9793
# Checkout the commit if it is provided
9894
if commit:
9995
checkout_cmd = create_git_command(["git"], local_path, url, token)
10096
await run_command(*checkout_cmd, "checkout", commit)
97+
98+
99+
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
    """Configure sparse-checkout for a partial clone.

    Derives the sparse-checkout path from ``config.subpath`` and runs
    ``git sparse-checkout set <subpath>`` through ``run_command``. The function is a
    coroutine so that the command has actually completed by the time the caller's
    ``await`` returns, and so that the ``-> None`` annotation is accurate.

    Parameters
    ----------
    config : CloneConfig
        The clone configuration; ``subpath``, ``blob``, ``local_path`` and ``url`` are read.
    token : str | None
        Access token forwarded to ``create_git_command`` when building the git command.

    Returns
    -------
    None

    """
    # Sparse-checkout paths are relative to the repository root, so drop the leading slash.
    subpath = config.subpath.lstrip("/")
    if config.blob:
        # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
        subpath = str(Path(subpath).parent.as_posix())

    checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
    await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)

src/gitingest/entrypoint.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ async def ingest_async(
2727
branch: str | None = None,
2828
tag: str | None = None,
2929
include_gitignored: bool = False,
30+
include_submodules: bool = False,
3031
token: str | None = None,
3132
output: str | None = None,
3233
) -> tuple[str, str, str]:
@@ -52,6 +53,8 @@ async def ingest_async(
5253
The tag to clone and ingest. If ``None``, no tag is used.
5354
include_gitignored : bool
5455
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
56+
include_submodules : bool
57+
If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``).
5558
token : str | None
5659
GitHub personal access token (PAT) for accessing private repositories.
5760
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -86,6 +89,8 @@ async def ingest_async(
8689
if query.url:
8790
_override_branch_and_tag(query, branch=branch, tag=tag)
8891

92+
query.include_submodules = include_submodules
93+
8994
async with _clone_repo_if_remote(query, token=token):
9095
summary, tree, content = ingest_query(query)
9196
await _write_output(tree, content=content, target=output)
@@ -101,6 +106,7 @@ def ingest(
101106
branch: str | None = None,
102107
tag: str | None = None,
103108
include_gitignored: bool = False,
109+
include_submodules: bool = False,
104110
token: str | None = None,
105111
output: str | None = None,
106112
) -> tuple[str, str, str]:
@@ -126,6 +132,8 @@ def ingest(
126132
The tag to clone and ingest. If ``None``, no tag is used.
127133
include_gitignored : bool
128134
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
135+
include_submodules : bool
136+
If ``True``, recursively include and analyze all Git submodules within the repository (default: ``False``).
129137
token : str | None
130138
GitHub personal access token (PAT) for accessing private repositories.
131139
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -156,6 +164,7 @@ def ingest(
156164
branch=branch,
157165
tag=tag,
158166
include_gitignored=include_gitignored,
167+
include_submodules=include_submodules,
159168
token=token,
160169
output=output,
161170
),

src/gitingest/schemas/ingestion.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
@dataclass
14-
class CloneConfig:
14+
class CloneConfig: # pylint: disable=too-many-instance-attributes, too-many-arguments
1515
"""Configuration for cloning a Git repository.
1616
1717
This class holds the necessary parameters for cloning a repository to a local path, including
@@ -33,6 +33,8 @@ class CloneConfig:
3333
The subpath to clone from the repository (default: ``"/"``).
3434
blob: bool
3535
Whether the repository is a blob (default: ``False``).
36+
include_submodules: bool
37+
Whether to clone submodules (default: ``False``).
3638
3739
"""
3840

@@ -43,6 +45,7 @@ class CloneConfig:
4345
tag: str | None = None
4446
subpath: str = "/"
4547
blob: bool = False
48+
include_submodules: bool = False
4649

4750

4851
class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
@@ -78,6 +81,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
7881
The patterns to ignore (default: ``set()``).
7982
include_patterns : set[str] | None
8083
The patterns to include.
84+
include_submodules : bool
85+
Whether to include Git submodules in the analysis (default: ``False``).
8186
8287
"""
8388

@@ -95,6 +100,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
95100
max_file_size: int = Field(default=MAX_FILE_SIZE)
96101
ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type
97102
include_patterns: set[str] | None = None
103+
include_submodules: bool = False
98104

99105
def extract_clone_config(self) -> CloneConfig:
100106
"""Extract the relevant fields for the CloneConfig object.
@@ -122,6 +128,7 @@ def extract_clone_config(self) -> CloneConfig:
122128
tag=self.tag,
123129
subpath=self.subpath,
124130
blob=self.type == "blob",
131+
include_submodules=self.include_submodules,
125132
)
126133

127134
def ensure_url(self) -> None:

tests/query_parser/test_git_host_agnostic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ async def test_parse_query_without_host(
6868
"commit": None,
6969
"max_file_size": 50,
7070
"include_patterns": None,
71+
"include_submodules": False,
7172
}
7273

7374
assert actual == expected

tests/test_cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@
3131
True,
3232
id="custom-options",
3333
),
34+
pytest.param(
35+
[
36+
"./",
37+
"--output",
38+
str(OUTPUT_FILE_NAME),
39+
"--include-submodules",
40+
],
41+
True,
42+
id="with-include-submodules",
43+
),
3444
],
3545
)
3646
def test_cli_writes_file(

tests/test_clone.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,3 +414,25 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non
414414
)
415415

416416
assert run_command_mock.call_count == expected_call_count
417+
418+
419+
@pytest.mark.asyncio
async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None:
    """Verify that submodule inclusion is forwarded to ``git clone``.

    Given a ``CloneConfig`` built with ``include_submodules=True``:
    When `clone_repo` is awaited,
    Then at least one recorded command contains both ``clone`` and ``--recurse-submodules``.
    """
    config_with_submodules = CloneConfig(
        url=DEMO_URL,
        local_path=LOCAL_REPO_PATH,
        branch="main",
        include_submodules=True,
    )

    await clone_repo(config_with_submodules)

    # Each recorded call's positional arguments are the individual tokens of one git command.
    recursed_clone_seen = any(
        "clone" in recorded[0] and "--recurse-submodules" in recorded[0]
        for recorded in run_command_mock.call_args_list
    )
    assert recursed_clone_seen, "--recurse-submodules not found in git clone command when include_submodules=True"

0 commit comments

Comments
 (0)