Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions __tests__/extraction.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3515,6 +3515,87 @@ describe('Nested non-submodule git repos', () => {
expect(files).toContain('sub_repo/src/real.ts');
expect(files).not.toContain('sub_repo/src/generated.ts');
});

it('should index an embedded repo the super-repo .gitignore excludes', async () => {
  const childProcess = await import('child_process');
  // Run a git subcommand in `cwd`, capturing output so it stays out of the test log.
  const runGit = (cwd: string, ...args: string[]) =>
    childProcess.execFileSync('git', args, { cwd, stdio: 'pipe' });

  const superRepo = path.join(tempDir, 'root');
  const embeddedRepo = path.join(superRepo, 'sub_repo');
  const embeddedSrc = path.join(embeddedRepo, 'src');

  // Super-repo: initialised, given an identity, and told to ignore the nested
  // clone so that its own `git status` stays clean.
  fs.mkdirSync(superRepo, { recursive: true });
  runGit(superRepo, 'init', '-q');
  runGit(superRepo, 'config', 'user.email', 'test@test.com');
  runGit(superRepo, 'config', 'user.name', 'Test');
  fs.writeFileSync(path.join(superRepo, '.gitignore'), 'sub_repo/\n');

  // Embedded repo: an independent `git init` containing one source file.
  fs.mkdirSync(embeddedSrc, { recursive: true });
  runGit(embeddedRepo, 'init', '-q');
  fs.writeFileSync(path.join(embeddedSrc, 'hidden.ts'), 'export const hidden = 1;');

  const indexed = scanDirectory(superRepo);

  // The super-repo does not track it, yet it is a repo in its own right and
  // therefore must be indexed.
  expect(indexed).toContain('sub_repo/src/hidden.ts');
});

it('should index repos inside an ignored wrapper directory', async () => {
  const childProcess = await import('child_process');
  // Run a git subcommand in `cwd`, capturing output so it stays out of the test log.
  const runGit = (cwd: string, ...args: string[]) =>
    childProcess.execFileSync('git', args, { cwd, stdio: 'pipe' });

  const superRepo = path.join(tempDir, 'root');
  fs.mkdirSync(superRepo, { recursive: true });
  runGit(superRepo, 'init', '-q');
  runGit(superRepo, 'config', 'user.email', 'test@test.com');
  runGit(superRepo, 'config', 'user.name', 'Test');
  // `services/` is a plain wrapper directory (not itself a repo) that holds
  // sibling clones; the super-repo ignores that entire subtree.
  fs.writeFileSync(path.join(superRepo, '.gitignore'), 'services/\n');

  // Create services/<service>/ as its own git repo with a single file under src/.
  const addServiceRepo = (service: string, fileName: string, contents: string): void => {
    const repoDir = path.join(superRepo, 'services', service);
    const srcDir = path.join(repoDir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });
    runGit(repoDir, 'init', '-q');
    fs.writeFileSync(path.join(srcDir, fileName), contents);
  };
  addServiceRepo('wallet', 'wallet.ts', 'export const wallet = 1;');
  addServiceRepo('balance', 'balance.ts', 'export const balance = 1;');

  const indexed = scanDirectory(superRepo);

  expect(indexed).toContain('services/wallet/src/wallet.ts');
  expect(indexed).toContain('services/balance/src/balance.ts');
});

it("should still honor a hidden embedded repo's own .gitignore", async () => {
  const childProcess = await import('child_process');
  // Run a git subcommand in `cwd`, capturing output so it stays out of the test log.
  const runGit = (cwd: string, ...args: string[]) =>
    childProcess.execFileSync('git', args, { cwd, stdio: 'pipe' });

  const superRepo = path.join(tempDir, 'root');
  const embeddedRepo = path.join(superRepo, 'sub_repo');
  const embeddedSrc = path.join(embeddedRepo, 'src');

  // Super-repo that hides the nested clone through its own .gitignore.
  fs.mkdirSync(superRepo, { recursive: true });
  runGit(superRepo, 'init', '-q');
  runGit(superRepo, 'config', 'user.email', 'test@test.com');
  runGit(superRepo, 'config', 'user.name', 'Test');
  fs.writeFileSync(path.join(superRepo, '.gitignore'), 'sub_repo/\n');

  fs.mkdirSync(embeddedSrc, { recursive: true });
  runGit(embeddedRepo, 'init', '-q');
  // The embedded repo carries its OWN .gitignore. Being hidden by the
  // super-repo must not disable those internal ignore rules.
  fs.writeFileSync(path.join(embeddedRepo, '.gitignore'), 'src/generated.ts\n');
  fs.writeFileSync(path.join(embeddedSrc, 'real.ts'), 'export const real = 1;');
  fs.writeFileSync(path.join(embeddedSrc, 'generated.ts'), 'export const generated = 1;');

  const indexed = scanDirectory(superRepo);

  expect(indexed).toContain('sub_repo/src/real.ts');
  expect(indexed).not.toContain('sub_repo/src/generated.ts');
});
});

// =============================================================================
Expand Down
110 changes: 109 additions & 1 deletion src/extraction/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,98 @@ function collectGitFiles(repoDir: string, prefix: string, files: Set<string>): v
}
}

/**
 * Maximum depth to descend through an *ignored* non-repo directory while hunting
 * for embedded git repos. Wrapper dirs that hold sibling clones (e.g.
 * `services/<name>/`) keep the repos a level or two down; a small cap avoids
 * walking deep into large ignored artifact trees that contain no repo at all.
 *
 * Depth is counted from the ignored directory reported by git, which is
 * depth 0. findEmbeddedReposUnder checks a directory for `.git` before it
 * compares `depth` against this cap, so a repo sitting exactly this many
 * levels below the ignored directory is still detected; anything deeper is
 * never visited.
 */
const IGNORED_EMBEDDED_REPO_MAX_DEPTH = 4;

/**
 * Search an ignored directory for embedded git repositories.
 *
 * When `absDir` contains a `.git` entry it is treated as a repository and
 * handed to collectGitFiles, which is expected to apply that repo's own
 * .gitignore and handle any repos nested inside it. Otherwise the search
 * continues into its subdirectories, up to IGNORED_EMBEDDED_REPO_MAX_DEPTH
 * levels, because wrapper directories such as `services/` typically hold the
 * real repos one level down (`services/<name>/`).
 *
 * @param absDir  Absolute path of the directory to inspect.
 * @param prefix  Path prefix (ending in `/`) passed to collectGitFiles for
 *                files found under `absDir`.
 * @param files   Accumulator that collectGitFiles adds discovered files to.
 * @param visited Real paths already inspected; guards against revisiting the
 *                same directory through different routes.
 * @param depth   Levels descended so far from the ignored directory.
 */
function findEmbeddedReposUnder(
  absDir: string,
  prefix: string,
  files: Set<string>,
  visited: Set<string>,
  depth: number
): void {
  let canonicalDir: string;
  try {
    canonicalDir = fs.realpathSync(absDir);
  } catch {
    // Unreadable path or dangling symlink: nothing to search.
    return;
  }
  if (visited.has(canonicalDir)) {
    return;
  }
  visited.add(canonicalDir);

  const isGitRepo = fs.existsSync(path.join(absDir, '.git'));
  if (isGitRepo) {
    collectGitFiles(absDir, prefix, files);
    return;
  }

  // Not a repo itself; only keep descending while under the depth cap.
  if (depth >= IGNORED_EMBEDDED_REPO_MAX_DEPTH) {
    return;
  }

  let children: fs.Dirent[];
  try {
    children = fs.readdirSync(absDir, { withFileTypes: true });
  } catch {
    return;
  }

  // A Dirent describes the link itself, so a symlink to a directory reports
  // isDirectory() === false. Filtering on it therefore skips links too, which
  // avoids cycles. Hidden tooling dirs and dependency/build dirs are never
  // searched.
  const searchable = children.filter(
    (child) =>
      child.isDirectory() &&
      !child.name.startsWith('.') &&
      !DEFAULT_IGNORE_DIRS.has(child.name)
  );
  for (const child of searchable) {
    const childDir = path.join(absDir, child.name);
    const childPrefix = prefix + child.name + '/';
    findEmbeddedReposUnder(childDir, childPrefix, files, visited, depth + 1);
  }
}

/**
 * Find embedded git repos that the super-repo's own .gitignore hides.
 *
 * collectGitFiles only recurses into embedded repos that surface as *untracked*
 * entries (`git ls-files -o`). When the super-repo's .gitignore excludes the
 * nested clone — common when a workspace holds independent repos it doesn't want
 * cluttering its own `git status` — the repo never appears there and its source
 * is invisible. List ignored directories explicitly and recurse into any that is,
 * or contains, a git repo. Each embedded repo still honors its OWN .gitignore.
 * (See issue #193 for the untracked case this complements.)
 *
 * @param rootDir Directory of the super-repo; `git ls-files` runs with this as cwd.
 * @param files   Accumulator that receives files found in embedded repos.
 * @param visited Real paths already inspected, shared with findEmbeddedReposUnder.
 */
function collectIgnoredEmbeddedRepos(rootDir: string, files: Set<string>, visited: Set<string>): void {
  let output: string;
  try {
    output = execFileSync(
      'git',
      // `-z` makes git emit NUL-terminated, *unquoted* paths. Without it, any
      // path containing non-ASCII or special characters is C-quoted
      // ("caf\303\251/"), which no longer ends in "/" and would be skipped.
      ['ls-files', '-o', '-i', '--exclude-standard', '--directory', '-z'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
    );
  } catch {
    return; // not a git repo or git failure — nothing to add
  }
  // Entries are used verbatim (no trim) so names with leading or trailing
  // whitespace survive. The empty string after the final NUL fails the
  // trailing-slash test below and is skipped naturally.
  for (const entry of output.split('\0')) {
    // `--directory` collapses each ignored directory to a single "dir/" entry;
    // ignored files (no trailing slash) can never be a repo, so skip them.
    if (!entry.endsWith('/')) continue;
    const name = path.basename(entry.replace(/\/+$/, ''));
    // Never hunt inside hidden tooling dirs or dependency/build dirs.
    if (name.startsWith('.') || DEFAULT_IGNORE_DIRS.has(name)) continue;
    findEmbeddedReposUnder(path.join(rootDir, entry), normalizePath(entry), files, visited, 0);
  }
}

/**
* Get all files visible to git (tracked + untracked but not ignored).
* Respects .gitignore at all levels (root, subdirectories) and descends into
Expand Down Expand Up @@ -265,7 +357,23 @@ function getGitVisibleFiles(rootDir: string): Set<string> | null {
// committing a dependency/build dir doesn't make it project code. A
// `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
const ig = buildDefaultIgnore(rootDir);
return new Set([...files].filter((f) => !ig.ignores(f)));
const visible = new Set([...files].filter((f) => !ig.ignores(f)));

// Embedded git repos hidden by the super-repo's own .gitignore are
// independent projects, not third-party noise — index them too. Their files
// are filtered ONLY by the built-in defaults (node_modules, …), NOT the
// super-repo's .gitignore: that exclusion is the parent's tracking hygiene,
// and each embedded repo already honored its own .gitignore in collectGitFiles.
const embedded = new Set<string>();
collectIgnoredEmbeddedRepos(rootDir, embedded, new Set<string>());
if (embedded.size > 0) {
const defaultsOnly = ignore().add(DEFAULT_IGNORE_PATTERNS);
for (const f of embedded) {
if (!defaultsOnly.ignores(f)) visible.add(f);
}
}

return visible;
} catch {
return null;
}
Expand Down