### Step 1: Parse GitHub URL

In [21]:
from urllib.parse import urlparse
import re


def parse_github_url(url: str) -> tuple[str, str]:
    """Extract owner and repo name from a GitHub URL.
    
    Supports formats:
      - https://github.com/owner/repo
      - https://github.com/owner/repo.git
      - https://github.com/owner/repo/tree/branch/...
    """
    parsed = urlparse(url)

    if parsed.hostname not in ("github.com", "www.github.com"):
        raise ValueError(f"Not a GitHub URL: {url}")

    parts = parsed.path.strip("/").split("/")

    if len(parts) < 2:
        raise ValueError(f"URL must include owner and repo: {url}")

    owner = parts[0]
    repo = re.sub(r"\.git$", "", parts[1])

    if not owner or not repo:
        raise ValueError(f"Could not extract owner/repo from: {url}")

    return owner, repo


# --- Test it ---
test_urls = [
    "https://github.com/fastapi/fastapi",
    "https://github.com/openai/openai-python.git",
    "https://github.com/torvalds/linux/tree/master/kernel",
]

for url in test_urls:
    owner, repo = parse_github_url(url)
    print(f"{url}  ‚Üí  owner={owner}, repo={repo}")

https://github.com/fastapi/fastapi  ‚Üí  owner=fastapi, repo=fastapi
https://github.com/openai/openai-python.git  ‚Üí  owner=openai, repo=openai-python
https://github.com/torvalds/linux/tree/master/kernel  ‚Üí  owner=torvalds, repo=linux


## Step 2: Clone the Repository

In [22]:
import subprocess
import tempfile
import shutil
from pathlib import Path


def clone_repo(url: str) -> Path:
    """Shallow-clone a GitHub repo into a temp directory. Returns the path."""
    tmp_dir = Path(tempfile.mkdtemp(prefix="repo_"))
    clone_url = f"https://github.com/{owner}/{repo}.git"

    subprocess.run(
        ["git", "clone", "--depth", "1", clone_url, str(tmp_dir)],
        check=True,
        capture_output=True,
        text=True,
    )
    print(f"Cloned to {tmp_dir}")
    return tmp_dir


# --- Test it ---
owner, repo = parse_github_url("https://github.com/fastapi/fastapi")
repo_path = clone_repo(f"https://github.com/{owner}/{repo}")
print(f"Repo cloned at: {repo_path}")

Cloned to /var/folders/4j/pcs4np_543s8qg7l5jf9qnvh0000gn/T/repo_rvwk3rz0
Repo cloned at: /var/folders/4j/pcs4np_543s8qg7l5jf9qnvh0000gn/T/repo_rvwk3rz0


## Step 3: Filter Relevant Files

In [23]:
import os

SKIP_DIRS = {
    "node_modules", ".git", "__pycache__", ".venv", "venv", "env",
    "dist", "build", ".next", ".nuxt", "vendor", ".tox", ".mypy_cache",
    ".pytest_cache", "coverage", ".idea", ".vscode", "docs", ".github",
}

SKIP_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
    ".mp4", ".mp3", ".wav", ".mov",
    ".woff", ".woff2", ".ttf", ".eot",
    ".pdf", ".zip", ".tar", ".gz", ".bz2",
    ".lock", ".min.js", ".min.css",
    ".pyc", ".pyo", ".so", ".dll", ".dylib",
    ".DS_Store", ".gitignore",
}

HIGH_PRIORITY_NAMES = {
    "README.md", "readme.md", "README.rst",
    "main.py", "app.py", "index.py", "server.py", "cli.py",
    "index.ts", "index.js", "app.ts", "app.js", "main.ts", "main.js",
    "setup.py", "setup.cfg", "pyproject.toml",
    "package.json", "Cargo.toml", "go.mod",
    "Makefile", "Dockerfile", "docker-compose.yml",
    "requirements.txt",
}

LOW_PRIORITY_DIRS = {"test", "tests", "spec", "specs", "examples", "example", "docs_src", "benchmarks", "scripts"}

MAX_FILE_SIZE = 100_000


def prioritize_files(files: list[Path], repo_path: Path) -> tuple[list[Path], list[Path], list[Path]]:
    """Sort files into priority tiers. Returns (high, medium, low)."""
    high, medium, low = [], [], []

    for f in files:
        rel = f.relative_to(repo_path)
        parts = set(rel.parts)

        if rel.name in HIGH_PRIORITY_NAMES:
            high.append(f)
        elif parts & LOW_PRIORITY_DIRS:
            low.append(f)
        else:
            medium.append(f)

    key = lambda f: (len(f.relative_to(repo_path).parts), str(f))
    high.sort(key=key)
    medium.sort(key=key)
    low.sort(key=key)

    return high, medium, low


def filter_files(repo_path: Path, priority: str = "all") -> list[Path]:
    """Walk the cloned repo, filter, and return priority-sorted file paths.
    
    priority: "all" (default), "high", "high+medium"
    """
    filtered = []

    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if d not in SKIP_DIRS]

        for filename in files:
            filepath = Path(root) / filename

            if any(filename.endswith(ext) for ext in SKIP_EXTENSIONS):
                continue

            if filepath.stat().st_size > MAX_FILE_SIZE:
                continue

            filtered.append(filepath)

    high, medium, low = prioritize_files(filtered, repo_path)

    if priority == "high":
        return high
    elif priority == "high+medium":
        return high + medium
    return high + medium + low


# --- Test it ---
filtered = filter_files(repo_path, priority="high")

print(f"After filtering: {len(filtered)} files")
print("\nHigh priority files:")
for f in filtered:
    rel = f.relative_to(repo_path)
    size = f.stat().st_size
    print(f"  {size:>7} bytes  {rel}")

After filtering: 17 files

High priority files:
    26877 bytes  README.md
    10099 bytes  pyproject.toml
     2101 bytes  fastapi-slim/README.md
      418 bytes  fastapi/cli.py
     4558 bytes  tests/main.py
     1151 bytes  tests/test_validate_response_recursive/app.py
      118 bytes  docs_src/app_testing/app_a_py310/main.py
     1163 bytes  docs_src/app_testing/app_b_an_py310/main.py
     1113 bytes  docs_src/app_testing/app_b_py310/main.py
      112 bytes  docs_src/async_tests/app_a_py310/main.py
      552 bytes  docs_src/bigger_applications/app_an_py310/main.py
      267 bytes  docs_src/settings/app01_py310/main.py
      445 bytes  docs_src/settings/app02_an_py310/main.py
      406 bytes  docs_src/settings/app02_py310/main.py
      451 bytes  docs_src/settings/app03_an_py310/main.py
      412 bytes  docs_src/settings/app03_py310/main.py
      150 bytes  tests/test_modules_same_name_body/app/main.py


## Step 4: Read File Contents

In [24]:
MAX_TOTAL_CHARS = 500_000  # total character budget for all files combined
MAX_FILES = 150            # max number of files to read


def build_directory_tree(repo_path: Path, files: list[Path]) -> str:
    """Build a text representation of the directory structure."""
    lines = ["# Directory Structure", "```"]
    seen_dirs = set()

    for f in files:
        rel = f.relative_to(repo_path)
        # add parent directories we haven't printed yet
        for i in range(1, len(rel.parts)):
            dir_path = Path(*rel.parts[:i])
            if dir_path not in seen_dirs:
                seen_dirs.add(dir_path)
                indent = "  " * (len(dir_path.parts) - 1)
                lines.append(f"{indent}{dir_path.name}/")
        # add the file itself
        indent = "  " * (len(rel.parts) - 1)
        lines.append(f"{indent}{rel.name}")

    lines.append("```")
    return "\n".join(lines)


def read_all_contents(repo_path: Path, files: list[Path]) -> str:
    """Build directory tree + read filtered files into a context string."""
    tree = build_directory_tree(repo_path, files)

    files_to_read = files[:MAX_FILES]
    context_parts = []
    total_chars = len(tree)

    for filepath in files_to_read:
        rel_path = filepath.relative_to(repo_path)

        try:
            content = filepath.read_text(encoding="utf-8", errors="replace")
        except Exception:
            continue

        if total_chars + len(content) > MAX_TOTAL_CHARS:
            print(f"Stopping at {rel_path} ‚Äî character budget reached ({total_chars:,} chars)")
            break

        context_parts.append(f"## File: {rel_path}\n```\n{content}\n```")
        total_chars += len(content)

    print(f"Read {len(context_parts)} files, {total_chars:,} total chars")
    return tree + "\n\n" + "\n\n".join(context_parts)


# --- Test it ---
context = read_all_contents(repo_path, filtered)
print(f"Context size: {len(context)} characters")
print(f"\nContext preview (first 1500 chars):\n{context[:100000]}")

Read 17 files, 50,767 total chars
Context size: 51672 characters

Context preview (first 1500 chars):
# Directory Structure
```
README.md
pyproject.toml
fastapi-slim/
  README.md
fastapi/
  cli.py
tests/
  main.py
  test_validate_response_recursive/
    app.py
docs_src/
  app_testing/
    app_a_py310/
      main.py
    app_b_an_py310/
      main.py
    app_b_py310/
      main.py
  async_tests/
    app_a_py310/
      main.py
  bigger_applications/
    app_an_py310/
      main.py
  settings/
    app01_py310/
      main.py
    app02_an_py310/
      main.py
    app02_py310/
      main.py
    app03_an_py310/
      main.py
    app03_py310/
      main.py
  test_modules_same_name_body/
    app/
      main.py
```

## File: README.md
```
<p align="center">
  <a href="https://fastapi.tiangolo.com"><img src="https://fastapi.tiangolo.com/img/logo-margin/logo-teal.png" alt="FastAPI"></a>
</p>
<p align="center">
    <em>FastAPI framework, high performance, easy to learn, fast to code, ready for produ

## Step 5: Summarize with LLM Agent

In [30]:
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")
client = OpenAI(api_key=api_key)  # reads OPENAI_API_KEY from .env

SYSTEM_PROMPT = """You are a code analyst. Given a repository's directory structure and file contents, produce a clear, human-readable summary.

Your summary should include:
1. **Purpose** ‚Äî What does this project do? (1-2 sentences)
2. **Tech Stack** ‚Äî Languages, frameworks, and key dependencies
3. **Architecture** ‚Äî How is the codebase organized? Key modules/packages
4. **Key Components** ‚Äî The most important files/classes/functions and what they do
5. **Getting Started** ‚Äî How to install and run the project (if discernible from the code)

Keep the summary concise but informative. Focus on what matters most to someone seeing this project for the first time."""


def summarize_repo(context: str) -> str:
    """Send repo context to the LLM and return a summary."""
    response = client.chat.completions.create(
        model="gpt-5-nano-2025-08-07",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Summarize this repository:\n\n{context}"},
        ],
    )
    return response.choices[0].message.content


# --- Test it ---
summary = summarize_repo(context)
print(summary)

Summary of the repository

1) Purpose
- Provides the FastAPI framework source (core library) along with a deprecated fastapi-slim wrapper, a large set of tests, and documentation/examples. It‚Äôs intended to validate features, demonstrate usage, and drive docs/tests for FastAPI.

2) Tech Stack
- Language: Python (3.10‚Äì3.14 compatible per pyproject)
- Core tech: FastAPI, Starlette, Pydantic
- Web server / testing: uvicorn, httpx (tests), pytest
- Documentation/testing tooling: mkdocs, various docs/test helpers in docs_src
- Additional libraries referenced in extras: Jinja2, python-multipart, email-validator, pydantic-settings, etc.

3) Architecture (code organization)
- fastapi/: Core library code (fastapi.cli integration exposed via CLI script). Primary package containing the FastAPI framework components.
- fastapi-slim/: Deprecated wrapper package that depends on fastapi; exists as migration aid.
- tests/: Test harness and example apps used to verify behavior (endpoints with diverse