In [2]:
import io
import zipfile
import requests
import frontmatter


# ---- Function to download and parse GitHub repo ----
def read_repo_data(repo_owner, repo_name, branch="main"):
    """
    Download and parse all markdown (.md, .mdx) files from a GitHub repository.

    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main)

    Returns:
        list: A list of dictionaries with metadata + content
    """

    print(f"Downloading repository: {repo_owner}/{repo_name}")

    base_url = "https://codeload.github.com"
    url = f"{base_url}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"

    # Download repo zip
    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Failed to download repo: HTTP {response.status_code}")

    repository_data = []

    # Open zip file in memory
    zip_file = zipfile.ZipFile(io.BytesIO(response.content))

    for file_info in zip_file.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        # Only process markdown files
        if not (filename_lower.endswith(".md") or filename_lower.endswith(".mdx")):
            continue

        try:
            with zip_file.open(file_info) as f:
                raw_content = f.read().decode("utf-8", errors="ignore")

                # Parse frontmatter
                post = frontmatter.loads(raw_content)
                data = post.to_dict()

                # Add filename for reference
                data["filename"] = filename

                repository_data.append(data)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    zip_file.close()

    print(f"Finished processing {len(repository_data)} documents\n")
    return repository_data

In [3]:
# Ingest documentation from my own GitHub repository
dajuctech_docs = read_repo_data("dajuctech", "ai-engineering-toolkit")

print(f"dajuctech documents: {len(dajuctech_docs)}")

Downloading repository: dajuctech/ai-engineering-toolkit
Finished processing 1 documents

dajuctech documents: 1


In [4]:
## Simple Chunking

dajuctech_chunks = []

for doc in dajuctech_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    dajuctech_chunks.extend(chunks)

NameError: name 'sliding_window' is not defined