Skip to content

Commit

Permalink
Add gitpythonfs fsspec implementation dlt-hub#301
Browse files Browse the repository at this point in the history
  • Loading branch information
deanja committed Dec 26, 2023
1 parent 6681b22 commit 3e21941
Show file tree
Hide file tree
Showing 8 changed files with 2,484 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ readme = "README.md"
packages = [{include = "sources"}]

[tool.poetry.dependencies]
python = "^3.8.1"
python = ">=3.8.1,<3.13"
dlt = {version = "^0.3.23", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb", "s3", "gs"]}
# dlt = {path = "../dlt", develop = true}
gitpythonfs = {path = "./sources/filesystem/gitpythonfs", develop = true}

[tool.poetry.group.dev.dependencies]
mypy = "1.6.1"
Expand Down
6 changes: 6 additions & 0 deletions sources/filesystem/gitpythonfs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# gitpythonfs

Builds on [GitPython](https://gitpython.readthedocs.io/) to provide a Python filesystem interface for git.

The initial use case is to load file contents from git repos into destinations using tools such as [dlt](https://dlthub.com).

2,101 changes: 2,101 additions & 0 deletions sources/filesystem/gitpythonfs/docs/usage.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions sources/filesystem/gitpythonfs/gitpythonfs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Re-export the filesystem class and the registration helper from the core module.
from .core import GitPythonFileSystem, register_implementation_in_fsspec

# Importing this package registers the filesystem's protocol with fsspec as a side effect.
register_implementation_in_fsspec()
165 changes: 165 additions & 0 deletions sources/filesystem/gitpythonfs/gitpythonfs/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from typing import Any, Dict, List, Optional, Union

import git
from fsspec.implementations.memory import MemoryFile
from fsspec.registry import register_implementation
from fsspec.spec import AbstractFileSystem


def register_implementation_in_fsspec() -> None:
    """Dynamically register the filesystem with fsspec.

    This is needed if the implementation is not officially registered in the
    fsspec codebase. It will also override ("clobber") an existing
    implementation having the same protocol. The registration is only valid
    for the current process.
    """
    protocol_name = "gitpythonfs"
    # Dotted path to the class, handed to fsspec as a string rather than as the class object.
    class_path = "gitpythonfs.GitPythonFileSystem"
    missing_package_hint = "Please install gitpythonfs to access GitPythonFileSystem"

    register_implementation(
        protocol_name,
        class_path,
        errtxt=missing_package_hint,
        clobber=True,
    )


class GitPythonFileSystem(AbstractFileSystem):
    """A filesystem for git repositories on the local filesystem.

    An instance of this class provides the files stored in a git repository
    that is already present on the local disk. Contents are read through the
    GitPython library from the repository's trees, not from the working
    directory. You may specify a point in the repo's history with ``ref``;
    when no ref is given, GitPython's default revision is used.

    You can retrieve information such as the date of the last commit that
    touched a file, which would not be possible if looking at the local
    filesystem directly.

    The GitPython library could also be used to clone or update files from a
    remote repo before reading them with this filesystem.
    """

    protocol = "gitpythonfs"

    def __init__(
        self, path: Optional[str] = None, ref: Optional[str] = None, **kwargs: Any
    ) -> None:
        """Initialize a GitPythonFileSystem object.

        Args:
            path (str): Local location of the Git repo; forwarded unchanged
                to ``git.Repo``. When used with a higher level function such
                as fsspec.open(), may be of the form
                "gitpythonfs://[path-to-repo:][ref@]path/to/file" so that repo
                and/or ref can be passed in the URL instead of arguments. (The
                actual file path should not contain "@" or ":"). Examples:
                    When instantiating GitPythonFileSystem:
                        /some_folder/my_repo
                    When calling open(), open_files() etc:
                        gitpythonfs:///some_folder/my_repo:path/to/intro.md
                        gitpythonfs:///some_folder/my_repo:mybranch@path/to/intro.md
            ref (str): Revision to read from. Used by ``ls`` and ``_open``
                whenever they are not given a ``ref`` of their own. ``None``
                leaves the choice of revision to GitPython.
            **kwargs: Forwarded to ``AbstractFileSystem.__init__``.
        """
        super().__init__(**kwargs)
        self.repo_path = path
        self.repo = git.Repo(self.repo_path)
        # Remember the instance-wide revision so that a ref given in the
        # constructor (or extracted from a URL) is actually honoured.
        self.ref = ref

    @classmethod
    def _strip_protocol(cls, path: str) -> str:
        """Reduce a URL or path to the file path inside the repository.

        After the base class has removed the protocol, an optional
        "path-to-repo:" prefix and an optional "ref@" prefix are dropped,
        together with any leading slashes.
        """
        path = super()._strip_protocol(path).lstrip("/")
        if ":" in path:
            path = path.split(":", 1)[1]
        if "@" in path:
            path = path.split("@", 1)[1]
        return path.lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path: str) -> Dict[str, str]:
        """Extract constructor arguments embedded in a URL.

        For "gitpythonfs://[path-to-repo:][ref@]path/to/file" the returned
        dict holds "path" and/or "ref" when the respective part is present.
        """
        prefix = "gitpythonfs://"
        if path.startswith(prefix):
            path = path[len(prefix) :]
        out: Dict[str, str] = {}
        if ":" in path:
            out["path"], path = path.split(":", 1)
        if "@" in path:
            out["ref"], path = path.split("@", 1)
        return out

    def _git_type_to_file_type(self, object: git.Object) -> str:
        """Map a git object to the "type" value used in listings.

        Blobs are reported as "file" and trees as "directory"; any other
        object is reported by its class name.
        """
        if isinstance(object, git.Blob):
            return "file"
        if isinstance(object, git.Tree):
            return "directory"
        return type(object).__name__

    def _details(
        self,
        object: git.Object,
        include_committed_date: bool = True,
        ref: Optional[str] = None,
    ) -> Dict[str, Union[str, int, None]]:
        """Retrieve the details of a Git object.

        Args:
            object (git.Object): The Git object to retrieve details for.
            include_committed_date (bool, optional): Whether to include the
                committed date. Defaults to True. Getting the committed date
                is an expensive operation and will slow down walk(), a method
                that is extensively used by fsspec for find(), glob() etc.
            ref (str, optional): Revision from which to search for the commit
                that supplies the committed date. Falls back to the ref given
                to the constructor.

        Returns:
            dict: A dictionary with the keys "name", "type", "mime_type"
            (None unless the object is a blob), "size" and "hexsha", plus
            "committed_date" when requested.
        """
        details: Dict[str, Union[str, int, None]] = {
            "name": object.path,
            "type": self._git_type_to_file_type(object),
            "mime_type": object.mime_type if isinstance(object, git.Blob) else None,
            "size": object.size,
            "hexsha": object.hexsha,
        }

        if include_committed_date:
            # Only the first commit yielded for this path is needed.
            commit = next(
                self.repo.iter_commits(
                    rev=ref or self.ref, paths=object.path, max_count=1
                )
            )
            details["committed_date"] = commit.committed_date

        return details

    def ls(
        self, path: str, detail: bool = False, ref: Optional[str] = None, **kwargs: Any
    ) -> Union[List[str], List[Dict]]:
        """List files at given path in the repo.

        Args:
            path (str): Path inside the repo, or a URL understood by
                ``_strip_protocol``. An empty path lists the repo root.
            detail (bool): If True return one details dict per entry,
                otherwise return only the entries' paths.
            ref (str, optional): Revision to list. Falls back to the ref
                given to the constructor.
            **kwargs: Only ``include_committed_date`` is used (see
                ``_details``). Other keywords are ignored so that options
                passed down by generic fsspec helpers do not raise.

        Returns:
            The children of a directory, or a single-element list when the
            path points at a file.
        """
        path = self._strip_protocol(path)
        rev = ref or self.ref

        # For traversal, always start at the root of the repo.
        tree = self.repo.tree(rev)
        target = tree if path == "" else tree / path

        # A tree lists its direct children; anything else lists itself.
        entries = list(target) if isinstance(target, git.Tree) else [target]

        if not detail:
            return [entry.path for entry in entries]

        include_committed_date = kwargs.get("include_committed_date", True)
        return [
            self._details(
                entry, include_committed_date=include_committed_date, ref=rev
            )
            for entry in entries
        ]

    def _open(
        self,
        path: str,
        mode: str = "rb",
        block_size: Optional[int] = None,
        autocommit: bool = True,
        cache_options=None,
        ref: Optional[str] = None,
        **kwargs: Any,
    ) -> MemoryFile:
        """Return the contents of a file in the repo as an in-memory file.

        Args:
            path (str): Path inside the repo, or a URL understood by
                ``_strip_protocol``.
            ref (str, optional): Revision to read from. Falls back to the ref
                given to the constructor.
            mode, block_size, autocommit, cache_options, **kwargs: Accepted
                but not used by this implementation.

        Returns:
            MemoryFile: A file object holding the whole blob.
        """
        path = self._strip_protocol(path)
        tree = self.repo.tree(ref or self.ref)
        blob = tree / path
        return MemoryFile(data=blob.data_stream.read())
7 changes: 7 additions & 0 deletions sources/filesystem/gitpythonfs/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions sources/filesystem/gitpythonfs/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[tool.poetry]
name = "gitpythonfs"
version = "0.1.0"
description = "An fsspec implementation for git repositories on the local file system."
# NOTE(review): looks like a placeholder author entry — confirm the real maintainer.
authors = ["Your Name <you@example.com>"]
license = "Apache License 2.0"
readme = "README.md"

[tool.poetry.dependencies]
# python = "^3.10"
python = ">=3.8.1,<3.13"
# NOTE(review): gitpythonfs/core.py imports `fsspec` and `git`, but neither is
# declared here; presumably they come from the parent project's environment —
# confirm, and declare them if this package should install on its own.


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
183 changes: 183 additions & 0 deletions sources/filesystem/gitpythonfs/tests/test_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import os
import subprocess
import tempfile
import shutil

import pytest
from typing import Iterator

import fsspec # ToDo, narrow for open()
from fsspec.implementations.local import make_path_posix
from fsspec.registry import (
get_filesystem_class,
known_implementations,
available_protocols,
filesystem,
)

from gitpythonfs import GitPythonFileSystem
from gitpythonfs.core import register_implementation_in_fsspec

# Protocol name used by the tests below when talking to fsspec's registry helpers.
PROTOCOL = "gitpythonfs"
# TEST_REPO_PATH = ["~/dlt"]

@pytest.fixture()
def repo_fixture() -> "Iterator[tuple[str, str]]":
    """Create a temporary git repository.

    The repository ends up with this history:

    * branch ``master``: commit "init" (file1 = data0), commit "tagger"
      (file1 = data00, tagged ``thetag``), commit "master tip" (adds file2).
    * branch ``abranch`` (checked out): commit "branch tip" adding
      inner/file3 and inner/file4.

    Yields:
        A tuple of the repository directory and the SHA of the first commit.
        The directory is removed once the test has finished.

    Thanks to https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/tests/test_git.py
    """
    # The return annotation is quoted so that `tuple[...]` is never evaluated
    # at definition time, which would fail on Python 3.8.
    d = tempfile.mkdtemp()

    def run_git(*args: str) -> None:
        # Argument list instead of a shell string; a failing git command
        # aborts the fixture instead of silently leaving a half-built repo.
        subprocess.check_call(["git", *args], cwd=d)

    def write_file(relative_path: str, data: bytes) -> None:
        # Close the handle so the data is on disk before git reads it.
        with open(os.path.join(d, relative_path), "wb") as f:
            f.write(data)

    try:
        # -b option requires git 2.28 or later.
        run_git("init", "-b", "master")
        run_git("config", "user.email", "you@example.com")
        run_git("config", "user.name", "Your Name")

        write_file("file1", b"data0")
        run_git("add", "file1")
        run_git("commit", "-m", "init")
        # Ask git for the commit id rather than reading a file under .git.
        sha_first = (
            subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=d)
            .decode()
            .strip()
        )

        write_file("file1", b"data00")
        run_git("commit", "-a", "-m", "tagger")
        run_git("tag", "-a", "thetag", "-m", "make tag")

        write_file("file2", b"data000")
        run_git("add", "file2")
        run_git("commit", "-m", "master tip")

        run_git("checkout", "-b", "abranch")
        os.mkdir(os.path.join(d, "inner"))
        write_file(os.path.join("inner", "file3"), b"data3")
        run_git("add", "inner/file3")
        write_file(os.path.join("inner", "file4"), b"data4")
        run_git("add", "inner/file4")
        run_git("commit", "-m", "branch tip")

        yield d, sha_first
    finally:
        # Every command above uses an explicit cwd, so the process working
        # directory was never changed and needs no restoring.
        shutil.rmtree(d)


def test_register_implementation_in_fsspec() -> None:
    """Test registering a filesystem with fsspec."""
    # Remove any existing registration. The default of None keeps the test
    # from erroring with KeyError when the protocol is not registered yet.
    known_implementations.pop(PROTOCOL, None)
    assert (
        PROTOCOL not in known_implementations
    ), f"As a test precondition, {PROTOCOL} should not be registered."

    register_implementation_in_fsspec()
    assert PROTOCOL in available_protocols(), f"{PROTOCOL} should be registered."

    # Resolving the protocol must yield the very class exported by the package.
    cls = get_filesystem_class(PROTOCOL)
    assert cls is GitPythonFileSystem


# @pytest.mark.parametrize("path", TEST_REPO_PATH)
def test_instantiate_fsspec_filesystem(repo_fixture) -> None:
    """Test instantiating a filesystem with fsspec.

    Args:
        repo_fixture: Tuple whose first element is the path of the temporary
            repository to open.
    """
    d = repo_fixture[0]

    fs = filesystem(PROTOCOL, path=d)
    assert isinstance(fs, GitPythonFileSystem)


def test_ls_entries(repo_fixture):
    """Check the plain (non-detail) listings for the root, a folder and single files."""
    d, _ = repo_fixture
    fs = filesystem(PROTOCOL, path=d)

    # (path to list, expected listing, message shown on failure)
    cases = (
        ("", ["file1", "file2", "inner"], "Should return all objects at root of repo."),
        ("file1", ["file1"], "Should return a single file at root."),
        (
            "inner",
            ["inner/file3", "inner/file4"],
            "Should return 2 files, with their paths.",
        ),
        ("inner/file3", ["inner/file3"], "Should return a single file in folder."),
    )

    for listed_path, expected, message in cases:
        assert fs.ls(listed_path) == expected, message


def test_ls_file_details(repo_fixture) -> None:
    """Check the details dict returned for a single file (git.Blob)."""
    # Arrange
    d, _ = repo_fixture
    fs = filesystem(PROTOCOL, path=d)

    # Act
    listing = fs.ls("file1", detail=True, include_committed_date=True)
    assert len(listing) == 1, "Should return a single object."
    details = listing[0]

    # Assert: exact values first, then fields where only the type is checked.
    expected_values = (
        ("name", "file1"),
        ("type", "file"),
        ("mime_type", "text/plain"),
    )
    for key, expected in expected_values:
        assert details[key] == expected

    expected_types = (
        ("size", int),
        ("hexsha", str),
        ("committed_date", int),
    )
    for key, expected_type in expected_types:
        assert isinstance(details[key], expected_type)


def test_git_refs(repo_fixture) -> None:
    """Test results for git refs - eg sha, branch, tag."""
    d, sha_first = repo_fixture

    with fsspec.open("gitpythonfs://inner/file3", path=d) as f:
        content = f.read()
    assert content == b"data3", "Should read from head if no ref given."

    # ToDo: enable the cases below once refs are supported.
    #
    # with fsspec.open("gitpythonfs://file1", path=d, ref=sha_first) as f:
    #     assert f.read() == b"data0", "Should read file version at given ref."
    #
    # with fsspec.open("gitpythonfs://file1", path=d, ref="thetag") as f:
    #     assert f.read() == b"data00"
    #
    # with fsspec.open("gitpythonfs://file2", path=d, ref="master") as f:
    #     assert f.read() == b"data000"
    #
    # with fsspec.open("gitpythonfs://file2", path=d, ref=None) as f:
    #     assert f.read() == b"data000"
    #
    # with fsspec.open("gitpythonfs://inner/file3", path=d, ref="abranch") as f:
    #     assert f.read() == b"data3"


def test_url(repo_fixture) -> None:
    """Check that a file can be opened with the repo given as argument or inside the URL."""
    d, _ = repo_fixture

    # Repo location supplied as a keyword argument.
    with fsspec.open("gitpythonfs://file1", path=d) as f:
        content = f.read()
        assert content == b"data00", "Should return file at root."

    # ToDo: implement/test more complex urls, eg gitpythonfs://[path-to-repo[:]][ref@]path/to/file as used in git fsspec implementation.
    # The colon (:) is compulsory if path-to-repo is given. ie, gitpythonfs://[path-to-repo:][ref@]path/to/file

    # Repo location embedded in the URL, separated from the file path by a colon.
    url_with_repo = f"gitpythonfs://{d}:file1"
    with fsspec.open(url_with_repo) as f:
        content = f.read()
        assert (
            content == b"data00"
        ), "Should return file via the repo path embedded in the url."

    # ToDo expand test once refs supported.
    # with fsspec.open("gitpythonfs://abranch@inner/file3", path=d) as f:
    #     assert f.read() == b"data3", "Should return file at ref embedded in url."
    # with fsspec.open(f"gitpythonfs://{d}:abranch@inner/file3") as f:
    #     assert f.read() == b"data3", "Should return file at repo and ref embedded in url."


def test_multiple_files(repo_fixture) -> None:
    """Open several files at once through a glob pattern embedded in the URL."""
    d, _ = repo_fixture

    glob_url = f"gitpythonfs://{d}:**/file*"
    opened = fsspec.open_files(glob_url)

    file_count = len(opened)
    assert (
        file_count == 4
    ), "Glob should return 4 files that start with `file` from any folder."

0 comments on commit 3e21941

Please sign in to comment.