forked from dlt-hub/verified-sources
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add gitpythonfs fsspec implementation dlt-hub#301
- Loading branch information
Showing
8 changed files
with
2,484 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# gitpythonfs | ||
|
||
Builds on [GitPython](https://gitpython.readthedocs.io/) to provide a Python filesystem interface for git. | ||
|
||
The initial use case is to load file contents from git repos into destinations using tools such as [dlt](https://dlthub.com). | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .core import GitPythonFileSystem, register_implementation_in_fsspec | ||
|
||
register_implementation_in_fsspec() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
from typing import List, Dict, Any, Union | ||
from fsspec.registry import register_implementation | ||
from fsspec.spec import AbstractFileSystem | ||
from fsspec.implementations.memory import MemoryFile | ||
import git | ||
|
||
|
||
def register_implementation_in_fsspec() -> None:
    """Dynamically register this filesystem with fsspec.

    Needed because the implementation is not officially registered in the
    fsspec codebase. Any existing implementation using the same protocol is
    overridden ("clobbered"). The registration only lasts for the current
    process.
    """
    # The class is given as a dotted string, so fsspec resolves it by name.
    register_implementation(
        name="gitpythonfs",
        cls="gitpythonfs.GitPythonFileSystem",
        clobber=True,
        errtxt="Please install gitpythonfs to access GitPythonFileSystem",
    )
|
||
|
||
class GitPythonFileSystem(AbstractFileSystem):
    """A read-only fsspec filesystem over a git repository on the local disk.

    Files are read from git's object database at a chosen revision (SHA,
    branch or tag; the repo's HEAD by default) rather than from the working
    tree. This makes information such as a file's last committed date
    available, which would not be possible when looking at the local
    filesystem directly.

    It is based on the GitPython library, which could also be used to clone
    or update the repo from a remote before reading it with this filesystem.
    """

    protocol = "gitpythonfs"

    def __init__(
        self,
        path: Union[str, None] = None,
        ref: Union[str, None] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize a GitPythonFileSystem object.

        Args:
            path (str): Local location of the Git repo. When used with a higher
                level function such as fsspec.open(), may be of the form
                "gitpythonfs://[path-to-repo:][ref@]path/to/file" so that repo
                and/or ref can be passed in the URL instead of arguments. (The
                actual file path should not contain "@" or ":"). Examples:
                    When instantiating GitPythonFileSystem:
                        /some_folder/my_repo
                    When calling open(), open_files() etc:
                        gitpythonfs:///some_folder/my_repo:path/to/intro.md
                        gitpythonfs:///some_folder/my_repo:mybranch@path/to/intro.md
            ref (str): A branch, tag or commit hash to read from. Defaults to
                the HEAD of the local repo. Can be overridden per call via the
                ``ref`` argument of ls() and open().
            **kwargs: Passed through to AbstractFileSystem.
        """
        super().__init__(**kwargs)
        self.repo_path = path
        # None means "whatever HEAD points at" (resolved by GitPython on use).
        self.ref = ref
        self.repo = git.Repo(self.repo_path)

    @classmethod
    def _strip_protocol(cls, path: str) -> str:
        """Reduce a URL or path to the file path inside the repo.

        Removes the protocol, an optional "path-to-repo:" prefix and an
        optional "ref@" prefix, plus any leading slashes.
        """
        path = super()._strip_protocol(path).lstrip("/")
        if ":" in path:
            path = path.split(":", 1)[1]
        if "@" in path:
            path = path.split("@", 1)[1]
        return path.lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path: str) -> Dict[str, str]:
        """Extract constructor arguments embedded in a URL.

        For "gitpythonfs://[path-to-repo:][ref@]path/to/file" returns a dict
        holding "path" and/or "ref" when those parts are present.
        """
        prefix = "gitpythonfs://"
        if path.startswith(prefix):
            path = path[len(prefix):]
        out = {}
        if ":" in path:
            out["path"], path = path.split(":", 1)
        if "@" in path:
            out["ref"], path = path.split("@", 1)
        return out

    def _git_type_to_file_type(self, obj: git.Object) -> str:
        """Map a GitPython object to the fsspec "type" value.

        Blobs are "file" and trees are "directory"; anything else is reported
        by its class name.
        """
        if isinstance(obj, git.Blob):
            return "file"
        if isinstance(obj, git.Tree):
            return "directory"
        return type(obj).__name__

    def _object_at(self, path: str, ref: Union[str, None] = None) -> git.Object:
        """Return the git object found at ``path`` in the tree of ``ref``.

        Args:
            path (str): Path inside the repo, already stripped of protocol.
                The empty string means the root tree.
            ref (str): Revision to look in. Falls back to the instance's ref,
                and then to HEAD.

        Raises:
            FileNotFoundError: If nothing exists at ``path`` in that revision.
        """
        # For traversal, always start at the root of the repo.
        tree = self.repo.tree(ref or self.ref)
        if path == "":
            return tree
        try:
            return tree / path
        except KeyError as err:
            # fsspec callers (walk, isdir, ...) expect the OSError family
            # for missing paths, not GitPython's KeyError.
            raise FileNotFoundError(path) from err

    def _details(
        self,
        obj: git.Object,
        include_committed_date: bool = True,
        ref: Union[str, None] = None,
    ) -> Dict[str, Any]:
        """Retrieve the details of a Git object.

        Args:
            obj (git.Object): The Git object to retrieve details for.
            include_committed_date (bool, optional): Whether to include the
                committed date. Defaults to True. Getting the committed date
                is an expensive operation and will slow down walk(), a method
                that is extensively used by fsspec for find(), glob() etc.
            ref (str, optional): Revision whose history is searched for the
                committed date. Falls back to the instance's ref, then HEAD.

        Returns:
            dict: A dictionary containing the details typical for fsspec.
        """
        details = {
            "name": obj.path,
            "type": self._git_type_to_file_type(obj),
            "mime_type": obj.mime_type if isinstance(obj, git.Blob) else None,
            "size": obj.size,
            "hexsha": obj.hexsha,
        }

        if include_committed_date:
            commits = self.repo.iter_commits(
                rev=ref or self.ref, paths=obj.path, max_count=1
            )
            # A default avoids leaking StopIteration into fsspec's
            # generator-based walk(), where it would become a RuntimeError.
            commit = next(commits, None)
            if commit is not None:
                details["committed_date"] = commit.committed_date

        return details

    def ls(
        self, path: str, detail: bool = False, ref: str = None, **kwargs: Any
    ) -> Union[List[str], List[Dict]]:
        """List files at given path in the repo.

        Args:
            path (str): Path (or URL) of a folder or a single file.
            detail (bool): If True return one details dict per entry,
                otherwise return the entries' paths.
            ref (str): Revision to list. Falls back to the instance's ref,
                then HEAD.
            **kwargs: Only ``include_committed_date`` is used (see _details());
                other keywords are ignored.

        Raises:
            FileNotFoundError: If ``path`` does not exist in the revision.
        """
        path = self._strip_protocol(path)
        rev = ref or self.ref
        target = self._object_at(path, rev)

        # A tree lists its children; anything else is listed as itself.
        entries = list(target) if isinstance(target, git.Tree) else [target]
        if not detail:
            return [entry.path for entry in entries]

        # Pick out the one option _details() understands, so unrelated
        # keywords forwarded by fsspec helpers do not raise TypeError.
        include_committed_date = kwargs.get("include_committed_date", True)
        return [
            self._details(
                entry, include_committed_date=include_committed_date, ref=rev
            )
            for entry in entries
        ]

    def _open(
        self,
        path: str,
        mode: str = "rb",
        block_size: int = None,
        autocommit: bool = True,
        cache_options=None,
        ref: str = None,
        **kwargs: Any,
    ) -> MemoryFile:
        """Return an in-memory file holding the blob's content at ``ref``.

        ``mode``, ``block_size``, ``autocommit`` and ``cache_options`` are
        accepted for fsspec compatibility but are not used: the whole blob is
        read into memory and nothing is ever written back to the repo.

        Raises:
            FileNotFoundError: If ``path`` does not exist in the revision.
            IsADirectoryError: If ``path`` is a folder (git tree).
        """
        path = self._strip_protocol(path)
        blob = self._object_at(path, ref)
        if isinstance(blob, git.Tree):
            raise IsADirectoryError(path)
        return MemoryFile(data=blob.data_stream.read())
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
[tool.poetry] | ||
name = "gitpythonfs" | ||
version = "0.1.0" | ||
description = "An fsspec implementation for git repositories on the local file system." | ||
authors = ["Your Name <you@example.com>"] | ||
license = "Apache License 2.0" | ||
readme = "README.md" | ||
|
||
[tool.poetry.dependencies] | ||
# python = "^3.10" | ||
python = ">=3.8.1,<3.13" | ||
|
||
|
||
[build-system] | ||
requires = ["poetry-core"] | ||
build-backend = "poetry.core.masonry.api" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
import os | ||
import subprocess | ||
import tempfile | ||
import shutil | ||
|
||
import pytest | ||
from typing import Iterator | ||
|
||
import fsspec # ToDo, narrow for open() | ||
from fsspec.implementations.local import make_path_posix | ||
from fsspec.registry import ( | ||
get_filesystem_class, | ||
known_implementations, | ||
available_protocols, | ||
filesystem, | ||
) | ||
|
||
from gitpythonfs import GitPythonFileSystem | ||
from gitpythonfs.core import register_implementation_in_fsspec | ||
|
||
PROTOCOL = "gitpythonfs" | ||
# TEST_REPO_PATH = ["~/dlt"] | ||
|
||
def _run_git(repo_dir: str, *args: str) -> str:
    """Run ``git <args>`` inside ``repo_dir`` and return its stdout.

    Raises subprocess.CalledProcessError when git exits with an error, so a
    broken setup fails here instead of in an unrelated assertion later.
    """
    return subprocess.check_output(("git",) + args, cwd=repo_dir, text=True)


def _write_file(path: str, data: bytes) -> None:
    """Write ``data`` to ``path``, closing the file handle straight away."""
    with open(path, "wb") as f:
        f.write(data)


@pytest.fixture()
def repo_fixture() -> "Iterator[tuple[str, str]]":
    """Create a temporary git repository.

    The repo ends up checked out on branch ``abranch`` with this history:
      - "init": file1 = b"data0" (its SHA is yielded as the second item)
      - "tagger": file1 = b"data00", tagged ``thetag``
      - "master tip": adds file2 = b"data000" (tip of ``master``)
      - "branch tip": adds inner/file3 and inner/file4 (tip of ``abranch``)

    Yields:
        The repo directory and the SHA of the first commit. The directory is
        deleted on teardown.

    Thanks to https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/tests/test_git.py
    """
    # The return annotation is quoted so that it is not evaluated at import
    # time: `tuple[str, str]` is not subscriptable on Python 3.8.
    d = tempfile.mkdtemp()
    try:
        # -b option requires git 2.28 or later.
        _run_git(d, "init", "-b", "master")
        _run_git(d, "config", "user.email", "you@example.com")
        _run_git(d, "config", "user.name", "Your Name")

        _write_file(os.path.join(d, "file1"), b"data0")
        _run_git(d, "add", "file1")
        _run_git(d, "commit", "-m", "init")
        sha_first = _run_git(d, "rev-parse", "HEAD").strip()

        _write_file(os.path.join(d, "file1"), b"data00")
        _run_git(d, "commit", "-a", "-m", "tagger")
        _run_git(d, "tag", "-a", "thetag", "-m", "make tag")

        _write_file(os.path.join(d, "file2"), b"data000")
        _run_git(d, "add", "file2")
        _run_git(d, "commit", "-m", "master tip")

        _run_git(d, "checkout", "-b", "abranch")
        os.mkdir(os.path.join(d, "inner"))
        _write_file(os.path.join(d, "inner", "file3"), b"data3")
        _run_git(d, "add", "inner/file3")
        _write_file(os.path.join(d, "inner", "file4"), b"data4")
        _run_git(d, "add", "inner/file4")
        _run_git(d, "commit", "-m", "branch tip")

        yield d, sha_first
    finally:
        shutil.rmtree(d)
|
||
|
||
def test_register_implementation_in_fsspec() -> None:
    """Check that registering makes the protocol resolvable through fsspec."""
    # Start from an unregistered state so the registration call is what is
    # actually being tested.
    known_implementations.pop(PROTOCOL)
    still_registered = PROTOCOL in known_implementations
    assert (
        not still_registered
    ), f"As a test precondition, {PROTOCOL} should not be registered."

    register_implementation_in_fsspec()
    protocols = available_protocols()
    assert PROTOCOL in protocols, f"{PROTOCOL} should be registered."

    resolved_class = get_filesystem_class(PROTOCOL)
    assert resolved_class == GitPythonFileSystem
|
||
|
||
# @pytest.mark.parametrize("path", TEST_REPO_PATH) | ||
def test_instantiate_fsspec_filesystem(repo_fixture) -> None:
    """Test instantiating a filesystem with fsspec.

    Args:
        repo_fixture: Fixture yielding the temporary repo's directory and the
            SHA of its first commit. Only the directory is used here.
    """
    d = repo_fixture[0]

    fs = filesystem(PROTOCOL, path=d)
    assert isinstance(fs, GitPythonFileSystem)
|
||
|
||
def test_ls_entries(repo_fixture):
    """Check the paths ls() returns for the root, a folder and single files."""
    repo_dir, _ = repo_fixture
    fs = filesystem(PROTOCOL, path=repo_dir)

    root_listing = fs.ls("")
    expected_root = ["file1", "file2", "inner"]
    assert root_listing == expected_root, "Should return all objects at root of repo."

    root_file_listing = fs.ls("file1")
    assert root_file_listing == ["file1"], "Should return a single file at root."

    folder_listing = fs.ls("inner")
    expected_folder = ["inner/file3", "inner/file4"]
    assert folder_listing == expected_folder, "Should return 2 files, with their paths."

    nested_file_listing = fs.ls("inner/file3")
    assert nested_file_listing == [
        "inner/file3"
    ], "Should return a single file in folder."
|
||
|
||
def test_ls_file_details(repo_fixture) -> None:
    """Check the details dict that ls() returns for a file (git.Blob)."""
    # Arrange: a filesystem over the temporary repo.
    repo_dir, _ = repo_fixture
    fs = filesystem(PROTOCOL, path=repo_dir)

    # Act: ask for details, including the expensive committed date.
    listing = fs.ls("file1", detail=True, include_committed_date=True)
    assert len(listing) == 1, "Should return a single object."
    (info,) = listing

    # Assert: fixed values first, then the types of the repo-dependent ones.
    assert info["name"] == "file1"
    assert info["type"] == "file"
    assert info["mime_type"] == "text/plain"
    assert isinstance(info["size"], int)
    assert isinstance(info["hexsha"], str)
    assert isinstance(info["committed_date"], int)
|
||
|
||
def test_git_refs(repo_fixture) -> None:
    """Test results for git refs - eg sha, branch, tag."""
    repo_dir, _ = repo_fixture

    with fsspec.open("gitpythonfs://inner/file3", path=repo_dir) as f:
        content = f.read()
    assert content == b"data3", "Should read from head if no ref given."

    # ToDo: once refs are supported, also cover reading with ref= set to:
    #   - the first commit's sha  -> file1 == b"data0"
    #   - "thetag"                -> file1 == b"data00"
    #   - "master" and None       -> file2 == b"data000"
    #   - "abranch"               -> inner/file3 == b"data3"
|
||
|
||
def test_url(repo_fixture) -> None:
    """Check that a file can be opened with the repo given as argument or in the URL."""
    repo_dir, _ = repo_fixture

    # Repo passed as a keyword argument; the URL holds only the file path.
    with fsspec.open("gitpythonfs://file1", path=repo_dir) as f:
        content = f.read()
        assert content == b"data00", "Should return file at root."

    # ToDo: implement/test more complex urls, eg gitpythonfs://[path-to-repo[:]][ref@]path/to/file as used in git fsspec implementation.
    # The colon (:) is compulsory if path-to-repo is given. ie, gitpythonfs://[path-to-repo:][ref@]path/to/file

    # Repo embedded in the URL, separated from the file path by a colon.
    url_with_repo = f"gitpythonfs://{repo_dir}:file1"
    with fsspec.open(url_with_repo) as f:
        content = f.read()
        assert (
            content == b"data00"
        ), "Should return file via the repo path embedded in the url."

    # ToDo expand test once refs supported:
    #   - gitpythonfs://abranch@inner/file3 with path=<repo> -> b"data3"
    #   - gitpythonfs://<repo>:abranch@inner/file3           -> b"data3"
|
||
|
||
def test_multiple_files(repo_fixture) -> None:
    """Check that a glob URL opens every matching file in the repository."""
    repo_dir, _ = repo_fixture

    pattern = f"gitpythonfs://{repo_dir}:**/file*"
    opened = fsspec.open_files(pattern)
    match_count = len(opened)
    assert (
        match_count == 4
    ), "Glob should return 4 files that start with `file` from any folder."