diff --git a/.gitignore b/.gitignore
index 7bbc71c09..e145117aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -86,6 +86,7 @@ celerybeat-schedule
 .venv
 venv/
 ENV/
+.idea/

 # Spyder project settings
 .spyderproject

diff --git a/README.md b/README.md
index 41193c0cf..22be7bb6e 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,83 @@
 # filesystem_spec
-A specification that python filesystems should adhere to.
+A specification for pythonic filesystems.

## Purpose

To produce a template or specification for a file-system interface that specific implementations should follow, so that applications making use of them can rely on common behaviour and not have to worry about the internal implementation decisions of any given backend.

In addition, if this is well designed, then extra functionality, such as a key-value store or FUSE mounting of the file-system implementation, may become available for all implementations "for free".

## Background

Python provides a standard interface for open files, so that alternate implementations of file-like objects can work seamlessly with the many functions which rely only on the methods of that standard interface. A number of libraries have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system which may be local, a structured data store, or some remote service.

This repository is intended to be a place to define a standard interface that such file-systems should adhere to, so that code using them does not have to know the details of the implementation in order to operate on any of a number of backends.

Everything here is up for discussion, and although a little code has already been included to kick things off, it is only meant as a suggestion of one possible way of doing things. The hope is that the community can come together to define an interface that suits the largest number of users, and that having the specification will make developing other file-system implementations simpler.

There is no specific model (yet) of how the contents of this repo would be used, whether as a spec to refer to, or perhaps something to subclass or use as a mixin; that too can form part of the conversation.

#### History

I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally in the context of the [Dask](http://dask.pydata.org/en/latest/) project. In particular, several are listed in [the docs](http://dask.pydata.org/en/latest/remote-data-services.html) with links to the specific repositories. With common authorship, there is much that is similar between the implementations, for example posix-like naming of the operations, and this has allowed Dask to interact with the various backends and to parse generic URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities of each implementation to the generic usage that Dask demanded. People may find the [code](https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266) which parses URLs and creates file-system instances interesting.

At the same time, the Apache [Arrow](https://arrow.apache.org/) project was concerned with a similar problem, particularly a common interface to local and HDFS files, for example the [hdfs](https://arrow.apache.org/docs/python/filesystems.html) interface (which actually communicates with HDFS via a choice of drivers).
These are mostly used internally within Arrow, but Dask was modified to be able to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a [conversation](https://github.com/dask/dask/issues/2880) was started, and I invite all interested parties to continue that conversation here.

There is a good argument that this type of code has no place in Dask, which is concerned with making graphs representing computations and executing those graphs on a scheduler. Indeed, the file-systems are generally useful, and each has a user base wider than just those who work via Dask.

## Influences

The following are places to consider when choosing how we would like the file-system specification to look:

- Python's [os](https://docs.python.org/3/library/os.html) module and its `path` namespace, as well as other file-related functionality in the standard library
- posix/bash method naming conventions that linux/unix/osx users are familiar with, or perhaps their Windows variants
- the existing implementations for the various backends (e.g., [gcsfs](http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem) or Arrow's [hdfs](https://arrow.apache.org/docs/python/filesystems.html#hdfs-api))
- [pyfilesystems](https://docs.pyfilesystem.org/en/latest/index.html), an attempt to do something similar, with a plugin architecture; it has several types of local file-system, and a lot of well-thought-out validation code

## Contents of the Repo

The main proposal here is in `fsspec/spec.py`: a single class with methods, doc-strings, and a little code. The initial method names were copied from `gcsfs`, but this reflects only laziness on the part of the initial committer. Although the directory and files appear like a python package, they are not meant for installation or execution until possibly some later date - or maybe never, if this is to be only a loose reference specification.

In addition, `fsspec/utils.py` contains a couple of useful functions that Dask happens to rely on; it is envisaged that if the spec here matures to real code, a number of helpful functions may live alongside the main definitions. Furthermore, `fsspec/mapping.py` shows how a key-value map may be implemented for all file-systems essentially for free, by adhering to a single definition of the structure. This is meant as a motivator, and happens to be particularly useful for the [zarr](https://zarr.readthedocs.io) project.
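
To make the intent concrete, here is the style of code this specification is meant to enable. It is only a sketch: `LocalFileSystem` and `S3FileSystem` are hypothetical stand-ins for implementations that follow the spec, not code in this repository.

```python
def count_rows(fs, path):
    """Count newline-delimited records, relying only on the spec methods."""
    with fs.open(path, 'rb') as f:
        return f.read().count(b'\n')

# the same function serves any compliant backend:
count_rows(LocalFileSystem(), '/data/table.csv')   # hypothetical backend
count_rows(S3FileSystem(), 'bucket/table.csv')     # hypothetical backend
```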
diff --git a/fsspec/__init__.py b/fsspec/__init__.py
new file mode 100644
index 000000000..e69de29bb

diff --git a/fsspec/mapping.py b/fsspec/mapping.py
new file mode 100644
index 000000000..213619b3f
--- /dev/null
+++ b/fsspec/mapping.py
@@ -0,0 +1,104 @@

from collections.abc import MutableMapping


class FSMap(MutableMapping):
    """Wrap a FileSystem instance as a mutable mapping.

    The keys of the mapping become files under the given root, and the
    values (which must be bytes) the contents of those files.

    Parameters
    ----------
    root : string
        prefix for all the files
    fs : FileSystem instance
    check : bool (=False)
        performs a touch at the location, to check for write access
    create : bool (=False)
        creates the root directory, if it does not already exist

    Examples
    --------
    >>> fs = FileSystem(**parameters)  # doctest: +SKIP
    >>> d = FSMap('my-data/path/', fs)  # doctest: +SKIP
    >>> d['loc1'] = b'Hello World'  # doctest: +SKIP
    >>> list(d.keys())  # doctest: +SKIP
    ['loc1']
    >>> d['loc1']  # doctest: +SKIP
    b'Hello World'
    """

    def __init__(self, root, fs, check=False, create=False):
        self.fs = fs
        self.root = root
        if create:
            self.fs.mkdir(root)
        if check:
            if not self.fs.exists(root):
                raise ValueError("Path %s does not exist. Create "
                                 "with the ``create=True`` keyword" %
                                 root)
            self.fs.touch(root + '/a')
            self.fs.rm(root + '/a')

    def clear(self):
        """Remove all keys below root - empties out mapping"""
        try:
            self.fs.rm(self.root, True)
            self.fs.mkdir(self.root)
        except (IOError, OSError):
            pass

    def _key_to_str(self, key):
        """Generate full path for the key"""
        return '/'.join([self.root, key])

    def _str_to_key(self, s):
        """Strip path prefix to leave key name"""
        return s[len(self.root) + 1:]

    def __getitem__(self, key, default=None):
        """Retrieve data"""
        key = self._key_to_str(key)
        try:
            with self.fs.open(key, 'rb') as f:
                result = f.read()
        except (IOError, OSError):
            if default is not None:
                return default
            raise KeyError(key)
        return result

    def __setitem__(self, key, value):
        """Store value in key"""
        key = self._key_to_str(key)
        with self.fs.open(key, 'wb') as f:
            f.write(value)

    def keys(self):
        """List currently defined keys"""
        return (self._str_to_key(x) for x in self.fs.walk(self.root))

    def __iter__(self):
        return self.keys()

    def __delitem__(self, key):
        """Remove key"""
        self.fs.rm(self._key_to_str(key))

    def __contains__(self, key):
        """Does key exist in mapping?"""
        return self.fs.exists(self._key_to_str(key))

    def __len__(self):
        """Number of stored elements"""
        return sum(1 for _ in self.keys())

    def __getstate__(self):
        """Mapping should be pickleable"""
        return self.fs, self.root

    def __setstate__(self, state):
        fs, root = state
        self.fs = fs
        self.root = root
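
As a demonstration of the "for free" claim, the runnable sketch below wires `FSMap` to a toy dict-backed filesystem. `MemoryFileSystem` and `MemoryFile` are invented for this illustration (they are not part of the proposal) and implement only the handful of methods that `FSMap` actually calls:

```python
from io import BytesIO

from fsspec.mapping import FSMap


class MemoryFile(BytesIO):
    """Writable in-memory file that commits its bytes to a store on close."""

    def __init__(self, store, path):
        self.store = store
        self.path = path
        super().__init__()

    def close(self):
        self.store[self.path] = self.getvalue()
        super().close()


class MemoryFileSystem:
    """Toy, dict-backed filesystem: only what FSMap needs."""

    def __init__(self):
        self.store = {}  # maps full path -> bytes

    def mkdir(self, path, **kwargs):
        pass  # a flat store has no real directories

    def exists(self, path):
        return True

    def touch(self, path, **kwargs):
        self.store[path] = b''

    def rm(self, path, recursive=False):
        if recursive:
            for key in [k for k in self.store if k.startswith(path)]:
                del self.store[key]
        else:
            del self.store[path]

    def walk(self, path, detail=False):
        return [k for k in self.store if k.startswith(path)]

    def open(self, path, mode='rb', **kwargs):
        if mode == 'rb':
            if path not in self.store:
                raise FileNotFoundError(path)  # OSError -> KeyError in FSMap
            return BytesIO(self.store[path])
        return MemoryFile(self.store, path)


d = FSMap('root', MemoryFileSystem())
d['counts/a'] = b'1'
d['counts/b'] = b'22'
assert sorted(d) == ['counts/a', 'counts/b']
assert d['counts/b'] == b'22'
assert len(d) == 2
```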
diff --git a/fsspec/spec.py b/fsspec/spec.py
new file mode 100644
index 000000000..9d9648786
--- /dev/null
+++ b/fsspec/spec.py
@@ -0,0 +1,282 @@

from .utils import read_block


class AbstractFileSystem(object):
    """
    A specification for python file-systems
    """
    _singleton = [None]

    def __init__(self, *args, **kwargs):
        """Configure

        Instances may be cachable, so if similar enough arguments are seen
        a new instance is not required.

        A reasonable default should be provided if there are no arguments.
        """
        self._singleton[0] = self

    @classmethod
    def current(cls):
        """Return the most recently created FileSystem

        If no instance has been created, then create one with defaults
        """
        if not cls._singleton[0]:
            return cls()
        else:
            return cls._singleton[0]

    def invalidate_cache(self, path=None):
        """
        Discard any cached directory information

        Parameters
        ----------
        path: string or None
            If None, clear all cached listings; otherwise, clear listings at
            or under the given path.
        """
        pass

    def mkdir(self, path, **kwargs):
        """
        Create directory entry at path

        For systems that don't have true directories, may create an entry
        for this instance only and not touch the real filesystem

        Parameters
        ----------
        path: str
            location
        kwargs:
            may be permissions, etc.
        """
        pass

    def rmdir(self, path):
        """Remove a directory, if empty"""
        pass

    def ls(self, path, detail=False):
        """List objects at path.

        This should include subdirectories and files at that location. The
        difference between a file and a directory must be clear when details
        are requested.

        The specific keys, or perhaps a FileInfo class, or similar, is TBD,
        but must be consistent across implementations.
        Must include:
        - full path to the entry
        - size of the entry, in bytes

        Additional information may be present, appropriate to the
        file-system, e.g., generation, checksum, etc.

        Parameters
        ----------
        detail: bool
            if True, gives a list of dictionaries, where each is the same as
            the result of ``info(path)``. If False, gives a list of paths
            (str).
        """
        pass

    def walk(self, path, detail=False):
        """Return all files below path

        Like ``ls``, but recursing into subdirectories. If detail is False,
        returns a list of full paths.
        """

    def du(self, path, total=False, deep=False):
        """Space used by files within a path

        If total is True, returns a single number (bytes); if False, returns
        a dict mapping file name to size.

        Parameters
        ----------
        total: bool
            whether to sum all the file sizes
        deep: bool
            whether to descend into subdirectories.
        """
        if deep:
            sizes = {f['name']: f['size'] for f in self.walk(path, True)}
        else:
            sizes = {f['name']: f['size'] for f in self.ls(path, True)}
        if total:
            return sum(sizes.values())
        else:
            return sizes

    def glob(self, path):
        """
        Find files by glob-matching.

        If the path ends with '/' and does not contain "*", it is essentially
        the same as ``ls(path)``, returning only files.

        We do not attempt to match for ``"**"`` notation.

        Example reimplements code in ``glob.glob()``, taken from hdfs3.
        """
        import re
        if '*' not in path:
            # no wildcard: list the given location itself
            root = path
        elif '/' in path[:path.index('*')]:
            ind = path[:path.index('*')].rindex('/')
            root = path[:ind + 1]
        else:
            root = '/'
        # ``walk``, as defined above, returns the full paths of all files
        # below root
        allpaths = self.walk(root)
        # escape literal dots before substituting the glob wildcards
        pattern = re.compile("^" + path.replace('//', '/')
                             .rstrip('/')
                             .replace('.', r'\.')
                             .replace('*', '[^/]*')
                             .replace('?', '.') + "$")
        return [p for p in allpaths
                if pattern.match(p.replace('//', '/').rstrip('/'))]
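    # Illustration (not part of the spec): for a pattern such as
    # '/data/2018-*/part-?.csv', everything before the first '*', up to the
    # last '/', becomes the walk root, and the wildcards become regular
    # expression fragments:
    #
    #     root    = '/data/'                          # walk() starts here
    #     pattern = '^/data/2018-[^/]*/part-.\.csv$'  # '*' -> '[^/]*', '?' -> '.'
    #
    # so only entries below '/data/' are listed, then filtered by the regex.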
    def exists(self, path):
        """Is there a file at the given path?"""
        pass

    def info(self, path):
        """Give details of entry at path

        Returns a single dictionary, with exactly the same information as
        ``ls`` would with ``detail=True``
        """

    def isdir(self, path):
        """Is this entry directory-like?"""

    def cat(self, path):
        """Get the content of a file"""

    def get(self, rpath, lpath, **kwargs):
        """Copy file to local

        Possible extension: maybe should be able to copy to any file-system
        """

    def put(self, lpath, rpath, **kwargs):
        """Upload file from local"""

    def head(self, path, size=1024):
        """Get the first ``size`` bytes from file"""
        with self.open(path, 'rb') as f:
            return f.read(size)

    def tail(self, path, size=1024):
        """Get the last ``size`` bytes from file"""
        with self.open(path, 'rb') as f:
            f.seek(-size, 2)  # seek relative to the end of the file
            return f.read()

    def copy(self, path1, path2, **kwargs):
        """Copy between two locations in the filesystem"""

    def mv(self, path1, path2, **kwargs):
        """Move file from one location to another"""
        self.copy(path1, path2, **kwargs)
        self.rm(path1)

    def rm(self, path, recursive=False):
        """Delete files.

        Parameters
        ----------
        path: str or list of str
            File(s) to delete.
        recursive: bool
            If file(s) are directories, recursively delete contents and then
            also remove the directory
        """

    def open(self, path, mode='rb', block_size=None, **kwargs):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        mode: str like 'rb', 'w'
            See builtin ``open()``
        block_size: int
            Some indication of buffering - this is a value in bytes
        """
        import io
        if 'b' not in mode:
            # text mode: open the file in bytes mode and wrap it
            mode = mode.replace('t', '') + 'b'
            return io.TextIOWrapper(
                self.open(path, mode, block_size, **kwargs))

    def touch(self, path, **kwargs):
        """Create empty file"""
        with self.open(path, 'wb', **kwargs):
            pass

    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from a file

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to file on the filesystem
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        utils.read_block
        """
        with self.open(fn, 'rb') as f:
            # file-like objects in this spec are expected to know their size
            size = f.size
            if length is None:
                length = size
            if offset + length > size:
                length = size - offset
            data = read_block(f, offset, length, delimiter)
        return data

    def __getstate__(self):
        """Instance should be pickleable"""
        d = self.__dict__.copy()
        return d

    def __setstate__(self, state):
        self.__dict__.update(state)
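
To illustrate the delimiter handling described in `read_block` above, the following runnable snippet (using the same CSV bytes as the doctests, via `read_block` from `fsspec/utils.py`) cuts a file into two newline-aligned blocks; every record lands in exactly one block and none is split:

```python
from io import BytesIO

from fsspec.utils import read_block

data = b'Alice, 100\nBob, 200\nCharlie, 300'  # 32 bytes

# Nominal blocks of 16 bytes each; with delimiter=b'\n' the actual cuts
# move forward to the next newline, so records are never split.
blocks = [read_block(BytesIO(data), offset, 16, delimiter=b'\n')
          for offset in (0, 16)]
print(blocks)
# [b'Alice, 100\nBob, 200\n', b'Charlie, 300']
```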
diff --git a/fsspec/utils.py b/fsspec/utils.py
new file mode 100644
index 000000000..6c3e12786
--- /dev/null
+++ b/fsspec/utils.py
@@ -0,0 +1,83 @@
def seek_delimiter(file, delimiter, blocksize):
    """Seek current file to next byte after a delimiter bytestring

    This seeks the file to the next byte following the delimiter. It does
    not return anything. Use ``file.tell()`` to see location afterwards.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\\n'`` or message sentinel
    blocksize: int
        Number of bytes to read from the file at once.
    """

    if file.tell() == 0:
        return

    last = b''
    while True:
        current = file.read(blocksize)
        if not current:
            return
        full = last + current
        try:
            i = full.index(delimiter)
            file.seek(file.tell() - (len(full) - i) + len(delimiter))
            return
        except ValueError:
            pass
        last = full[-len(delimiter):]


def read_block(f, offset, length, delimiter=None):
    """Read a block of bytes from a file

    Parameters
    ----------
    f: file object
        Open, seekable file from which to read
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring

    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``. If ``offset`` is zero then we
    start at zero. The bytestring returned WILL include the
    terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        f.seek(offset)
        seek_delimiter(f, delimiter, 2**16)
        start = f.tell()
        length -= start - offset

        f.seek(start + length)
        seek_delimiter(f, delimiter, 2**16)
        end = f.tell()

        offset = start
        length = end - start

    f.seek(offset)
    b = f.read(length)
    return b
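
And the behaviour of `seek_delimiter` on its own, in a small runnable example against an in-memory file; note the special case at offset zero, which is assumed to be aligned already:

```python
from io import BytesIO

from fsspec.utils import seek_delimiter

f = BytesIO(b'Alice, 100\nBob, 200\nCharlie, 300')

f.seek(15)                       # land mid-record, inside "Bob, 200"
seek_delimiter(f, b'\n', 2**16)  # scan forward to just past the next b'\n'
print(f.tell())                  # 20, the first byte of "Charlie, 300"

f.seek(0)
seek_delimiter(f, b'\n', 2**16)  # offset 0 is left where it is
print(f.tell())                  # 0
```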