In [None]:
# default_exp train.object_store

# Object Store

The object store is an interface to other systems. It handles:

* efficient extraction: local when available, then remote or calculated
* versions: increment the latest version
* metadata: allow me to store and search for what's there
* offline mode: reasonable fallbacks when other systems aren't there

There are other versions of this in the data lab that will get cleaned out once this is working.

In [None]:
import abc
from pathlib import Path
import pathlib
import re
import shutil

In [None]:
#export

def get_config(**kw):
    """Return the local configuration as a dictionary.

    Keyword arguments are accepted but currently ignored; every call
    yields a new, empty dictionary.
    """
    # TODO: Actually manage this in a safe way.
    config = dict()
    return config

In [None]:
ROOT = '/tmp'


def root(**kw):
    """Return the storage root as a ``Path``.

    The ``root`` keyword is used when supplied; otherwise the
    module-level ``ROOT`` default applies.
    """
    location = kw['root'] if 'root' in kw else ROOT
    return Path(location)

def name(**kw):
    """Return the requested name as a string, or ``None`` when absent.

    The ``name`` keyword takes precedence over ``filter``. A compiled
    pattern yields its pattern text, a string is returned unchanged,
    and any other value is converted with ``str``.
    """
    value = kw['name'] if 'name' in kw else kw.get('filter')
    if value is None:
        return None
    if isinstance(value, re.Pattern):
        return value.pattern
    return value if isinstance(value, str) else str(value)

def storage_filter(**kw):
    """Return a compiled pattern built from the ``filter`` keyword.

    A value that is already a compiled pattern is returned as-is.
    Anything else is interpolated into a regular expression that
    allows arbitrary text before and after it; with no ``filter``
    the resulting pattern matches every name.
    """
    candidate = kw.get('filter', '')
    if not isinstance(candidate, re.Pattern):
        candidate = re.compile(f".*{candidate}.*")
    return candidate

def list_buckets(**kw):
    """Return the directories under the root whose names match the filter.

    The root comes from ``root(**kw)`` and the pattern from
    ``storage_filter(**kw)``. Entries are returned as ``Path`` objects
    in the order the directory listing yields them.
    """
    base = root(**kw)
    pattern = storage_filter(**kw)
    buckets = []
    for entry in base.iterdir():
        # Only directories count as buckets; plain files are skipped.
        if entry.is_dir() and pattern.match(entry.name):
            buckets.append(entry)
    return buckets

In [None]:
# root(): an explicit `root` keyword wins; otherwise the module-level ROOT is used.
assert root(root='a') == Path('a')
assert root() == Path(ROOT)

# name(): None when nothing is given; `name` and `filter` are both accepted;
# compiled patterns give their pattern text and other values are stringified.
assert name() is None
assert name(name='name') == 'name'
assert name(filter='name') == 'name'
r = re.compile('name')
assert name(name=r) == 'name'
assert name(filter=r) == 'name'
assert name(name=42) == '42'
assert name(filter=42) == '42'

# storage_filter(): the default pattern matches everything; a string filter
# matches names containing it; a compiled pattern is used directly.
assert storage_filter().pattern == '.*.*'
assert storage_filter().match('')
assert storage_filter().match('anything')
assert storage_filter(filter='foo').match('foo')
assert storage_filter(filter=re.compile('foo')).match('foo')

# list_buckets(): reads the real ROOT directory, so only the return type is checked.
assert isinstance(list_buckets(), list)


In [None]:
class LocalObjectStore:
    """Bucket-level object store backed by directories on the local filesystem.

    Every bucket is a directory directly under ``root``. Behavior is
    configured through keyword arguments given to the constructor:

    * ``root``: directory that holds the buckets (defaults to ``ROOT``).
    * ``filter``: a string or compiled pattern. A string is both the bucket
      name used by ``find_or_create_bucket`` and a "contains" regular
      expression used to select existing buckets.
    * ``force``: when truthy, ``remove_bucket`` may delete non-empty buckets.
    """

    # Machine-specific default; pass ``root=...`` to use another location.
    ROOT = "/Users/davidrichards/.data"

    def __init__(self, **kw):
        self.kw = kw

    @property
    def root(self):
        """Root directory as a ``Path``; resolved once and then reused."""
        if not hasattr(self, '_root'):
            self._root = Path(self.kw.get('root', self.ROOT))
        return self._root

    # NOTE: the properties below are deliberately not memoized. The previous
    # ``hasattr(self, '__x')`` checks never matched the name-mangled
    # ``self.__x`` attributes, so nothing was ever cached, and the directory
    # contents can change between calls (create, then list, then remove).

    @property
    def _name(self):
        """Bucket name derived from ``filter``: a string, or None if absent."""
        value = self.kw.get('filter', None)
        if isinstance(value, re.Pattern):
            return value.pattern
        if value is None or isinstance(value, str):
            return value
        return str(value)

    @property
    def _filter(self):
        """Filter for whatever operation, bucket-level, object level."""
        value = self.kw.get('filter', '')
        if isinstance(value, re.Pattern):
            return value
        return re.compile(f".*{value}.*")

    @property
    def _list_paths(self):
        """Sorted ``Path`` objects of the bucket directories matching the filter.

        Returns an empty list when the root directory does not exist yet,
        since ``find_or_create_bucket`` creates it on demand.
        """
        base = self.root
        if not base.is_dir():
            return []
        pattern = self._filter
        # Sorting makes the "first" bucket deterministic; iterdir() order is arbitrary.
        return sorted(
            entry for entry in base.iterdir()
            if entry.is_dir() and pattern.match(entry.name)
        )

    def list_buckets(self):
        """Return the names of the buckets that match the filter."""
        return [p.name for p in self._list_paths]

    @property
    def _first(self):
        """First matching bucket path, or None when nothing matches."""
        paths = self._list_paths
        return paths[0] if paths else None

    @property
    def _exists(self):
        """True when a matching bucket directory is present."""
        first = self._first
        return first is not None and first.exists()

    @property
    def _is_empty(self):
        """True when there is no matching bucket or it has no entries."""
        first = self._first
        if first is None or not first.exists():
            return True
        return not any(first.glob('*'))

    @property
    def _can_remove_bucket(self):
        """True when the bucket is empty or ``force`` was requested."""
        return self._is_empty or bool(self.kw.get('force', False))

    def find_or_create_bucket(self):
        """Create the bucket named by ``filter`` if needed and return its name.

        Raises:
            TypeError: when no ``filter`` keyword was given, so there is
                no bucket name to create.
        """
        bucket = self._name
        if bucket is None:
            raise TypeError(
                "find_or_create_bucket requires a 'filter' keyword naming the bucket"
            )
        path = self.root / bucket
        path.mkdir(parents=True, exist_ok=True)
        return path.name

    def remove_bucket(self):
        """Delete the first matching bucket.

        Returns:
            True when a bucket directory was deleted. False when nothing
            matched, when the bucket is non-empty and ``force`` is not set,
            or when the deletion failed.
        """
        target = self._first
        if target is None:
            # Nothing matched, so nothing was removed.
            return False
        if not self._can_remove_bucket:
            return False
        try:
            shutil.rmtree(target)
        except OSError:
            # Report the failure instead of claiming success.
            return False
        return True

In [None]:
# Shared keywords for the stores exercised in this cell.
base = dict(
    root='/tmp'
)
s = LocalObjectStore(filter='power', **base)
assert s.root == Path('/tmp')

# A string filter becomes a pattern that matches any name containing it.
assert isinstance(s._filter, re.Pattern)
matching = ['power', 'this_power_that', 'this_power', 'power_that']
for match in matching:
    assert s._filter.match(match)
    
# Scratch exploration against /tmp; the commented lines are alternatives
# tried while experimenting.
s = LocalObjectStore(filter='new_purpose', **base)
# s = LocalObjectStore(filter='power', **base)
# s.find_or_create_bucket()
# s.list_buckets()
s._first
# s.remove_bucket()
# s._filter
# s.find_or_create_bucket()
# Attempts to remove the first bucket under /tmp whose name matches 'new_purpose'.
s.remove_bucket()

True

I'm not sure I like returning a `PosixPath`, because that exposes an internal detail. It would be better to convert it to a string for any public-facing value.

In [None]:
# NOTE(review): scratch cell. `path` is not defined anywhere in the visible
# source, and the recorded output below is a FileNotFoundError for '/tmp/bar'.
shutil.rmtree(path)

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/bar'

In [None]:
class ObjectStoreInterface(abc.ABC):
    """Mid-level interface for storing objects.

    Objects include models, treatment configurations, evaluations,
    expectations, slips, data, metadata, and probably more.

    The interface gives mid-level access to whichever store, or stack
    of stores, is in use. The design principle is consistent: push
    detail deeper.

    Tentatively, the object store spans three layers of the lab:
    backend storage (MinIO, local files, shared volume), object-level
    management (this interface), and purpose-driven work (model
    training, data extraction, progress review, expectation
    development, model deployment, or production monitoring). This
    wrapping may not add enough flexibility, transparency, or something
    else not yet considered; think of stream processing, distributed
    training pipelines, and vendor-supplied services.

    It is also an open question whether an interface is useful here.
    The interfaces tend to be too loose: ``**kw`` for everything and
    unfiltered keywords passed along, which makes it impossible to read
    or enforce expected types.
    """

    # NOTE(review): every method names its first parameter ``cls`` but none is
    # decorated with ``@classmethod``, and none is marked ``@abc.abstractmethod``,
    # so the class can be instantiated and each call simply returns None.

    def list_buckets(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def find_or_create_bucket(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def remove_bucket(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def find_items(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def put(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def get(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def get_stats(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def copy(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""

    def remove(cls, **kw):
        """Placeholder: accepts any keywords and returns None."""