In [None]:
#export
from lab.imports import *
import lab.train.protocols
import random

In [None]:
# default_exp train.local_object_store

# Local Object Store

Namespace for these functions. I'm trying a functional approach instead of wrapping everything in a one-type-use class.

In [None]:
#export

# Lower-cased environment variable names that merge_config lets through.
WHITE_LIST_CONFIG = ['root', 'name', 'filter', 'force']

def white_list(keywords, keys, require_keys=False, expand_missing=False):
    """Return a new dict holding only the entries of `keywords` named in `keys`.

    The mode is chosen in this order:

    * `require_keys`: every key must be present; a missing one raises KeyError.
    * `expand_missing`: every key appears in the result; missing ones map to None.
    * otherwise: keys absent from `keywords` are skipped.
    """
    if require_keys:
        return {name: keywords[name] for name in keys}
    if expand_missing:
        return {name: keywords.get(name) for name in keys}
    return {name: keywords[name] for name in keys if name in keywords}

def merge_config(**kw):
    """Build a configuration dict from the environment plus keyword overrides.

    Environment variable names are lower-cased and reduced to the entries
    listed in WHITE_LIST_CONFIG. The keyword arguments are then layered on
    top, so an explicit keyword always wins over the environment.

    Only the environment side is white-listed for now; keywords pass
    through unfiltered."""
    lowered = {}
    for key, value in os.environ.items():
        lowered[key.lower()] = value
    merged = dict(white_list(lowered, WHITE_LIST_CONFIG))
    merged.update(kw)
    return merged

# TODO:

* Move the above into imports
* Move what I have in imports into a notebook
* Create a consistent set of configuration expectations (keys that work with local, minio, some of the ops)

All of this should be done in a notebook first, because there are 100 things to test.

More-specifically:

* read an environment file, not settings, not .env
* read .env
* manage white lists and merges
* create a constant for the configuration
* ensure nothing sensitive is being shared (.gitignore, cached in a notebook)
* make sure I have easy access to namespaced configuration
* create a default set of parameters for basic ops (object store, at least)

In [None]:
d = dict(a=1, b=2)

assert 'b' not in white_list(d, ['a'])
assert 'c' in white_list(d, ['a', 'c'], expand_missing=True)
assert white_list(d, ['c'], expand_missing=True)['c'] is None

with check_raises():
    white_list(d, ['c'], require_keys=True)
    
known_keys = ['force', 'filter', 'name', 'root']
for key in known_keys:
    assert key in WHITE_LIST_CONFIG


This is interesting, to bring in the environment variables. It's because in a Docker container, this is the best way to handle configuration.

In [None]:
#export

ROOT = '/tmp'
def _root(**kw):
    """Return the store's root directory as a Path.

    Uses the 'root' entry of the merged configuration, else ROOT."""
    config = merge_config(**kw)
    location = config.get('root', ROOT)
    return Path(location)

def _is_re(o):
    """Deal with whether an object is a regular expression.
    Modern Python uses re.Pattern for all compiled regular
    expressions, but in 3.6.9 and ostensibly before, I don't
    have this. So, duck type it: if it has match and pattern,
    good enough."""
    return hasattr(o, 'match') and hasattr(o, 'pattern')

def _name(**kw):
    """Return the configured name as a string, or None if there is none.

    The value comes from 'name', falling back to 'filter', because each
    key reads naturally in different calls. A compiled regular expression
    contributes its pattern text; any other non-string is passed
    through str()."""
    config = merge_config(**kw)
    candidate = config.get('name', config.get('filter', None))
    if candidate is None:
        return None
    if _is_re(candidate):
        return candidate.pattern
    return candidate if isinstance(candidate, str) else str(candidate)

def _bucket_filter(**kw):
    """Return a compiled pattern used to select bucket names.

    The value comes from 'name', falling back to 'filter'. A compiled
    pattern is returned unchanged; anything else is embedded between
    `.*` wildcards, so with no value at all every name matches."""
    config = merge_config(**kw)
    wanted = config.get('name', config.get('filter', ''))
    return wanted if _is_re(wanted) else re.compile(f".*{wanted}.*")

In [None]:
assert _root(root='a') == Path('a')
assert _root() == Path(ROOT)

assert _name() is None
assert _name(name='name') == 'name'
assert _name(filter='name') == 'name'
r = re.compile('name')
assert _name(name=r) == 'name'
assert _name(filter=r) == 'name'
assert _name(name=42) == '42'
assert _name(filter=42) == '42'

assert _bucket_filter().pattern == '.*.*'
assert _bucket_filter().match('')
assert _bucket_filter().match('anything')
assert _bucket_filter(name='foo').match('foo')
assert _bucket_filter(name=re.compile('foo')).match('foo')
assert _bucket_filter(filter='foo').match('foo')
assert _bucket_filter(filter=re.compile('foo')).match('foo')

Getting root, a name, and a filter for buckets. This is mostly just establishing a convention.

The gap between bucket/directory and item/file is a little weird. I opaquely thought that types would begin to straighten that out. Given a type of a thing, when I put it, I can find or create an appropriate bucket for it, given a mapping.

There's a bit of weirdness going on with:

* filter/name being top-level
* calling keyword extrapolation several times for a single call
* from opaque-to-specific on something like a bucket or a type of thing to enforce

The idea is I get a set of subject references, say from a filter or something like ['research', 'supervised', 'small']. Then, I get a set of treatments like ['common', 'first pass']. Then I look for or create each model, allowing for a comparative replacement to be at least stored in memory. I train the models, evaluate them, possibly label, version, and store them. Maybe the treatment can fill in the invocation gaps, if I need those. Maybe I run some post-training diagnostics on subjects and models for expectations.

A lot of this is finding, organizing, and creating objects. There are fundamental issues with the above and below, so what I'm going to do is create a fake set of sequences, see how to work it.


Dataset Name	Brief description	Preprocessing	Instances	Format	Default Task	Created (updated)	Reference	Creator

Bike Sharing Dataset	Hourly and daily count of rental bikes in a large city.	Many features, including weather, length of trip, etc., are given.	17,389	Text	Regression	2013	[427][428]	H. Fanaee-T

New York City Taxi Trip Data	Trip data for yellow and green taxis in New York City.	Gives pick up and drop off locations, fares, and other details of trips.	6 years	Text	Classification, clustering	2015	[429]	New York City Taxi and Limousine Commission

Taxi Service Trajectory ECML PKDD	Trajectories of all taxis in a large city.	Many features given, including start and stop points.	1,710,671	Text	Clustering, causal-discovery	2015	[430][431]	M. Ferreira et al.


In [None]:
# Hand-written sample subject records (one full, one name-only) used to
# exercise the raw-to-prototype helpers.
_subjects_raw = [
    dict(
        name='bike',
        short_description='Bike Sharing Dataset',
        description='Hourly and daily count of rental bikes in a large city.',
        preprocessing='Many features, including weather, length of trip, etc., are given.',
        instances=17389,
        format='text',
        created_on=dict(year=2013),
        url='tbd',
        contributors=['H. Fanaee-T'],
        tags=['regression', 'medium', 'supervised'],
    ),
    dict(name='test'),
]

In [None]:
import lab.train.protocols as protocol_utils

def _dict_from_name(o):
    if isinstance(o, dict): return d
    return {'name': o}

def _list_of_dicts(l):
    """Apply _dict_from_name to every element of `l`, returning a new list."""
    return list(map(_dict_from_name, l))

def _value_format(o):
    """Normalise one raw value: lists become lists of dicts, the rest pass through."""
    return _list_of_dicts(o) if isinstance(o, list) else o

def _raw_format(d):
    """Return a copy of the raw record `d` with every value run through _value_format."""
    formatted = {}
    for field, value in d.items():
        formatted[field] = _value_format(value)
    return formatted

def _see_saw(d, builder):
    """Build a message from `d`, convert it back to a dict, and rebuild it.

    The message returned is the one built from the converted dict."""
    first_pass = builder(**d)
    round_tripped = protocol_utils.to_dict(first_pass)
    return builder(**round_tripped)

def _raw_to_prototype(raw, kind):
    """Turn one raw record into a message of the given `kind`.

    The record is normalised with _raw_format, then passed through
    _see_saw with a MessageBuilder for `kind`."""
    formatted = _raw_format(raw)
    make_message = protocol_utils.MessageBuilder(kind=kind)
    return _see_saw(formatted, make_message)

def _list_to_prototypes(l, kind):
    """Convert every raw record in `l` into a message of the given `kind`."""
    prototypes = []
    for record in l:
        prototypes.append(_raw_to_prototype(record, kind))
    return prototypes

In [None]:
generated = _list_to_prototypes(_subjects_raw, 'Subject')
assert isinstance(generated, list)
message = generated[0]
assert type(message) == training_prototypes.Subject
message

name: "bike"
type: "supervised"
size: "medium"
short_description: "Bike Sharing Dataset"
description: "Hourly and daily count of rental bikes in a large city."
preprocessing: "Many features, including weather, length of trip, etc., are given."
instances: 17389
format: "text"
created_on {
  year: 2013
}
url: "tbd"
version {
}
contributors {
  name: "H. Fanaee-T"
}
tags {
  name: "regression"
}
tags {
  name: "medium"
}
tags {
  name: "supervised"
}

Iteratively figuring out what should come out of the object store, how best to access it, what the learning loop needs to do with it.

In [None]:
a = []
h, *t = a

ValueError: not enough values to unpack (expected at least 1, got 0)

In [None]:
# For each kind of object, the ordered field names that make up its
# storage key. Order matters: keys and prefixes are joined left to right.
TYPE_PREFIX_MAP = dict(
    subject = ['type', 'size', 'name', 'year', 'month', 'version'],
    treatment = ['type', 'name'],
    model = ['treatment', 'name', 'type'],
    evaluation = ['type', 'name', 'year', 'month'],
    expectation = ['source_type', 'name', 'year', 'month'],
    invocation = ['name', 'version'],
)

def _find_on_message(message, attribute_path):
    if len(attribute_path) == 0: return message
    if len(attribute_path) == 1:
        attribute = attribute_path[0]
        if hasattr(message, attribute):
            return getattr(message, attribute)
        return None
    attribute, *attribute_path = attribute_path
    if hasattr(message, attribute):
        message = getattr(message, attribute)
        return _find_on_message(message, attribute_path)
    
def _created_on_year(message):
    """Return `message.created_on.year`, or None if either attribute is missing."""
    path = ['created_on', 'year']
    return _find_on_message(message, path)

def _created_on_month(message):
    """Return `message.created_on.month`, or None if either attribute is missing."""
    path = ['created_on', 'month']
    return _find_on_message(message, path)

def _version_string(message):
    """Format the message's version as 'major.minor.patch'.

    Returns None when the message has no `version` attribute."""
    found = _find_on_message(message, ['version'])
    if found is None:
        return None
    major, minor, patch = found.major, found.minor, found.patch
    return f"{major}.{minor}.{patch}"

# Key parts that are not read as a same-named attribute of the message:
# each maps to the function that extracts the value.
FIELD_TYPE_MAP = dict(
    year = _created_on_year,
    month = _created_on_month,
    version = _version_string,
)
# Separator placed between the parts of a key or prefix.
NAME_DELIMITER = "|"

def _value_from_message(message, key):
    """Return the string form of one key part taken from `message`.

    Keys listed in FIELD_TYPE_MAP use their extractor; any other key is
    read as an attribute of the same name. A missing value becomes ''."""
    extractor = FIELD_TYPE_MAP.get(key)
    if extractor is None:
        found = _find_on_message(message, [key])
    else:
        found = extractor(message)
    return '' if found is None else str(found)

def _build_until_missing(keys, d):
    result = []
    for key in keys:
        if not key in d:
            break
        result.append(str(d[key]))
    return result

def _prefix_from_dictionary(d, kind):
    """Build a search prefix for `kind` from the values supplied in `d`.

    The ordered keys for `kind` are consumed until one is absent from `d`,
    and the values found are joined with NAME_DELIMITER. Returns None for
    a kind that TYPE_PREFIX_MAP does not know."""
    if kind not in TYPE_PREFIX_MAP:
        return None
    parts = _build_until_missing(TYPE_PREFIX_MAP[kind], d)
    return NAME_DELIMITER.join(parts)

def _key_from_message(message, kind):
    """Build the full key for `message`, using the ordered fields of `kind`.

    Every field contributes a part (empty when the value is missing), and
    the parts are joined with NAME_DELIMITER. Returns None for a kind that
    TYPE_PREFIX_MAP does not know."""
    ordered_fields = TYPE_PREFIX_MAP.get(kind)
    if ordered_fields is None:
        return None
    parts = []
    for field in ordered_fields:
        parts.append(_value_from_message(message, field))
    return NAME_DELIMITER.join(parts)

In [None]:
def _list_to_dict(l):
    """Test helper: key == value, so expectations are obvious."""
    return {k:k for k in l}

assert _prefix_from_dictionary({}, 'weird') is None
assert _prefix_from_dictionary(_list_to_dict(['a', 'b']), 'subject') == ''
assert _prefix_from_dictionary(
    _list_to_dict(['type', 'size', 'name', 'month']), 'subject'
) == 'type|size|name'
assert _prefix_from_dictionary(
    _list_to_dict(['type', 'name', 'foo']), 'treatment'
) == 'type|name'

message = _raw_to_prototype(_subjects_raw[0], 'Subject')
message

assert _key_from_message(message, 'weird') is None
assert _key_from_message(message, 'subject') == 'supervised|medium|bike|2013|0|0.0.1'

Because MinIO uses prefixes only to filter items, so do I, even with local storage. These are ordered, so we build a prefix from a dictionary, as long as each value in the ordered list is provided.

In [None]:
_prefix_from('subject', dict(purpose=1, size=2, year=2020))

'1|2'

In [None]:
_prefix_from('treatment', dict(purpose=1, name=2, foo=3))

'1|2'

In [None]:
a = list('abcdefg')
d = dict(a=1, b=2, d=3)

In [None]:
for k in a:
    if not k in d:
        break
    print(k)

a
b


What I now know is that the put operation needs to serialize or compress an item or group of items, and that the get operation needs to deserialize or decompress an item or group of items.

I know that goofing with the data (mocking it midway) shows me more of what I need to do on the types.

I also know that the old school stuff with remote vs local data is a thing. I also know that there probably needs to be a different kind of object storage for metadata vs data. Meaning, subjects and models get stored in some state of compression/raw/whatever, and the metadata is something that has types.

Also, there could be different formats of the same subject, which is different than versions. A version on a model just increments. I treated it with X, and it gets a version 0.2.1.

* type relations
* learning type
* pull-through storage
* processing on the pull through (say tab-separated to CSV or even a preprocessing pipeline...more tbd)
* attribute updates
* more subject raw data
* more treatment raw/fake data
* more evaluation raw data


In [None]:
a = dict()
a

{}

In [None]:
list(a)

[]

In [None]:
# Sentinel for `limit`: return every item instead of truncating the list.
ALL_RESULTS = -1
# Placeholders; no code visible in this notebook reads them yet.
ALPHABETIC_SORT = 'todo'
RANDOM_SORT = 'todo'

def _alphabetic_sort(seq):
    """Dictionaries need a key"""
    try:
        return sorted(seq)
    except:
        return seq

def _random_sample(seq):
    seq = list(seq)
    k = len(seq)
    return random.sample(seq, k=k)

def _by_dictionary_key(seq, k):
    seq = list(seq)
    return sorted(seq, key=lambda d: d.get(k, 0))

def _by_name(seq):
    """Sort a sequence of dictionaries by their 'name' entry."""
    return _by_dictionary_key(seq, k='name')

# Sort strategies selectable by name; each takes a sequence and returns
# the (re)ordered items.
SORT_MAP = dict(
    alphabetic = _alphabetic_sort,
    name = _by_name,
    random = _random_sample,
)
# Strategy used when the requested sort is not a callable or a known name.
DEFAULT_SORT = 'name'
# The kinds of item the store knows about. 'expectation' was previously
# misspelled here, which left it out of ITEMS_MAP entirely.
_item_types_list = [
    'subject', 'treatment', 'model', 'evaluation',
    'expectation', 'invocation',
]
# Item kind -> bucket name (the kind with an 's' appended).
ITEMS_MAP = {k:f'{k}s' for k in _item_types_list}
# Item kind used when a requested kind is not in ITEMS_MAP.
DEFAULT_ITEM = 'model'
    
def _get_sort(o):
    """Resolve `o` to a sort function.

    A callable is used as given; a name found in SORT_MAP selects that
    strategy; anything else falls back to DEFAULT_SORT."""
    if callable(o):
        return o
    chosen = o if o in SORT_MAP else DEFAULT_SORT
    return SORT_MAP[chosen]

def _label_filter_for(labels, limit):
    pass

def _get_items_with(bucket, item_filter):
    # Do extract here...
    # return list('abcdefghijklmnopqrstuvwxyz')
    return [{'name': k} for k in random.sample(list('abcdef'), k=6)]

def _get_objects_by_label(
    labels, bucket, limit=ALL_RESULTS, sort=DEFAULT_SORT, **kw):
    """Fetch the items of one kind that match `labels`, sorted and truncated.

    labels: labels handed to the (still stubbed) label filter.
    bucket: item kind such as 'subject'; it is translated to a bucket
        name through ITEMS_MAP, and an unknown kind uses the default
        kind's bucket.
    limit: maximum number of items to return, or ALL_RESULTS for all.
    sort: a callable, or the name of a strategy in SORT_MAP.
    kw: accepted for call compatibility; not used yet.
    """
    # Fall back to the default kind's *bucket name* ('models'), not the bare
    # kind ('model'), so the fallback has the same shape as a mapped value.
    bucket = ITEMS_MAP.get(bucket, ITEMS_MAP[DEFAULT_ITEM])
    label_filter = _label_filter_for(labels, limit)
    items = _get_items_with(bucket, label_filter)

    items = _get_sort(sort)(items)

    if limit == ALL_RESULTS:
        return items
    return items[:limit]

def get_subjects(labels, **kw):
    """Look up items for `labels`, asking for the 'subject' kind."""
    return _get_objects_by_label(labels, bucket='subject', **kw)

def get_treatments(labels, **kw):
    """Look up items for `labels`, asking for the 'treatment' kind."""
    return _get_objects_by_label(labels, bucket='treatment', **kw)

def get_models(labels, **kw):
    """Look up items for `labels`, asking for the 'model' kind."""
    return _get_objects_by_label(labels, bucket='model', **kw)

def get_evaluations(labels, **kw):
    """Look up items for `labels`, asking for the 'evaluation' kind."""
    return _get_objects_by_label(labels, bucket='evaluation', **kw)

def get_expectations(labels, **kw):
    """Look up items for `labels`, asking for the 'expectation' kind."""
    return _get_objects_by_label(labels, bucket='expectation', **kw)

def get_invocations(labels, **kw):
    """Look up items for `labels`, asking for the 'invocation' kind."""
    return _get_objects_by_label(labels, bucket='invocation', **kw)

def dispatch(subject, treatment, evaluations):
    """Stub for sending a training job; for now it only prints a marker."""
    # TODO: include subject, treatment and evaluations once this does real work.
    print("Dispatch")

def get_trained(subject, treatment, evaluations):
    """Stub for finding an already-trained model; prints a marker without a newline."""
    # TODO: include subject, treatment and evaluations once this does real work.
    print("Find", end="...")

def get_or_train(subject, treatment, evaluations):
    """Simulate 'use the trained model if there is one, otherwise train it'.

    The lookup is always attempted; a coin flip then stands in for
    whether it found something."""
    get_trained(subject, treatment, evaluations)
    needs_training = random.random() > 0.5
    if not needs_training:
        print('Found')
        return
    dispatch(subject, treatment, evaluations)

In [None]:
found = _get_objects_by_label('labels', 'bucket', limit=3)
assert len(found) == 3
names = [e['name'] for e in found]
assert sorted(names) == names

In [None]:
subjects = get_subjects(
    ['research', 'supervised', 'small'],
    limit=2, sort='random'
)

treatments = get_treatments(['common', 'supervised', 'first_pass'], limit=1)
evaluations = get_evaluations(['supervised'], limit=1)

In [None]:
def learn(subjects, treatments, evaluations):
    """Call get_or_train for every (subject, treatment) pair, subjects outermost."""
    pairs = ((subject, treatment) for subject in subjects for treatment in treatments)
    for subject, treatment in pairs:
        get_or_train(subject, treatment, evaluations)

In [None]:
learner = partial(learn, subjects, treatments, evaluations)
for _ in range(3):
    learner()
    print('_' * 10)

Find...Dispatch
Find...Dispatch
__________
Find...Dispatch
Find...Found
__________
Find...Dispatch
Find...Dispatch
__________


In [None]:
def _create_step(name, message=None, status='passed', **kw):
    d = {**kw, **dict(name=name, status=status)}
    if not message is None: d['message'] = message
    return d

def _fail_step(name, exception=None, **kw):
    """Build a step dictionary with status 'failed'.

    The step's message is the text of `exception`. When no exception is
    given the message is left out, instead of recording the literal
    string 'None'. Extra keywords are stored on the step."""
    message = None if exception is None else str(exception)
    return _create_step(name, status='failed', message=message, **kw)

def validate_treatment(fn, **kw):
    """Check that a treatment factory can be called and its result instantiated.

    `fn(**kw)` is expected to return a class (or other callable), which is
    then called with no arguments. Returns `(True, steps)` when both
    stages succeed. On any Exception, returns `(False, steps)` where the
    last step is a failed step carrying the error text and its traceback.
    """
    steps = []
    current = 'setup class'
    try:
        cls = fn(**kw)
        steps.append(_create_step(current))

        current = 'setup model'
        cls()
        steps.append(_create_step(current))
    except Exception as error:
        failure = _fail_step(
            current, exception=error, traceback=error.__traceback__)
        steps.append(failure)
        return False, steps

    # TODO: check duck typing
    return True, steps
    
def format_validation(steps, valid=None):
    """Render validation steps as a newline-joined, human-readable report.

    steps: iterable of step dictionaries with optional 'name', 'status'
        ('passed' or 'failed') and 'message' entries.
    valid: overall outcome. When None it is derived from the steps:
        valid means no step has status 'failed'.

    A valid report lists the names of the passed steps. An invalid report
    explains every step in order, including steps whose status is neither
    'passed' nor 'failed'.
    """
    steps = list(steps)

    def with_status(status):
        return [step for step in steps if step.get('status') == status]

    def name_of(step):
        # A step without a name must not break the report.
        return str(step.get('name', "unknown"))

    def explain(step):
        name = name_of(step)
        status = step.get('status')
        if status == 'passed':
            return f"Success with: {name}"
        if status == 'failed':
            reason = step.get('message', "reason unknown")
            return f"Failing step: {name}. Reason: {reason}."
        text = f"Possible problem with {name}. That status was not set."
        reason = step.get('message')
        if reason is not None:
            # This must be an f-string, or the literal "{reason}" is printed.
            text += f" Message: {reason}"
        return text

    failed = with_status('failed')
    if valid is None:
        valid = not failed

    if valid:
        messages = ["Validation was successful."]
        passed_names = [name_of(step) for step in with_status('passed')]
        if passed_names:
            messages.append(f"Steps: {', '.join(passed_names)}")
        else:
            messages.append("No successful steps were reported.")
    else:
        messages = ["Validation failed."]
        messages.extend(explain(step) for step in steps)
        if not failed:
            messages.append("No failing steps were reported.")

    return "\n".join(messages)
    
def store_treatment(fn, **kw):
    """Validate a treatment and (eventually) store it.

    Returns `(valid, steps)` in every case, so callers can unpack the
    result whether validation passed or not. When validation fails, the
    formatted report is printed as well. Storing is not implemented yet.
    """
    valid, steps = validate_treatment(fn, **kw)
    if not valid:
        print(format_validation(steps, valid=False))
    # TODO: extract/enforce more treatment attributes
    # TODO: think about object store dependency
    # object_store.put(validation)
    return valid, steps

In [None]:
store_treatment('foo')

Validation failed.
Failing step: setup class. Reason: 'str' object is not callable.


(False,
 [{'traceback': <traceback at 0x115075900>,
   'name': 'setup class',
   'status': 'failed',
   'message': "'str' object is not callable"}])

In [None]:
l2 = lambda: "This would be the model."
l1 = lambda **kw: l2
store_treatment(l1)

In [None]:
message = None
problem = None
tback = None
try:
    raise AttributeError("Cool message bro.")
except Exception as e:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    print('!' * 100)
    print(exc_type)
    print(exc_value)
    print(exc_traceback)
    print('_' * 100)

    tback = exc_traceback
    problem = e

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
<class 'AttributeError'>
Cool message bro.
<traceback object at 0x114852b80>
____________________________________________________________________________________________________


Although really rough, this is cool:

* create a way to validate each type.
* use a consistent workflow to validate, report, and store objects
* push all the object store details lower

What I think happens is I mention something I want to use, and the system looks for it. If I'm not satisfied, because it's incomplete or I want to improve it, then I either branch from that concept or start something fresh.

Maybe I iteratively get a subject into the lab. The reference is there, or I read a paper using a public dataset. I figure out if I want it stored locally, or where, if anywhere. I figure out what processing should be done on it. Maybe I have some downsampling tools, maybe I have some EDA tools, maybe I build a pipeline, maybe I just goof with the data in a notebook, informally figuring out how to work with the subject.

Meanwhile, there are notes about the dataset, about the tools I'm using, about the papers I'm reading. These should be treated like slips. I'm not very happy with the documentation tools in nbdev yet, but maybe I figure it out well enough to add a slip annotation to a notebook. The idea is the cell becomes the slip, and I just leave the full reference or a citation at the bottom.

So, it's iterative, and this interface is about right, yes?

In [None]:
import traceback

In [None]:
tb = traceback.extract_tb(tback)

In [None]:
print(tb)

[<FrameSummary file <ipython-input-297-0baecb16672e>, line 5 in <module>>]


In [None]:
problem.__dict__

{}

In [None]:
str(problem)

'Cool message bro.'

In [None]:
a = [
    dict(name='a', status='passed'),
    dict(name='b', status='passed'),
    dict(name='c', status='failed', message='Explain problems with c'),
    dict(name='d', status='passed'),
    dict(name='e'),
    dict(name='f', status='failed'),
    dict(status='failed', message="Something went wrong in a malformed step"),
]

In [None]:
print(format_validation(a, True))

Validation was successful.
Steps: a, b, d


In [None]:
print(format_validation(a))

Validation failed.
Success with: a
Success with: b
Failing step: c. Reason: Explain problems with c.
Success with: d
Possible problem with e. That status was not set.
Failing step: f. Reason: reason unknown.
Failing step: unknown. Reason: Something went wrong in a malformed step.


In [None]:
a = [
    {'name': 'a'},
    {'name': 'b', 'status': 'failed'},
]
print(format_validation(a, False))
# print(format_validation([{'name': 'a',}, {'name': 'b', status='failed'}], False))

Validation failed.
Possible problem with a. That status was not set.
Failing step: b. Reason: reason unknown.


In [None]:
print(format_validation([], False))

Validation failed.
No failing steps were reported.


In [None]:
print(format_validation([], True))

Validation was successful.
No successful steps were reported.


In [None]:
#export

def _list_paths(**kw):
    """List the bucket directories under the root.

    In the local object store a bucket is a directory, so this returns
    every directory directly under the root whose name matches the
    optional name/filter pattern."""
    base = _root(**kw)
    pattern = _bucket_filter(**kw)
    found = []
    for entry in base.iterdir():
        if entry.is_dir() and pattern.match(entry.name):
            found.append(entry)
    return found

def list_buckets(**kw):
    """Return the names of the matching bucket directories."""

    # TODO: Consider expanding this to a sequence of bucket types,
    # after working through this interface and the MinIO one.
    names = []
    for path in _list_paths(**kw):
        names.append(path.name)
    return names

def _get_slice(seq, key, default=None):
    """Get key from a sequence, returning a default.
    This is the same as d.get(key, default), but allowing
    a slice instead of just a key (so _get_slice(a, slice(2:4))
    works).
    Note: I use this here on sequences, rather than maps,
    a quick way to say something like a[0] when a is
    potentially empty."""
    d = dict(enumerate(seq))
    return d.get(key, default)

def _first_path(**kw):
    """Return the first matching bucket directory under the root, or None."""
    candidates = _list_paths(**kw)
    return _get_slice(candidates, 0)

def _first_bucket(**kw):
    """Return the name of the first matching bucket under the root, or None."""
    names = list_buckets(**kw)
    return _get_slice(names, 0)

def _exists(**kw):
    """Say whether a bucket directory matching the filter exists."""
    found = _first_path(**kw)
    if found is None:
        return False
    return found.exists()

def _is_empty(**kw):
    """Say whether the matching bucket directory has no contents.

    Returns True (is empty) when the directory doesn't exist."""
    # Look the path up once: a second lookup could return None if the
    # directory disappeared in between.
    found = _first_path(**kw)
    if found is None or not found.exists():
        return True
    # One entry is enough to know it is not empty.
    return next(found.glob('*'), None) is None

def _can_remove_bucket(**kw):
    """Say whether the bucket may be removed.

    An empty (or missing) bucket can always be removed. A bucket with
    contents needs `force`. Because `force` may come from an environment
    variable, a string value only counts when it reads as true ('1',
    'true', 'yes' or 'on', case-insensitive); any other string is false.
    """
    config = merge_config(**kw)
    if _is_empty(**config):
        return True
    force = config.get('force', False)
    if isinstance(force, str):
        # Environment values are always strings, so "0" or "false" would
        # otherwise be truthy and force a deletion.
        return force.strip().lower() in ('1', 'true', 'yes', 'on')
    return bool(force)

def _full_path(**kw):
    """Return the bucket's directory: the configured root joined with its name.

    Raises TypeError when no name (or filter) is configured, because
    there is then nothing to join onto the root."""
    # Use the module's own helpers; the bare names `root` and `name` are
    # not defined in this module.
    return _root(**kw) / _name(**kw)
    
def find_or_create_bucket(**kw):
    """Make sure the bucket directory exists and return its name.

    Missing parent directories are created, and an existing directory is
    left as it is."""
    bucket = _full_path(**kw)
    bucket.mkdir(parents=True, exist_ok=True)
    return bucket.name

def remove_bucket(**kw):
    """Delete the bucket directory and everything in it.

    Returns True when the directory was removed. Returns False when
    removal is not allowed (the bucket has contents and `force` is not
    set) or when the filesystem refuses, for example because the
    directory does not exist. Errors other than OSError propagate, so
    programming mistakes are not mistaken for a failed removal.
    """
    if not _can_remove_bucket(**kw):
        return False
    try:
        shutil.rmtree(_full_path(**kw))
    except OSError:
        return False
    return True

# Interface

Anything not in the interface gets an underscore prefix. This is because this file is meant to be used as a namespace:

    import lab.train.local_object_store as object_store
    object_store.list_buckets()
    
Then, when I get my config and MinIO where I want it:

    import lab.train.minio_object_store as object_store
    object_store.list_buckets()

Now I've got a remote and sharable object store that can version my work the way I've wanted to use it. If I'm working on a model, or I've dispatched a model to be trained, the object store will be updated when the work is ready.

The interface is:

    list_buckets
    find_or_create_bucket
    remove_bucket
    find_items
    put
    get
    get_stats
    copy
    remove

I'm working out in here what the signature is for everything.

In [None]:
assert isinstance(list_buckets(), list)

remove_bucket(name='junk', force=True)
assert _full_path(name='junk') == Path(ROOT)/'junk'
assert not _exists(name='junk')
assert _first_bucket(name='junk') is None
assert _first_path(name='junk') is None
assert list_buckets(name='junk') == []
assert _can_remove_bucket(name='junk')

find_or_create_bucket(name='junk')

assert len(list_buckets(name='junk')) == 1
assert _is_empty(name='junk')

Bucket management is kind of cool. We can force a removal of a bucket, even with contents in it, but I'll create objects in the bucket first, then I can come back to that.

In [None]:
def item_filter(**kw):
    """Return the glob pattern that selects items inside a bucket.

    Placeholder: the configuration is merged but not consulted yet, so
    the pattern is always '*'."""
    merge_config(**kw)  # TODO: derive the pattern from the merged configuration
    return '*'
    
def find_items(**kw):
    """List the files in the first matching bucket.

    Returns an empty list when no bucket matches."""
    bucket = _first_path(**kw)
    if bucket is None:
        return []
    pattern = item_filter(**kw)
    return [entry for entry in bucket.glob(pattern) if entry.is_file()]

def put(**kw):
    """Interface placeholder for storing an item; does nothing and returns None."""
    return None

def get(**kw):
    """Interface placeholder for fetching an item; does nothing and returns None."""
    return None

def get_stats(**kw):
    """Interface placeholder for item statistics; does nothing and returns None."""
    return None

def copy(**kw):
    """Interface placeholder for copying an item; does nothing and returns None."""
    return None

def remove(**kw):
    """Interface placeholder for removing an item; does nothing and returns None."""
    return None


In [None]:
find_items()

[]

TODO:

* what is the thing? (open file reference, filename, url, content)
* item filter syntax (include white list above)
* underscore second-class functions (still exported, but not meant for the main interface)
* put an object by its reference
* deal with pickle or better-than-pickle

In [None]:
_first_path()

PosixPath('/tmp/junk')

In [None]:
path = Path('/tmp')

In [None]:
path.is_file()

False