In [None]:
#default_exp data.source

In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.core import *
from local.data.pipeline import *
from local.notebook.showdoc import show_doc

# Data source
> Base container for all the items

## Convenience functions

In [None]:
#export core
def all_union(sets):
    "Single `set` holding every element found in any item of `sets` (each item is `setified` first)"
    merged = set()
    # Fold the items in one at a time rather than splatting them into a single `union` call
    for s in sets: merged.update(setify(s))
    return merged

In [None]:
# Two overlapping lists: the shared element 2 must appear only once in the union.
# NOTE: `sets` is reused by the first `all_disjoint` assertion in a later cell.
sets = [[1,2],[2,3]]
test_eq(all_union(sets), {1,2,3})

In [None]:
#export core
def all_disjoint(sets):
    "`True` iff no element appears in more than one item of `sets`"
    # Setify every item once, up front. This does two things:
    #  - duplicates *inside* a single item (e.g. `[1,1]`) collapse, so they are no longer
    #    miscounted as overlap *between* items;
    #  - `sets` is materialised, so a generator argument is not exhausted by the length
    #    count before the union is computed.
    uniq = [setify(s) for s in sets]
    # Disjoint exactly when merging the items loses no element
    return sum(map(len,uniq))==len(all_union(uniq))

In [None]:
# `sets` is [[1,2],[2,3]] from the `all_union` test cell: the element 2 is shared
assert not all_disjoint(sets)
# No element in common
assert all_disjoint([[1,2],[3,4]])
# An empty item cannot overlap with anything
assert all_disjoint([[1,2],[]])
# A single item is trivially disjoint
assert all_disjoint([[1,2]])
# So is an empty collection of items
assert all_disjoint([])

## DataSource -

In [None]:
# export
@docs
class DataSource(PipedList):
    "Applies a `Pipeline` of `tfms` to filtered subsets of `items`"
    def __init__(self, items, tfms=None, filts=None):
        # Without explicit filters, one subset spans every item
        if filts is None: filts = [range_of(items)]
        self.filts = listify(mask2idxs(f) for f in filts)
        # Subsets must not share items, otherwise the reverse map below would be ambiguous
        assert all_disjoint(self.filts)
        # Reverse map: item position -> index of the subset holding it (`None` if in no subset)
        self.filt_idx = ListContainer([None]*len(items))
        for subset_id,idxs in enumerate(self.filts): self.filt_idx[idxs] = subset_id
        super().__init__(items, tfms)

    @property
    def n_subsets(self):
        return len(self.filts)

    def len(self, filt):
        return len(self.filts[filt])

    def subset(self, i):
        return DsrcSubset(self, i)

    def subsets(self):
        return map(self.subset, range(self.n_subsets))

    def __repr__(self):
        # One line per subset, followed by a trailer line for `self.tfm`
        subset_lines = '\n'.join(str(s) for s in self.subsets())
        return subset_lines + f'\ntfm - {self.tfm}'

    def __getitem__(self, i):
        "Transformed item(s) at `i`"
        its,fts = self.items[i],self.filt_idx[i]
        # Single index: transform the one item, passing along the subset it belongs to
        if not is_iter(i): return self.tfm(its, filt=fts)
        # Several indices (or a mask): transform each item with its own subset index
        return ListContainer(self.tfm(it, filt=f) for it,f in zip(its,fts))

    _docs = dict(len="`len` of subset `filt`",
                 subset="Filtered `DsrcSubset` `i`",
                 subsets="Iterator for all subsets")

# Expose subset 0 and subset 1 as the `train` and `valid` attributes
DataSource.train,DataSource.valid = add_props(lambda i,x: x.subset(i), 2)

A `DataSource` provides filtering and transformation capabilities to a list of items. Although it has all the attributes of `PipedList` (since it's a subclass) they are mainly used internally; you will generally want to instead access its `subset`s.

If you don't pass any filters or transforms, it simply provides a single subset (of type `DsrcSubset`) with the same behavior as a `ListContainer`.

In [None]:
# With no `filts`, every item lands in one single subset (index 0)
inp = [0,1,2,3,4]
dsrc = DataSource(inp)

test_eq(dsrc,inp)            # No filters, so equal to input items
test_eq(dsrc.n_subsets, 1)
test_ne(dsrc, [0,1,2,3,5])   # A single differing element breaks equality
test_eq(dsrc[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsrc[1,2], [1,2])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsrc[mask], [0,3])   # Retrieve two items by mask

In [None]:
#export
@docs
class DsrcSubset():
    "A filtered subset of a `DataSource`"
    # NOTE(review): `DsrcSubset` is defined a second time in the "`DsrcSubset` Methods" section
    # further down, and that later `#export` cell is the definition that wins. This copy is
    # kept behaviourally identical to it; ideally one of the two should be removed.

    # `filts` (plural, historical name) holds the item indices of *this* subset only
    def __init__(self, dsrc, filt): self.dsrc,self.filt,self.filts = dsrc,filt,dsrc.filts[filt]
    # Translate the subset-local position(s) into parent indices and let the parent fetch them
    def __getitem__(self,i): return self.dsrc[self.filts[i]]
    # `filt` is passed by keyword so it cannot be bound to some other positional parameter
    def decode(self, o, **kwargs): return self.dsrc.decode(o, filt=self.filt, **kwargs)
    def decode_at(self, i, **kwargs): return self.decode(self[i], **kwargs)
    # Hand the *encoded* item and this subset's `filt` to the parent's `show`, as the later
    # definition does. Presumably the parent decodes before showing (the `show_at` test further
    # down expects an encoded 6 to be shown as ' * 3'), so decoding here as well would apply
    # the decode step twice.
    def show_at  (self, i, **kwargs): return self.dsrc.show(self[i], filt=self.filt, **kwargs)
    def __len__(self): return len(self.filts)
    def __eq__(self,b): return all_equal(b,self)
    def __repr__(self): return coll_repr(self)
    
    _docs = dict(decode="Transform decode",
                 __getitem__="Encoded item(s) at `i`",
                 decode_at="Decoded item at `i`",
                 show_at="Show decoded item at `i`")

Passing `filts` to the `DataSource` constructor allows you to create multiple subsets, each of type `DsrcSubset`.

In [None]:
# filts can be indices
# Each filter may be a different kind of index collection (here a tensor and a plain list)
dsrc = DataSource(range(5), filts=[tensor([0,2]), [1,3,4]])

test_eq(dsrc.n_subsets, 2)
test_eq(dsrc.subset(0), [0,2])
test_eq(dsrc.train, [0,2])       # Subset 0 is aliased to `train`
test_eq(dsrc.subset(1), [1,3,4])
test_eq(dsrc.valid, [1,3,4])     # Subset 1 is aliased to `valid`
test_eq(dsrc.valid[2], 4)        # Position 2 *within the subset*, i.e. item index 4
# The repr of the whole `DataSource` lists every subset
assert '[1,3,4]' in str(dsrc) and '[0,2]' in str(dsrc)
dsrc

(2 items) [0,2]
(3 items) [1,3,4]
tfm - []

In [None]:
# filts can be boolean masks (they don't have to cover all items, but must be disjoint)
# One boolean mask per subset, each as long as the item list
filts = [[False,True,True,False,True], [True,False,False,False,False]]
dsrc = DataSource(range(5), filts=filts)

test_eq(dsrc.train, [1,2,4])   # positions where the first mask is True
test_eq(dsrc.valid, [0])       # item 3 is False in both masks, so it is in neither subset

Pass `tfms` to have transformations applied before returning items.

In [None]:
# NOTE(review): looks like a scratch check left over from development — a `DataSource` is not
# itself an `Iterator` (the cell evaluates to False). Consider removing or turning into a test.
isinstance(dsrc, Iterator)

False

In [None]:
# NOTE(review): scratch check as well — the underlying `items` container counts as iterable
# for `is_iter` (the cell evaluates to a truthy 1). Consider removing or turning into a test.
is_iter(dsrc.items)

1

In [None]:
# apply transforms to all items
# The functions are applied in list order: double first, then add one
tfms = [lambda x: x*2,lambda x: x+1]
filts = [[1,2],[0,3,4]]
dsrc = DataSource(range(5), tfms, filts=filts)
test_eq(dsrc.train,[3,5])              # items 1,2   -> 1*2+1, 2*2+1
test_eq(dsrc.valid,[1,7,9])            # items 0,3,4 -> 0*2+1, 3*2+1, 4*2+1
test_eq(dsrc.train[False,True], [5])   # a boolean mask indexes within the subset

The subset index is also passed to your transform (as `filt`), so a `Transform` created with a specific `filt` is only applied to items of the subset whose index matches it.

In [None]:
# only transform subset 1
# Toy transform for the examples below: doubles on encode, halves on decode,
# and renders a value as " * <value>" when shown.
class _Tfm(Transform):
    def encodes(self, x): return x*2
    def decodes(self, x): return x//2   # floor division, so doubled ints come back as ints
    def shows(self, x): return f" * {x}"
        
# `filt=1` restricts the transform to subset 1 (the tests that follow leave subset 0 untouched)
tfm = _Tfm(filt=1)

In [None]:
# `tfm` is the `_Tfm(filt=1)` instance created in the previous cell
dsrc = DataSource(range(5), tfm, filts=[[1,2],[0,3,4]])
test_eq(dsrc.train,[1,2])              # subset 0: transform skipped, items unchanged
test_eq(dsrc.valid,[0,6,8])            # subset 1: items 0,3,4 doubled
test_eq(dsrc.train[False,True], [2])   # mask indexing on the untransformed subset
dsrc

(2 items) [1,2]
(3 items) [0,6,8]
tfm - [<class '__main__._Tfm'>]

### `DataSource` Methods

You won't need to use many methods of `DataSource`, since normally you'll be accessing subsets, and therefore will be using `DsrcSubset` methods. However there are a few `DataSource` methods that may be useful:

In [None]:
# Render the generated documentation entry for `DataSource.len`
show_doc(DataSource.len)

<h4 id="DataSource.len" class="doc_header"><code>len</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>len</code>(**`filt`**)

`len` of subset `filt`

In [None]:
# Sizes of the two subsets of the `dsrc` built with filts=[[1,2],[0,3,4]]
[dsrc.len(i) for i in range(2)]

[2, 3]

In [None]:
# Render the generated documentation entry for `DataSource.subset`
show_doc(DataSource.subset)

<h4 id="DataSource.subset" class="doc_header"><code>subset</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>subset</code>(**`i`**)

Filtered [`DsrcSubset`](/data.source.html#DsrcSubset) `i`

Subset 0 is aliased to the `train` property, and subset 1 is aliased to the `valid` property.

In [None]:
# Same subset that the `valid` alias returns
dsrc.subset(1)

(3 items) [0,6,8]

In [None]:
# Render the generated documentation entry for `DataSource.subsets`
show_doc(DataSource.subsets)

<h4 id="DataSource.subsets" class="doc_header"><code>subsets</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>subsets</code>()

Iterator for all subsets

In [None]:
# Walk over every subset in index order and print its repr
for i,s in enumerate(dsrc.subsets()): print(f"Subset {i}: {s}")

Subset 0: (2 items) [1,2]
Subset 1: (3 items) [0,6,8]


### `DsrcSubset` Methods

In [None]:
#export
@docs
class DsrcSubset():
    "A filtered subset of a `DataSource`"
    def __init__(self, dsrc, filt):
        # `filts` keeps only the item indices belonging to this subset
        self.dsrc,self.filt,self.filts = dsrc,filt,dsrc.filts[filt]

    def __getitem__(self, i):
        # Convert the subset-local position(s) to parent indices, then let the parent fetch them
        parent_idx = self.filts[i]
        return self.dsrc[parent_idx]

    def decode(self, o, **kwargs):
        return self.dsrc.decode(o, filt=self.filt, **kwargs)

    def decode_at(self, i, **kwargs):
        encoded = self[i]
        return self.decode(encoded, **kwargs)

    def show_at(self, i, **kwargs):
        # The parent receives the encoded item together with this subset's index
        encoded = self[i]
        return self.dsrc.show(encoded, filt=self.filt, **kwargs)

    def __len__(self):
        return len(self.filts)

    def __eq__(self, b):
        return all_equal(b, self)

    def __repr__(self):
        return coll_repr(self)

    _docs = {'decode': "Transform decode",
             '__getitem__': "Encoded item(s) at `i`",
             'decode_at': "Decoded item at `i`",
             'show_at': "Show decoded item at `i`"}

In [None]:
# Render the generated documentation entry for `DsrcSubset.__getitem__`
show_doc(DsrcSubset.__getitem__)

<h4 id="DsrcSubset.__getitem__" class="doc_header"><code>__getitem__</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>__getitem__</code>(**`i`**)

Encoded item(s) at `i`

In [None]:
# Second element of subset 1: item 3, doubled by `tfm`
dsrc.valid[1]

6

In [None]:
# Render the generated documentation entry for `DsrcSubset.show_at`
show_doc(DsrcSubset.show_at)

<h4 id="DsrcSubset.show_at" class="doc_header"><code>show_at</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>show_at</code>(**`i`**, **\*\*`kwargs`**)

Show decoded item at `i`

In [None]:
# The encoded value 6 is shown as its decoded value 3, formatted by `_Tfm.shows`
test_eq(dsrc.valid.show_at(1), ' * 3')

In [None]:
# Render the generated documentation entry for `DsrcSubset.decode_at`
show_doc(DsrcSubset.decode_at)

<h4 id="DsrcSubset.decode_at" class="doc_header"><code>decode_at</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>decode_at</code>(**`i`**, **\*\*`kwargs`**)

Decoded item at `i`

In [None]:
# Fetch the encoded item at position 1 (6) and decode it back to the original 3
test_eq(dsrc.valid.decode_at(1), 3)

In [None]:
# Render the generated documentation entry for `DsrcSubset.decode`
show_doc(DsrcSubset.decode)

<h4 id="DsrcSubset.decode" class="doc_header"><code>decode</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/05_data_source.ipynb#DataSource--" class="source_link" style="float:right">[source]</a></h4>

> <code>decode</code>(**`o`**, **\*\*`kwargs`**)

Transform decode

In [None]:
# Same round trip as `decode_at`, but starting from an already-fetched encoded item
t = dsrc.valid[1]
test_eq(dsrc.valid.decode(t), 3)

## Export -

In [None]:
#hide
# Run the notebook-to-module export.
# NOTE(review): `all_fs=True` presumably processes every notebook, not just this one — confirm.
from local.notebook.export import notebook2script
notebook2script(all_fs=True)