In [None]:
# default_exp core
# default_cls_lvl 3

In [None]:
#hide
%load_ext line_profiler

# Core functions
> Core functionality for preparing sequential data for PyTorch and fastai models

# Application Structure
The data will be extracted and prepared via transforms. Those are grouped in:
- Type Transforms: These extract the needed components from the source items, like input sequences or target scalar values. They work on single tensors.
- Item Transforms: Those Transforms may work on tuple level and therefore may process relationships between input and output.
- Batch Transforms: These transforms work on batch level. They receive batched tensors and may apply lazy transforms like normalization very efficiently.

An application example may look like the following:
- sourceitems: 
    - path extraction with hdf5 file endings
    - create pandas dataframe with information for type transforms, like slices
    - filter items in pandas dataframe
- type transforms: 
    - extract hdf5 input and output sequence
    - create windows
- item transforms: 
    - filter sequence by value
    - shift output sequence by 1 element
- batch transforms: 
    - noise injection
    - normalization
    

In [None]:
#export
from fastai2.data.all import *
import h5py

## 1. Extract Source Items
The file paths may be extracted with `get_files` of fastai2. `get_hdf_files` removes the need of writing the hdf5 file extension.

A pandas dataframe may then be created in case further information about the source items needs to be stored, such as the slices for the windowing function.

### 1.1 Extract File Paths

In [None]:
f_path = 'test_data/'
hdf_files = get_files(f_path,extensions='.hdf5',recurse=True)
len(hdf_files),hdf_files[0]

(3, PosixPath('test_data/train/Sim_RealisticCycle2.hdf5'))

In [None]:
#export
hdf_extensions = ['.hdf5']
def get_hdf_files(path,recurse=True, folders=None):
    "Collect the files below `path` whose extension is listed in `hdf_extensions`; `recurse` and `folders` are forwarded to `get_files`."
    # `hdf_extensions` is looked up at call time, so later changes to the module-level list are honoured
    search_args = dict(extensions=hdf_extensions, recurse=recurse, folders=folders)
    return get_files(path, **search_args)

In [None]:
hdf_files = get_hdf_files(f_path)
len(hdf_files),hdf_files[0]

(3, PosixPath('test_data/train/Sim_RealisticCycle2.hdf5'))

### 1.2 Create Pandas Source Dataframe

In [None]:
#export
def df_source_items(f_list,pd_tfms = None):
    '''Create a pandas DataFrame with a single `path` column from `f_list` and apply `pd_tfms` to it.

    f_list: collection of source items. Either an object that keeps its elements in an `items`
            attribute (this is how the notebook's `get_files` results are used) or any plain
            iterable such as a list or tuple.
    pd_tfms: optional iterable of callables `df -> df`, applied one after another in order.

    Returns the dataframe produced by the last transform, or the untransformed dataframe
    if `pd_tfms` is None.
    '''
    # Only use `.items` when it is a data attribute. On dicts and pandas objects `items` is a method,
    # and plain lists do not have it at all - in both cases iterate over `f_list` itself.
    items = getattr(f_list, 'items', f_list)
    if callable(items): items = f_list

    df = pd.DataFrame(data=list(items),columns=['path'])
    if pd_tfms is not None:
        for t in pd_tfms:
            df = t(df)
    return df

In [None]:
df = df_source_items(hdf_files)
df.head()

Unnamed: 0,path
0,test_data/train/Sim_RealisticCycle2.hdf5
1,test_data/train/Sim_RealisticCycle1.hdf5
2,test_data/valid/Sim_RealisticCycle3.hdf5


In [None]:
#export
def DfHDFCreateWindows(win_sz,stp_sz, clm, fixed_start = False, fixed_end = False):
    '''Build a dataframe transform that splits every file into sliding windows.

    The returned function expects a dataframe with a `path` column of hdf5 files. Every row is
    repeated once per window and gets two extra columns, `l_slc` and `r_slc`, so that
    `data[l_slc:r_slc]` selects the window. The original index labels are kept on the repeated rows.

    win_sz: number of elements per window.
    stp_sz: distance between the start of two consecutive windows.
    clm: name of the dataset whose first dimension defines the sequence length.
    fixed_start, fixed_end: not used yet, apart from being mutually exclusive.

    Elements at the end of a file that do not fill a whole window are dropped; a file shorter
    than `win_sz` contributes no rows. Raises ValueError if both `fixed_start` and `fixed_end` are set.
    '''
    def _inner(df):
        if fixed_start and fixed_end:
            raise ValueError('`fixed_start` and `fixed_end` cannot both be set')

        lst_df = [] #one dataframe of windows per source row
        #TODO make clm optional
        #enumerate yields the row *position*; index labels stop being positions once `df` was filtered or concatenated
        for pos, path in enumerate(df['path']):
            with h5py.File(path,'r') as f:
                f_len = f[clm].shape[0]

            n_win = max(((f_len-win_sz)//stp_sz)+1, 0)
            l_slc = np.arange(n_win)*stp_sz

            #copy the repeated rows so the new columns are written to an independent frame,
            #which makes toggling the global pandas chained-assignment option unnecessary
            tmp_df = df.iloc[[pos]*n_win].copy()
            tmp_df['l_slc'] = l_slc
            tmp_df['r_slc'] = l_slc + win_sz
            lst_df.append(tmp_df)

        if not lst_df:
            #pd.concat raises on an empty list, so return an empty frame with the expected columns
            return df.iloc[:0].assign(l_slc=np.arange(0), r_slc=np.arange(0))
        return pd.concat(lst_df)

    return _inner

In [None]:
%%time
create_win = DfHDFCreateWindows(win_sz=100,stp_sz=100,clm='current')
create_win(df).head()

CPU times: user 7.37 ms, sys: 5.47 ms, total: 12.8 ms
Wall time: 11.2 ms


In [None]:
# %%time
src_df = df_source_items(hdf_files,[DfHDFCreateWindows(win_sz=100,stp_sz=10,clm='current')])
src_df.head()

Unnamed: 0,path,l_slc,r_slc
0,test_data/train/Sim_RealisticCycle2.hdf5,0,100
0,test_data/train/Sim_RealisticCycle2.hdf5,10,110
0,test_data/train/Sim_RealisticCycle2.hdf5,20,120
0,test_data/train/Sim_RealisticCycle2.hdf5,30,130
0,test_data/train/Sim_RealisticCycle2.hdf5,40,140


## 2. Convert Paths to Sequence Objects
Der Pfad wird unter Angabe der Spaltennamen in Sequenzen und Skalare Werte umgewandelt, um so am Ende ein 3-Tupel zu erhalten aus:
- (Sequence, Scalar, Sequence) <-> (input,input,output)

### 2.1 Datatypes for Sequences and Scalars

In [None]:
#export
class TensorSequences(TensorBase):
    "Marker subclass of `TensorBase` for sequence data; adds no behaviour of its own."

In [None]:
#export
class TensorSequencesInput(TensorSequences):
    "Marker subclass of `TensorSequences` for input sequences; adds no behaviour of its own."

In [None]:
#export
class TensorSequencesOutput(TensorSequences):
    "Marker subclass of `TensorSequences` for output sequences; adds no behaviour of its own."

In [None]:
#export
class TensorScalars(TensorBase):
    "Marker subclass of `TensorBase` for scalar data; adds no behaviour of its own."

In [None]:
#export
class TensorScalarsInput(TensorScalars):
    "Marker subclass of `TensorScalars` for input scalars; adds no behaviour of its own."

In [None]:
#export
class TensorScalarsOutput(TensorScalars):
    "Marker subclass of `TensorScalars` for output scalars; adds no behaviour of its own."

### 2.2 Extract sequential data from hdf5-files
Two different functions, based on pandas df and on lists

In [None]:
#export
from functools import lru_cache

def HDF2Sequence(c_names,cached=True):
    '''Build a function that reads the datasets `c_names` from an hdf5 file into one 2d array.

    c_names: names of the datasets to read; each becomes one column of the result, in this order.
    cached: if True, the complete array of every (path, dataset) pair is kept in an unbounded
            `lru_cache` and windows are cut from the cached array. If False, only the requested
            slice is read from the file on every call.

    The returned function accepts either
    - a `pd.Series` with a `path` entry and the optional entries `dataset` (group inside the file),
      `l_slc` and `r_slc` (window boundaries); missing entries default to None, or
    - anything else, which is converted with `str()` and read completely.

    With `cached=True` repeated calls return the same cached array or a slice of it,
    so the result must not be modified in place.
    '''
    def _extract_sequence(hdf_path,dataset = None, l_slc = None, r_slc= None):
        with h5py.File(hdf_path,'r') as f:
            ds = f if dataset is None else f[dataset]
            #every dataset is turned into a column vector, then all columns are joined
            l_array = [ds[n][l_slc:r_slc][:,None] for n in c_names]
            return np.concatenate(l_array,axis=1)

    _exseq = lru_cache(maxsize=None)(_extract_sequence) if cached else _extract_sequence

    def _extract_df_sequence(item):
        if not isinstance(item,pd.Series):
            return _exseq(str(item))

        #the optional entries have to be looked up on `item` itself, not on some global dataframe
        path = str(item['path']) #same cache key as for plain path items
        dataset = item.get('dataset')
        l_slc = item.get('l_slc')
        r_slc = item.get('r_slc')

        if cached:
            #cache the whole file once and slice in memory, so that all windows of a file share one entry
            return _exseq(path,dataset)[l_slc:r_slc]
        return _exseq(path,dataset,l_slc,r_slc)

    return _extract_df_sequence

In [None]:
%%timeit
hdf2seq = HDF2Sequence(['current','voltage'],cached=False)
hdf2seq(hdf_files[0])

21.5 ms ± 53.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
hdf2seq = HDF2Sequence(['current','voltage'],cached=True)

In [None]:
%%timeit
hdf2seq(hdf_files[0])

1.62 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


Die Funktion lässt sich mittels Pipeline auf eine Liste von Quellobjekten (hier Pfade) anwenden 

In [None]:
pipe = Pipeline(HDF2Sequence(['current','voltage']))

In [None]:
res_pipe = pipe(hdf_files)
len(res_pipe), res_pipe[0][0]

(3, array([0.       , 4.1873503], dtype=float32))

In [None]:
#export
def hdf2scalars(hdf_path,c_names):
    '''Placeholder for the extraction of scalar values from an hdf5 file.

    Opens `hdf_path` read-only and always returns None; nothing is read from the file
    and `c_names` is not used yet.
    '''
    with h5py.File(hdf_path,'r') as f:
        # TODO: not implemented - decide how scalar values are stored in the file and read them here
        return None

### Performance Test
Caching stores the extracted arrays at every function call for future use. This is very useful, especially for windows. It should always be turned on; only turn it off explicitly when there is not enough memory for your data.

In [None]:
tfms=[  [HDF2Sequence(['current','voltage'],cached=False)],
        [HDF2Sequence(['voltage'],cached=False)]]
dsrc = DataSource(src_df.iloc[:1000],tfms=tfms)

In [None]:
len(dsrc)

1000

In [None]:
%%time
for x in dsrc:
    x

CPU times: user 34.2 s, sys: 1.69 s, total: 35.9 s
Wall time: 36 s


In [None]:
tfms=[  [HDF2Sequence(['current','voltage'],cached=True)],
        [HDF2Sequence(['voltage'],cached=True)]]
dsrc = DataSource(src_df.iloc[:1000],tfms=tfms)

In [None]:
%%time
for x in dsrc:
    x

CPU times: user 586 ms, sys: 3.92 ms, total: 590 ms
Wall time: 588 ms


Caching is way faster because every file gets loaded multiple times

### 2.3 Extract tensor tuples from hdf5-files
Depending on the application, scalars are needed either as an additional input or as the label, so different tuple layouts are required.

In [None]:
#export
class Hdf2SeqSeq(Transform):
    '''Turn an hdf5 source item into an (input sequence, output sequence) tuple.

    seq_inp: dataset names that form the input sequence.
    seq_out: dataset names that form the output sequence.
    '''
    def __init__(self, seq_inp, seq_out): 
        self.seq_inp,self.seq_out = seq_inp,seq_out
        #the module defines no `hdf2sequence`; the readers are built once from `HDF2Sequence` so they are reused across calls
        self._read_inp,self._read_out = HDF2Sequence(seq_inp),HDF2Sequence(seq_out)
    def encodes(self, o): return (TensorSequencesInput(self._read_inp(o)),
                                  TensorSequencesOutput(self._read_out(o)))
    def decodes(self, x): return SequenceItem(x)

class Hdf2SeqScal(Transform):
    '''Turn an hdf5 source item into an (input sequence, output scalars) tuple.

    seq_inp: dataset names that form the input sequence.
    scal_out: names passed to `hdf2scalars` for the output scalars.
    '''
    def __init__(self, seq_inp, scal_out): 
        self.seq_inp,self.scal_out = seq_inp,scal_out
        #the module defines no `hdf2sequence`; the reader is built once from `HDF2Sequence` so it is reused across calls
        self._read_inp = HDF2Sequence(seq_inp)
    def encodes(self, o): return (TensorSequencesInput(self._read_inp(o)),
                                  TensorScalarsOutput(hdf2scalars(o,self.scal_out)))
    def decodes(self, x): return SequenceItem(x)
class Hdf2SeqScalSeq(Transform):
    '''Turn an hdf5 source item into an (input sequence, input scalars, output sequence) tuple.

    seq_inp: dataset names that form the input sequence.
    scal_inp: names passed to `hdf2scalars` for the input scalars.
    seq_out: dataset names that form the output sequence.
    '''
    def __init__(self, seq_inp,scal_inp, seq_out): 
        self.seq_inp,self.scal_inp,self.seq_out = seq_inp,scal_inp,seq_out
        #the module defines no `hdf2sequence`; the readers are built once from `HDF2Sequence` so they are reused across calls
        self._read_inp,self._read_out = HDF2Sequence(seq_inp),HDF2Sequence(seq_out)
    def encodes(self, o): return (TensorSequencesInput(self._read_inp(o)),
                                  TensorScalarsInput(hdf2scalars(o,self.scal_inp)),
                                  TensorSequencesOutput(self._read_out(o)))
    def decodes(self, x): return SequenceItem(x)

class Hdf2SeqScalScal(Transform):
    '''Turn an hdf5 source item into an (input sequence, input scalars, output scalars) tuple.

    seq_inp: dataset names that form the input sequence.
    scal_inp: names passed to `hdf2scalars` for the input scalars.
    scal_out: names passed to `hdf2scalars` for the output scalars.
    '''
    def __init__(self, seq_inp,scal_inp, scal_out): 
        self.seq_inp,self.scal_inp,self.scal_out = seq_inp,scal_inp,scal_out
        #the module defines no `hdf2sequence`; the reader is built once from `HDF2Sequence` so it is reused across calls
        self._read_inp = HDF2Sequence(seq_inp)
    def encodes(self, o): return (TensorSequencesInput(self._read_inp(o)),
                                  TensorScalarsInput(hdf2scalars(o,self.scal_inp)),
                                  TensorScalarsOutput(hdf2scalars(o,self.scal_out)))
    def decodes(self, x): return SequenceItem(x)

In [None]:
hdf2seq = Pipeline(Hdf2SeqSeq(['current','voltage'],['voltage']))

items = hdf2seq(hdf_files)
len(items),items[0][0].shape

(3, torch.Size([265598, 2]))

### SequenceItem
Damit die Sequenz visualisiert werden kann und auch dritte Informationen gespeichert werden können, wird eine Klasse erstellt 

In [None]:
#export

#TODO: Fallunterscheidung der Sequenzen
class SequenceItem(Tuple):
    '''Tuple of tensors that can plot itself.

    `show` opens a new matplotlib figure and plots one element of the tuple: the third one if
    there are at least three elements, otherwise the last one. `ctx` and `kwargs` are accepted
    but not used.
    '''
    def show(self, ctx=None, **kwargs): 
        #two-element items have no index 2, so fall back to their last element
        shown = self[2] if len(self) > 2 else self[-1]
        plt.figure()
        plt.plot(shown)

SequenceItem ist nur für die Darstellung eines Tupels von Sequenzen zuständig. Es muss zwischen Skalaren und Vektoriellen Zielgrößen unterschieden werden.

In [None]:
#export
class SeqTfm(Transform):
    "Transform without encoding step whose `decodes` wraps its argument in a `SequenceItem`."
    def decodes(self, x):
        return SequenceItem(x)

SeqTfm erstellt beim Decoding ein SequenceItem für die spätere Darstellung.

## 3. Item Transformations


In [None]:
#export 
class SeqSlice(Transform):
    '''Cut `o[l_slc:r_slc]` out of a sliceable object, e.g. to shift input and output against each other.'''
    def __init__(self, l_slc=None,r_slc=None):
        self.l_slc = l_slc
        self.r_slc = r_slc

    def encodes(self, o):
        #the bounds are read on every call, so changing the attributes later takes effect
        window = slice(self.l_slc, self.r_slc)
        return o[window]

In [None]:
l_shift = SeqSlice(r_slc=-1)
arr = np.ones((5))
test_eq(l_shift(arr),arr[:-1])

## 3. Split in Training, Validation
Splitting kann anhand von vorher bekannten Indizes, dem Dateipfad oder anderen allgemeinen Funktion durchgeführt werden.

Splitting innerhalb einer Sequenz sollte in der Praxis nur dann geschehen, wenn nur eine einzige Sequenz vorhanden ist. Diese kann dann vorher manuell geteilt werden.


### 3.1 Splitting mit vorgegebenem Index

In [None]:
splitter = IndexSplitter([1,2])
test_eq(splitter(hdf_files),[[0],[1,2]])

### 3.2 Splitting mit allgemeiner Funktion
Items, bei denen die definierte Funktion `True` zurückgibt, werden dem Validierungsdatensatz zugeordnet, der Rest dem Training. In diesem Fall wird nach dem Namen des übergeordneten Ordners gesucht.

In [None]:
splitter = FuncSplitter(lambda o: Path(o).parent.name == 'valid')
test_eq(splitter(hdf_files),[[0,1],[2]])

### 3.3 Splitting anhand des Parent-Folders
Splitter, der Explizit Training und Validierungsordner den Datensätzen zuordnet

In [None]:
#export
def _parent_idxs(items, name):
    "Pass `mask2idxs` one boolean per item: whether the item's direct parent folder is called `name`."
    in_folder = (Path(item).parent.name == name for item in items)
    return mask2idxs(in_folder)

def ParentSplitter(train_name='train', valid_name='valid'):
    "Create a splitter that assigns items to training or validation by the name of their parent folder (`train_name`, `valid_name`)."
    def _inner(o, **kwargs):
        #extra keyword arguments are accepted but ignored
        train_idxs = _parent_idxs(o, train_name)
        valid_idxs = _parent_idxs(o, valid_name)
        return train_idxs,valid_idxs
    return _inner

In [None]:
splitter = ParentSplitter()
test_eq(splitter(hdf_files),[[0,1],[2]])

## 4. Create Datasource

In [None]:
tfms=[  [HDF2Sequence(['current','voltage']),SeqSlice(l_slc=1)],
        [HDF2Sequence(['voltage']),SeqSlice(r_slc=-1)]]
dsrc = DataSource(src_df,tfms=tfms,splits=splitter(src_df.path))

In [None]:
# TODO: pass the batch transforms as `after_batch=[...]`; the dangling keyword was a syntax error
dsrc.databunch()

In [None]:
dsrc[0]

(array([[ 0.       ,  4.1873503],
        [-0.0052   ,  4.187454 ],
        [-0.009    ,  4.187548 ],
        ...,
        [ 1.0783   ,  3.7160358],
        [ 1.0739   ,  3.716139 ],
        [ 1.0706   ,  3.7162225]], dtype=float32), array([[4.1873503],
        [4.187454 ],
        [4.187548 ],
        ...,
        [3.7160358],
        [3.716139 ],
        [3.7162225]], dtype=float32))

In [None]:
len(dsrc.train),len(dsrc.valid)

(53101, 26550)

In [None]:
def func(a,b):
    "Return `3*a + b`."
    return 3*a+b

In [None]:
%lprun -f dsrc.tfms[1][0].init_enc dsrc[0]

In [None]:
%%time
dsrc[0]

CPU times: user 1.74 ms, sys: 0 ns, total: 1.74 ms
Wall time: 1.76 ms


(array([[ 0.       ,  4.1873503],
        [-0.0052   ,  4.187454 ],
        [-0.009    ,  4.187548 ],
        ...,
        [ 1.0783   ,  3.7160358],
        [ 1.0739   ,  3.716139 ],
        [ 1.0706   ,  3.7162225]], dtype=float32), array([[4.1873503],
        [4.187454 ],
        [4.187548 ],
        ...,
        [3.7160358],
        [3.716139 ],
        [3.7162225]], dtype=float32))

In [None]:
# %%time
# ls = list(dsrc)

KeyboardInterrupt: 

## 5. Normalization

In [None]:
#export
class SequenceNorm(Transform):
    '''Normalize with a fixed offset `m` and scale `s`: encode is `(o-m)/s`, decode is `o*s+m`.

    m, s: keyword-only statistics. They may also be assigned to the instance after construction.
    All other arguments are forwarded unchanged to the base class constructor.

    Raises AttributeError when the transform is applied before both `m` and `s` are set.
    '''
    def __init__(self, *args, m=None, s=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.m,self.s = m,s

    def _stats(self):
        "Return `(m, s)` or fail with a message that tells how to provide them."
        if self.m is None or self.s is None:
            raise AttributeError("SequenceNorm needs `m` and `s`: pass them as SequenceNorm(m=..., s=...) "
                                 "or assign them before the transform is used")
        return self.m,self.s

    def encodes(self, o): 
        m,s = self._stats()
        return (o-m)/s
    def decodes(self, o):
        m,s = self._stats()
        return (o*s)+m
    #TODO: estimate `m` and `s` from the training items in `setups`

In [None]:
# `hdf2sequence` and `split_idxs` are not defined in this notebook: read the files with
# `HDF2Sequence` (uncached, i.e. from disk on every access) and split with the `splitter` defined above
tfms=[  [HDF2Sequence(['current','voltage'],cached=False)],
        [HDF2Sequence(['voltage'],cached=False)]]
dsrc = DataSource(hdf_files,tfms=tfms,splits=splitter(hdf_files))

In [None]:
dsrc.train

(#2) [(array([[ 0.       ,  4.1873503],
       [-0.0052   ,  4.187454 ],
       [-0.009    ,  4.187548 ],
       ...,
       [ 1.0783   ,  3.7160358],
       [ 1.0739   ,  3.716139 ],
       [ 1.0706   ,  3.7162225]], dtype=float32), array([[4.1873503],
       [4.187454 ],
       [4.187548 ],
       ...,
       [3.7160358],
       [3.716139 ],
       [3.7162225]], dtype=float32)),(array([[ 0.       ,  4.1873503],
       [-0.1      ,  4.18935  ],
       [-0.1      ,  4.1896954],
       ...,
       [ 8.8388   ,  3.3932123],
       [ 8.846    ,  3.3928714],
       [ 8.8531   ,  3.3925302]], dtype=float32), array([[4.1873503],
       [4.18935  ],
       [4.1896954],
       ...,
       [3.3932123],
       [3.3928714],
       [3.3925302]], dtype=float32))]

In [None]:
lsrc = list(dsrc.train)
test_eq(dsrc.train,lsrc)

In [None]:
len(lsrc)

2

In [None]:
%%timeit
dsrc[0]

35.4 ms ± 162 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
lsrc[0]

38.7 ns ± 0.13 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [None]:
ImageBlock??

Object `ImageBlock` not found.


In [None]:
db=dsrc.databunch(after_item=[SequenceNorm()], bs=1, num_workers=0)

In [None]:
db.one_batch()

AttributeError: 'SequenceNorm' object has no attribute 'm'

In [None]:
#hide
from nbdev.export import *
notebook2script()