In [None]:
# default_exp core
# default_cls_lvl 3

In [None]:
#hide
%load_ext line_profiler

# Corefunctions
> Core functionality for preparing sequential data for PyTorch and fastai models

# Application Structure
The data will be extracted and prepared via transforms. Those are grouped in:
- Type Transforms: These extract the needed components from the source items, like input sequences or target scalar values. They work on single tensors.
- Item Transforms: These transforms may work on tuple level and therefore may process relationships between input and output.
- Batch Transforms: These transforms work on batch level. They receive batched tensors and may apply lazy transforms like normalization very efficiently.

An application example may look like the following:
- sourceitems: 
    - path extraction with hdf5 file endings
    - create pandas dataframe with information for type transforms, like slices
    - filter items in pandas dataframe
- type transforms: 
    - extract hdf5 input and output sequence
    - create windows
- item transforms: 
    - filter sequence by value
    - shift output sequence by 1 element
- batch transforms: 
    - noise injection
    - normalization
    

In [None]:
#export
from fastai2.data.all import *
import h5py

## 1. Extract Source Items
The file paths may be extracted with `get_files` of fastai2. `get_hdf_files` removes the need of writing the hdf5 file extension.

A pandas DataFrame may then be created when further information about the source items needs to be stored, such as the slices for the windowing function.

### 1.1 Extract File Paths

In [None]:
f_path = 'test_data/'
hdf_files = get_files(f_path,extensions='.hdf5',recurse=True)
len(hdf_files),hdf_files[0]

(3, PosixPath('test_data/train/Sim_RealisticCycle2.hdf5'))

In [None]:
#export
hdf_extensions = ['.hdf5']
def get_hdf_files(path,recurse=True, folders=None):
    "Call `get_files` on `path` restricted to the extensions in `hdf_extensions`, forwarding `recurse` and `folders`."
    found = get_files(path, folders=folders, recurse=recurse, extensions=hdf_extensions)
    return found

In [None]:
hdf_files = get_hdf_files(f_path)
len(hdf_files),hdf_files[0]

(3, PosixPath('test_data/train/Sim_RealisticCycle2.hdf5'))

### 1.2 Create Pandas Source Dataframe

In [None]:
#export
def df_source_items(f_list,pd_tfms = None):
    '''Create a pandas DataFrame with a single `path` column out of a collection of items.

    f_list:  the source items. Either an object that exposes them through a non-callable
             `.items` attribute, or any plain iterable such as a list or tuple.
    pd_tfms: optional iterable of callables; each one receives the current DataFrame and
             returns the DataFrame handed to the next one. They are applied in order.

    Returns the DataFrame produced by the last transform (or the untransformed one).
    '''
    items = getattr(f_list,'items',None)
    # a callable `.items` (e.g. `dict.items`, `Series.items`) is a method, not the item
    # collection, so in that case (and when there is no attribute) iterate the object itself
    if items is None or callable(items): items = list(f_list)

    df = pd.DataFrame(data=items,columns=['path'])
    if pd_tfms is not None:
        for t in pd_tfms:
            df = t(df)
    return df

In [None]:
df = df_source_items(hdf_files)
df.head()

Unnamed: 0,path
0,test_data/train/Sim_RealisticCycle2.hdf5
1,test_data/train/Sim_RealisticCycle1.hdf5
2,test_data/valid/Sim_RealisticCycle3.hdf5


In [None]:
#export
def DfHDFCreateWindows(win_sz,stp_sz, clm, fixed_start = False, fixed_end = False):
    '''Create a DataFrame transform that splits every hdf5 file into multiple window items.

    win_sz: number of samples in one window.
    stp_sz: offset in samples between the starts of two consecutive windows.
    clm:    name of the dataset in the hdf5 file whose first dimension defines the sequence length.
    fixed_start, fixed_end: currently only validated (they must not both be set);
            neither flag changes the generated windows yet.

    The returned function takes a DataFrame with a `path` column and returns a new DataFrame in
    which every input row is repeated once per window, extended by the columns `l_slc` and
    `r_slc` (left/right slice index of the window). Repeated rows keep the index label of
    their source row. Files shorter than `win_sz` contribute no rows.

    Raises `ValueError` when the transform is applied with both `fixed_start` and `fixed_end` set.
    '''
    def _inner(df):
        if fixed_start and fixed_end:
            raise ValueError('`fixed_start` and `fixed_end` must not both be set')

        lst_df = [] #new dataframe for every row
        # `iterrows` yields index *labels*; `iloc` needs *positions*, so count the rows explicitly.
        # This keeps the transform correct for filtered or otherwise non-default indexed frames.
        for pos, (_, row) in enumerate(df.iterrows()):
            with h5py.File(row.path,'r') as f:
                #TODO make clm optional
                f_len = f[clm].shape[0]

            n_win = max(((f_len-win_sz)//stp_sz)+1, 0)
            l_slc = np.arange(n_win)*stp_sz

            # an explicit copy makes the repeated rows independent of `df`, so the new columns
            # can be assigned without touching pandas' global chained-assignment option
            tmp_df = df.iloc[[pos]*n_win].copy()
            tmp_df['l_slc'] = l_slc
            tmp_df['r_slc'] = l_slc + win_sz

            lst_df.append(tmp_df)

        if not lst_df:
            # `pd.concat` rejects an empty list; return an empty frame with the same columns instead
            no_slc = np.array([],dtype=int)
            return df.iloc[0:0].assign(l_slc=no_slc, r_slc=no_slc)

        res_df = pd.concat(lst_df)
        return res_df
    
    return _inner

In [None]:
%%time
create_win = DfHDFCreateWindows(win_sz=100,stp_sz=100,clm='current')
create_win(df).head()

CPU times: user 8.69 ms, sys: 1.63 ms, total: 10.3 ms
Wall time: 9.99 ms


In [None]:
# %%time
src_df = df_source_items(hdf_files,[DfHDFCreateWindows(win_sz=100+1,stp_sz=10,clm='current')])
src_df.head()

Unnamed: 0,path,l_slc,r_slc
0,test_data/train/Sim_RealisticCycle2.hdf5,0,101
0,test_data/train/Sim_RealisticCycle2.hdf5,10,111
0,test_data/train/Sim_RealisticCycle2.hdf5,20,121
0,test_data/train/Sim_RealisticCycle2.hdf5,30,131
0,test_data/train/Sim_RealisticCycle2.hdf5,40,141


## 2. Convert Paths to Sequence Objects
The path is converted into sequences and scalar values by specifying the column names, so that in the end a 3-tuple is obtained consisting of:
- (Sequence, Scalar, Sequence) <-> (input,input,output)

### 2.1 Datatypes for Sequences and Scalars

In [None]:
#export
class TensorSequences(TensorBase):
    '''Tensor type that marks sequence data.'''

class TensorSequencesInput(TensorSequences):
    '''Sequence tensor marked as model input.'''

class TensorSequencesOutput(TensorSequences):
    '''Sequence tensor marked as model output.'''

In [None]:
#export
@Transform
def toTensorSequencesInput(o):
    '''Wrap `o` in a `TensorSequencesInput`.'''
    return TensorSequencesInput(o)
@Transform
def toTensorSequencesOutput(o):
    '''Wrap `o` in a `TensorSequencesOutput`.'''
    return TensorSequencesOutput(o)

In [None]:
#export
class TensorScalars(TensorBase):
    '''Tensor type that marks scalar data.'''

class TensorScalarsInput(TensorScalars):
    '''Scalar tensor marked as model input.'''

class TensorScalarsOutput(TensorScalars):
    '''Scalar tensor marked as model output.'''

### 2.2 Extract sequential data from hdf5-files
Two different functions, based on pandas df and on lists

In [None]:
#export
from functools import lru_cache

def HDF2Sequence(c_names,cached=True):
    '''Build a function that reads the datasets `c_names` of an hdf5 file into one tensor.

    c_names: names of the datasets to read; each is sliced, given a trailing axis and
             concatenated along axis 1 (assumes every dataset is 1d - TODO confirm).
    cached:  if True, the loader is wrapped in an unbounded `lru_cache`, the complete data is
             loaded per distinct call arguments and the requested slice is taken from the
             cached tensor. If False, only the requested slice is read from the file.

    The returned function accepts
    - a `pd.Series` with the attribute `path` and optionally `dataset`, `l_slc`, `r_slc`, or
    - any other object, which is converted with `str` and used as the file path.
    '''
    def _load(hdf_path,dataset = None, l_slc = None, r_slc= None):
        with h5py.File(hdf_path,'r') as f:
            # read either from the file root or from the named group
            grp = f[dataset] if dataset is not None else f
            columns = []
            for name in c_names:
                columns.append(grp[name][l_slc:r_slc][:,None])
            return tensor(np.concatenate(columns,axis=1))

    loader = _load
    if cached: loader = lru_cache(maxsize=None)(_load)

    def _extract_df_sequence(item):
        if isinstance(item,pd.Series):
            path = item.path
            dataset = getattr(item,'dataset',None)
            l_slc = getattr(item,'l_slc',None)
            r_slc = getattr(item,'r_slc',None)

            if not cached:
                return loader(path,dataset,l_slc,r_slc)
            # slice after loading so that all windows of one file share a single cache entry
            return loader(path,dataset)[l_slc:r_slc]

        return loader(str(item))

    return _extract_df_sequence

In [None]:
# %%timeit
hdf2seq = HDF2Sequence(['current','voltage'],cached=False)
hdf2seq(hdf_files[0])

tensor([[ 0.0000,  4.1874],
        [-0.0052,  4.1875],
        [-0.0090,  4.1875],
        ...,
        [ 1.0783,  3.7160],
        [ 1.0739,  3.7161],
        [ 1.0706,  3.7162]])

In [None]:
hdf2seq = HDF2Sequence(['current','voltage'],cached=True)

In [None]:
# %%timeit
hdf2seq(hdf_files[0])

tensor([[ 0.0000,  4.1874],
        [-0.0052,  4.1875],
        [-0.0090,  4.1875],
        ...,
        [ 1.0783,  3.7160],
        [ 1.0739,  3.7161],
        [ 1.0706,  3.7162]])

The function can be applied to a list of source items (here: paths) by means of a `Pipeline`.

In [None]:
pipe = Pipeline(HDF2Sequence(['current','voltage']))

In [None]:
res_pipe = pipe(hdf_files)
len(res_pipe), res_pipe[0][0]

(3, tensor([0.0000, 4.1874]))

In [None]:
#export
def hdf2scalars(hdf_path,c_names):
    '''Placeholder: opens `hdf_path` read-only and returns `None`; `c_names` is not used yet.'''
    #TODO: extract the values of `c_names` from the opened file
    with h5py.File(hdf_path,'r'):
        pass
    return None

### Performance Test
Caching stores the loaded arrays for reuse on every later function call. This is very useful, especially for windows. It should always be turned on; only turn it off explicitly when there is not enough memory for your data.

In [None]:
tfms=[  [HDF2Sequence(['current','voltage'],cached=False)],
        [HDF2Sequence(['voltage'],cached=False)]]
dsrc = DataSource(src_df.iloc[:1000],tfms=tfms)

In [None]:
len(dsrc)

1000

In [None]:
# %%time
# for x in dsrc:
#     x

In [None]:
tfms=[  [HDF2Sequence(['current','voltage'],cached=True)],
        [HDF2Sequence(['voltage'],cached=True)]]
dsrc = DataSource(src_df.iloc[:1000],tfms=tfms)

In [None]:
%%time
for x in dsrc:
    x

CPU times: user 701 ms, sys: 7.01 ms, total: 708 ms
Wall time: 708 ms


Caching is much faster because, without it, every file is loaded again for each window taken from it.

### SequenceItem
A class is created so that the sequence can be visualized and additional information can be stored.

In [None]:
#export

#TODO: distinguish between the different kinds of sequences
class SequenceItem(Tuple):
    '''Tuple of sequences that can be displayed with `show`.'''
    def show(self, ctx=None, **kwargs):
        '''Plot the third element of the tuple in a new figure; `ctx` and `kwargs` are ignored.'''
        plt.figure()
        seq = self[2]
        plt.plot(seq)

`SequenceItem` is only responsible for displaying a tuple of sequences. A distinction between scalar and vector-valued targets still has to be made.

In [None]:
#export
class SeqTfm(Transform):
    '''Transform whose decoding step wraps its input in a `SequenceItem`.'''
    def decodes(self, x):
        item = SequenceItem(x)
        return item

`SeqTfm` creates a `SequenceItem` during decoding so that it can be displayed later.

## 3. Item Transformations


### 3.1 Sequence Slicing Transformation

In [None]:
#export 
class SeqSlice(Transform):
    '''Return the slice `[l_slc:r_slc]` of an array-like object, e.g. to shift input against output.'''
    def __init__(self, l_slc=None,r_slc=None):
        self.l_slc = l_slc
        self.r_slc = r_slc

    def encodes(self, o):
        # `None` bounds leave the respective end of the sequence open
        window = slice(self.l_slc, self.r_slc)
        return o[window]

In [None]:
l_shift = SeqSlice(r_slc=-1)
arr = np.ones((5))
test_eq(l_shift(arr),arr[:-1])

### 3.2 Sequence Noise Injection Transformation

In [None]:
#export
class SeqNoiseInjection(Transform):
    '''Add normally distributed noise to a `TensorSequencesInput`, with `mean` and `std` expanded to its shape.'''
    def __init__(self, std=1e-1,mean=0.):
        self.std = tensor(std)
        self.mean = tensor(mean)

    def setups(self, dl:DataLoader):
        # move the noise parameters to the device of the first element of one batch
        #TODO: include scalar type case
        first,*_ = dl.one_batch()
        device = first.device
        self.std = to_device(self.std,device)
        self.mean = to_device(self.mean,device)

    def encodes(self, o:TensorSequencesInput):
        # `expand_as` creates views instead of copies, which keeps this cheap
        noise_mean = self.mean.expand_as(o)
        noise_std = self.std.expand_as(o)
        noise = torch.normal(mean=noise_mean, std=noise_std)
        return o+noise

In [None]:
x = TensorSequencesInput(tensor([[1,1,1],[-1,-1,-1.0]]))
ns_mean = tensor([0.,10.1,3.1])
ns_std = tensor([1.,1.1,0.1])
x,x.shape

(tensor([[ 1.,  1.,  1.],
         [-1., -1., -1.]]), torch.Size([2, 3]))

In [None]:
# per-signal noise: offsets taken from `ns_mean`, spread taken from `ns_std`
seq_noise = SeqNoiseInjection(std=ns_std,mean=ns_mean)
seq_noise(x)

tensor([[ 3.9732,  3.8923,  1.1886],
        [-2.5649, -1.1318, -1.0120]])

In [None]:
seq_noise = SeqNoiseInjection(std=ns_std*10)
seq_noise(x)

tensor([[-16.8420,  -2.0927,   0.6381],
        [  5.0517,   7.7966,  -1.0036]])

### 3.3 Normalization
`Normalize` is programmed for `TensorImage` as an input tensor. It gets patched here so that it also handles `TensorSequencesInput`. At init, the variable `axes` needs to be chosen according to the shape of your tensor.

In [None]:
@Normalize
def encodes(self, x:TensorSequencesInput):
    '''Normalize `x` by subtracting `self.mean` and dividing by `self.std`.'''
    centered = x-self.mean
    return centered / self.std

@Normalize
def decodes(self, x:TensorSequencesInput):
    '''Undo the normalization of `x` by multiplying with `self.std` and adding `self.mean`.'''
    # the statistics are passed through `to_cpu` only when `x` itself is on the cpu
    if x.device.type=='cpu':
        move = to_cpu
    else:
        move = noop
    scaled = x*move(self.std)
    return (scaled + move(self.mean))

In [None]:
norm = Normalize.from_stats(mean=ns_mean,std=ns_std,dim=1,ndim=2,cuda=False)
x,norm(x)

(tensor([[ 1.,  1.,  1.],
         [-1., -1., -1.]]), tensor([[  1.0000,  -8.2727, -21.0000],
         [ -1.0000, -10.0909, -41.0000]]))

## 4. Split in Training, Validation
Splitting can be done based on previously known indices, the file path, or any other general function.

In practice, splitting within a sequence should only be done when just a single sequence is available. That sequence can then be split manually beforehand.


### 4.1 Splitting with Given Indices

In [None]:
splitter = IndexSplitter([1,2])
test_eq(splitter(hdf_files),[[0],[1,2]])

### 4.2 Splitting with a General Function
Items for which the given function returns `True` are assigned to the validation set, the rest to the training set. In this case the name of the parent folder is checked.

In [None]:
splitter = FuncSplitter(lambda o: Path(o).parent.name == 'valid')
test_eq(splitter(hdf_files),[[0,1],[2]])

### 4.3 Splitting by Parent Folder
Splitter that explicitly assigns the training and validation folders to the respective datasets.

In [None]:
#export
def _parent_idxs(items, name):
    "Pass a per-item mask (parent folder name equals `name`) to `mask2idxs`."
    mask = (Path(o).parent.name == name for o in items)
    return mask2idxs(mask)

def ParentSplitter(train_name='train', valid_name='valid'):
    "Create a splitter returning the `(train, valid)` results of `_parent_idxs` for the folder names `train_name` and `valid_name`."
    def _inner(o, **kwargs):
        # extra keyword arguments are accepted but not used
        train_idxs = _parent_idxs(o, train_name)
        valid_idxs = _parent_idxs(o, valid_name)
        return train_idxs,valid_idxs
    return _inner

In [None]:
splitter = ParentSplitter()
test_eq(splitter(hdf_files),[[0,1],[2]])

## 5. Create Datasource
A Datasource combines all implemented components on item level.

In [None]:
tfms=[  [HDF2Sequence(['current','voltage']),SeqSlice(l_slc=1),toTensorSequencesInput],
        [HDF2Sequence(['voltage']),SeqSlice(r_slc=-1),toTensorSequencesOutput]]
dsrc = DataSource(src_df,tfms=tfms,splits=splitter(src_df.path))

In [None]:
db = dsrc.databunch(bs=128,after_batch=[Cuda(),SeqNoiseInjection(std=[1.1,0.01]),Normalize(axes=[0,1])])
db.one_batch()[0].shape

torch.Size([128, 100, 2])

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_model.ipynb.
Converted 10_performance_test.ipynb.
Converted index.ipynb.
