In [None]:
# default_exp dataloaders
# default_cls_lvl 3

In [None]:
#hide
%matplotlib widget
from fastai.callback.progress import *
from fastai.callback.tracker import *
from fastai.callback.schedule import *

In [None]:
#export
from seqdata.core import *
from seqdata.models.core import *
from seqdata.learner import *
from fastai.basics import *

import math

## Custom Dataloaders
> Pytorch Modules for Training Models for sequential data

# Truncated Backpropagation Through Time

The tbptt dataloader splits every minibatch it creates into several smaller minibatches (windows). These windows are returned sequentially, in order, before the windows of the next minibatch are returned.

In [None]:
#export
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)

@delegates()
class TbpttDl(TfmdDL):
    '''`TfmdDL` for truncated backpropagation through time (tbptt).

    Every minibatch is sliced along dimension 1 into `n_sub_seq` consecutive windows of
    `sub_seq_len` entries. The windows of one minibatch are yielded back to back before
    the windows of the next minibatch. `rnn_reset` is set to True right before the first
    window of a minibatch is yielded and to False before every other window.
    '''

    def __init__(self, dataset, sub_seq_len=None, seq_len = None ,shuffle=True,num_workers=2, **kwargs):
        '''Store the window configuration and forward the remaining arguments to the parent class.

        sub_seq_len: window length used for slicing; None disables the slicing
        seq_len: length of the unsliced sequences; if None it is read from item 0 on first use of `n_sub_seq`
        '''
#         assert sub_seq_len is not None
        store_attr('sub_seq_len,seq_len')
        self.rnn_reset = False
        super().__init__(dataset=dataset, shuffle=shuffle, num_workers=num_workers, **kwargs)

    @property
    def n_sub_seq(self):
        '''Number of windows per minibatch: ceil(seq_len / sub_seq_len), or 1 if `sub_seq_len` is None'''
        if self.sub_seq_len is None: return 1
        # lazily read the length from shape[0] of the first element of item 0 and cache it in `seq_len`
        if self.seq_len is None: self.seq_len = self.do_item(0)[0].shape[0]
        return math.ceil(self.seq_len / self.sub_seq_len)

    def __len__(self):
        '''Number of windows per epoch: the parent's batch count times `n_sub_seq`'''
        return super().__len__() * self.n_sub_seq

    def _next_worker(self,w_id):
        '''Return the worker id that follows `w_id`, wrapping around to 0 after the last worker'''
        w_id += 1
        if w_id > self.fake_l.num_workers-1: w_id = 0
        return w_id

    def sample(self):
        '''Yield the indices of `get_idxs()` whose batch number modulo `num_workers` equals `offs`'''
        # the newer fastai formulation below reads `__idxs` stored in the main process,
        # which led to an attribute error here, so the indices are taken from get_idxs() instead
        #return (b for i,b in enumerate(self.__idxs) if i//(self.bs or 1)%self.num_workers==self.offs)
        return (b for i,b in enumerate(self.get_idxs()) if i//(self.bs or 1)%self.num_workers==self.offs)

    def __iter__(self):
        '''Iterate over the windows, keeping all windows of one minibatch together.

        With worker processes the windows of different workers arrive interleaved. Windows of
        workers other than the active one are cached in a per-worker queue and yielded once
        that worker becomes active. The active worker advances after every `n_sub_seq`
        yielded windows.
        '''
        self.randomize()
        self.before_iter()

        # NOTE(review): `n_buffer` is never read below. Evaluating it does call `n_sub_seq`,
        # which may cache `seq_len` - confirm whether that side effect is relied on before removing it.
        n_buffer = self.fake_l.num_workers*self.n_sub_seq
        queue = {n:[] for n in range(self.fake_l.num_workers)} # cached windows per worker id
        current_worker = None # worker whose windows are currently passed through
        idx = 0 # number of windows yielded so far
        # indexing with the bool picks the single process iterator if num_workers==0, else the multiprocessing one
        for loaded_b,w_id in _loaders[self.fake_l.num_workers==0](self.fake_l):

            if w_id is None:
                # no worker id attached: the windows already arrive in order
                # NOTE(review): this assignment is overwritten two lines below
                self.rnn_reset=True
                b= loaded_b
                self.rnn_reset = (idx % self.n_sub_seq) == 0 # True for the first window of a minibatch
                yield self.after_batch(b if self.device is None else to_device(b, self.device))
                idx += 1 #idx increments after every yield, not every loop
            else:
                # the first window received defines the initially active worker
                if current_worker is None:
                    current_worker = w_id

                #retrieve queued elements from worker
                while len(queue[current_worker]) > 0:
                    b = queue[current_worker].pop(0)
                    self.rnn_reset = (idx % self.n_sub_seq) == 0
                    yield self.after_batch(b if self.device is None else to_device(b, self.device))
                    idx += 1
                    if (idx % self.n_sub_seq) == 0:
                        current_worker = self._next_worker(current_worker) #next worker, stay in loop for the queue


                #retrieve fresh elements from worker
                if w_id != current_worker: #not active worker
                    # cache the window until its worker becomes active
                    queue[w_id] += [loaded_b]
                    continue
                else:#active worker
                    b = loaded_b
                    self.rnn_reset = (idx % self.n_sub_seq) == 0
                    yield self.after_batch(b if self.device is None else to_device(b, self.device))
                    idx += 1 #idx increments after every yield, not every loop
                    if (idx % self.n_sub_seq) == 0:
                        current_worker = self._next_worker(current_worker)

        self.after_iter()
        # remove the `it` attribute if one was left on the loader
        if hasattr(self, 'it'): del(self.it)

    def create_batches(self, samps):
        '''Create the parent's batches from `samps` and yield them window by window'''
        yield from self._tbptt_generator(super().create_batches(samps))

    def _tbptt_generator(self,batch_iter):
        '''generator function that splits batches in smaller windows, yields mini_batch and worker id

        The worker id is None when no worker info is available, i.e. outside of a worker process.
        '''
        for b in batch_iter:
            for i in range(self.n_sub_seq):
                #it is important to retain the tuple type, or future transforms may not work
                if self.sub_seq_len is None:
                    trunc_b = b
                else:
                    # slice window i along dimension 1 of every batch element and keep the element's type
                    trunc_b = tuple([retain_type(x[:,i*self.sub_seq_len:(i+1)*self.sub_seq_len],x) for x in b])
                yield trunc_b, (None if torch.utils.data.get_worker_info() is None else torch.utils.data.get_worker_info().id)
        

In [None]:
# demo data: the files in 'test_data/' are windowed with win_sz=1001 and stp_sz=1000
tfm_lst = [DfHDFCreateWindows(win_sz=1000+1,stp_sz=1000,clm='current')]
seq = DataBlock(blocks=(SequenceBlock.from_hdf(['current','voltage'],TensorSequencesInput,clm_shift=[-1,-1]),
                        SequenceBlock.from_hdf(['voltage'],TensorSequencesOutput,clm_shift=[1])),
                 get_items=CreateDict(tfm_lst),
                 splitter=ApplyToDict(ParentSplitter()))
# TbpttDl with windows of 10 entries; 6 workers exercise the multi-worker reordering of `__iter__`
db = seq.dataloaders(get_hdf_files('test_data/'),dl_type=TbpttDl,sub_seq_len=10,num_workers=6)

In [None]:
# for every window of the train loader keep entry [0,:,0] of the last batch element
# (assumes a (batch, time, channel) layout with the target as last element - TODO confirm)
l = [array(x[-1][0,:,0].cpu()) for x in db.train]


In [None]:
# plot the collected windows back to back as a visual check of the window order
plt.figure()
plt.plot(np.concatenate(l))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f8482b4d0d0>]

num_workers has to be 0. If there are parallel workers, the order of minibatches will be corrupted

## TBPTT_Reset_Callback
The stateful model needs to reset its hidden state when a new sequence begins. The callback reads the reset flag of the dataloader and resets the model accordingly.

In [None]:
#export
def reset_model_state(model):
    "Call `reset_state()` on every module of `model` that provides such a method"
    resettable = (module for module in model.modules() if hasattr(module, 'reset_state'))
    for module in resettable:
        module.reset_state()

In [None]:
#export
class TbpttResetCB(Callback):
    "`Callback` resets the rnn model with every new sequence for tbptt, calls `reset_state` in every module of the model"

    def before_batch(self):
        "Reset the model state if the active dataloader flags a new sequence or has no `rnn_reset` flag at all"
        # read the flag from the loader that is actually being iterated, so that a custom
        # loader passed to `validate`/`get_preds` is honoured; fall back to the
        # train/valid loaders if the learner does not expose one
        dl = getattr(self.learn, 'dl', None)
        if dl is None: dl = self.learn.dls.train if self.training else self.learn.dls.valid
        # loaders without the flag are treated as "always reset"
        if getattr(dl, 'rnn_reset', True):
            reset_model_state(self.learn.model)

    def after_fit(self):
        "Reset the model state once fitting has finished"
        reset_model_state(self.learn.model)

## Example

In [None]:
# NOTE(review): the learner is created with stateful=False although the reset callback is
# described for stateful models - confirm that this is intended for the example
lrn = RNNLearner(db,num_layers=1,rnn_type='gru',stateful=False,metrics=[SkipNLoss(fun_rmse,100)])
lrn.add_cb(TbpttResetCB())

<fastai.learner.Learner at 0x7f8482b42b90>

In [None]:
lrn.fit_one_cycle(1,lr_max=3e-2)

epoch,train_loss,valid_loss,fun_rmse,time
0,1.7e-05,7.1e-05,0.006916,00:15


In [None]:
# set the window length of the train loader before the second fit
# NOTE(review): the loaders were already created with sub_seq_len=10 - confirm which value is meant here
db.train.sub_seq_len = 10

In [None]:
lrn.fit_one_cycle(1,lr_max=3e-2)

epoch,train_loss,valid_loss,fun_rmse,time
0,0.000148,0.011639,0.105891,00:15


# Weighted Sampling Dataloader

A weighted sampling dataloader for non-uniformly distributed data. A factory method receives the base Dataloader class and returns the inherited weighted sampling dataloader class.

In [None]:
#export
def WeightedDL_Factory(cls):
    '''
    Weighted Dataloader that provides control over sampling probabilities.
    wgts: probability array with probability for every item
            gets extracted from the pandas 'p_sample' column if given. 
            Otherwise uniform sampling will be enabled

    cls: `TfmdDL` (sub)class to inherit from
    returns: the `WeightedDL` class derived from `cls`
    '''
    assert issubclass(cls, TfmdDL)

    class WeightedDL(cls):
        def __init__(self, dataset, wgts=None, **kwargs):
            # self.items needs to be assigned by super().__init__ before 'p_sample' can be
            # looked up, but super().__init__ needs `wgts` to exist already
            self.wgts = None
            super().__init__(dataset=dataset, **kwargs)
            if wgts is not None:
                wgts = np.asarray(wgts, dtype=float)
            elif (isinstance(self.items, list) and len(self.items) > 0 and
                  isinstance(self.items[0], dict) and 'p_sample' in self.items[0]):
                wgts = np.array([x['p_sample'] for x in self.items], dtype=float)
            else:
                print('No wgts provided for WeightedDL. Was that intentional?')
                return
            # normalize so that the weights form a probability vector
            self.wgts = wgts/wgts.sum()

        def get_idxs(self):
            "Draw one index per item with probability `wgts`; falls back to the parent without shuffling or weights"
            if self.n==0: return []
            if not self.shuffle or self.wgts is None: return super().get_idxs()
            # draw `n` indices like the unweighted loader does. Sizing the draw by len(self)
            # is wrong for base classes whose __len__ counts more than one batch per drawn
            # batch (e.g. TbpttDl counts every window) and fails for bs=None.
            return list(np.random.choice(self.n, size=self.n, p=self.wgts))
    return WeightedDL

In [None]:
# weighted loader over ten items where every first item of a pair gets twice the weight of the second
dl = WeightedDL_Factory(TfmdDL)([1,2]*5,bs=10,wgts=[2,1]*5)

In [None]:
dl.wgts

array([0.13333333, 0.06666667, 0.13333333, 0.06666667, 0.13333333,
       0.06666667, 0.13333333, 0.06666667, 0.13333333, 0.06666667])

In [None]:
dl.one_batch()

tensor([1, 2, 1, 2, 1, 2, 1, 2, 1, 2])

## ItemLst Transform for weight calculation

In [None]:
#export
def uniform_p_of_category(cat_name):  
    '''Scales sampling weights for an even distribution between every category

    cat_name: name of the column that holds the category of every row
    returns: function that maps a DataFrame to a new DataFrame with a normalized 'p_sample' column.
             An existing 'p_sample' column is multiplied row by row with the new weights.
             Rows keep their order and index, rows without a category are dropped and
             the passed DataFrame is not modified.
    '''
    def _inner(df):
        # integer code per row, -1 marks a missing category
        codes, _ = pd.factorize(df[cat_name])
        has_cat = codes >= 0
        counts = np.bincount(codes[has_cat])

        # every row is weighted by the inverse size of its category
        p_sample = np.full(len(df), np.nan)
        p_sample[has_cat] = 1.0/counts[codes[has_cat]]

        # combine with the previous weights row by row, both arrays are in the row order of `df`
        if 'p_sample' in df:
            p_sample = p_sample*df['p_sample'].to_numpy(dtype='f8')

        p_sample = p_sample[has_cat]
        p_sample = p_sample/np.nansum(p_sample)

        df_res = df.drop(columns='p_sample', errors='ignore')[has_cat]
        return df_res.assign(p_sample=p_sample)

    return _inner

In [None]:
#export
def uniform_p_of_float(var_name,bins = 10):
    '''Scales sampling weights for an even distribution of the continuous variable by creating equally sized bins

    var_name: name of the column that holds the continuous variable
    bins: bin specification that is passed to `pd.cut`
    returns: function that maps a DataFrame to a new DataFrame with a normalized 'p_sample' column.
             An existing 'p_sample' column is multiplied row by row with the new weights.
             Rows keep their order and index, rows that fall into no bin are dropped and
             the passed DataFrame is not modified (no helper column is left behind).
    '''
    def _inner(df):
        # bin number per row, -1 marks rows that fall into no bin (e.g. NaN)
        codes = pd.cut(df[var_name], bins).cat.codes.to_numpy()
        in_bin = codes >= 0
        counts = np.bincount(codes[in_bin])

        # every row is weighted by the inverse population of its bin
        p_sample = np.full(len(df), np.nan)
        p_sample[in_bin] = 1.0/counts[codes[in_bin]]

        # combine with the previous weights row by row, both arrays are in the row order of `df`
        if 'p_sample' in df:
            p_sample = p_sample*df['p_sample'].to_numpy(dtype='f8')

        p_sample = p_sample[in_bin]
        p_sample = p_sample/np.nansum(p_sample)

        df_res = df.drop(columns='p_sample', errors='ignore')[in_bin]
        return df_res.assign(p_sample=p_sample)

    return _inner

In [None]:
#export
def uniform_p_of_float_with_gaps(var_name,bins = 100):
    '''Scales sampling weights for an even distribution of the continuous variable, using quantile bins

    The rows are grouped in bins with roughly the same population (`pd.qcut`). Every row is
    weighted by the width of its bin relative to the value range, divided by the population of the bin.

    var_name: name of the column that holds the continuous variable
    bins: number of quantiles that is passed to `pd.qcut`, duplicate bin edges are dropped
    returns: function that maps a DataFrame to a new DataFrame with a normalized 'p_sample' column.
             An existing 'p_sample' column is multiplied row by row with the new weights.
             Rows keep their order and index, rows that fall into no bin are dropped and
             the passed DataFrame is not modified (no helper column is left behind).
    '''
    def _inner(df):
        values = df[var_name]
        value_range = values.max()-values.min()

        binned = pd.qcut(values,bins,duplicates='drop') #bins with roughly the same population
        codes = binned.cat.codes.to_numpy() #bin number per row, -1 marks rows without bin
        in_bin = codes >= 0
        widths = np.asarray(binned.cat.categories.length, dtype='f8')
        counts = np.bincount(codes[in_bin], minlength=len(widths))

        #sample_prob by bin width, corrected by the uneven population of the bins
        p_sample = np.full(len(df), np.nan)
        p_sample[in_bin] = widths[codes[in_bin]]/value_range/counts[codes[in_bin]]

        # combine with the previous weights row by row, both arrays are in the row order of `df`
        if 'p_sample' in df:
            p_sample = p_sample*df['p_sample'].to_numpy(dtype='f8')

        p_sample = p_sample[in_bin]
        p_sample = p_sample/np.nansum(p_sample)

        df_res = df.drop(columns='p_sample', errors='ignore')[in_bin]
        return df_res.assign(p_sample=p_sample)

    return _inner

In [None]:
def train_valid(df):
    '''Test helper: adds a boolean 'train' column that is True where the path contains 'train', modifies `df` in place'''
    path_strings = df['path'].astype(str)
    df['train'] = path_strings.str.contains('train', regex=False)
    return df

In [None]:
# %%time
# chain the weight transforms: every transform multiplies its weights onto an existing 'p_sample' column
tfm_lst = [train_valid, DfHDFCreateWindows(win_sz=1000+1,stp_sz=1000,clm='current') ,uniform_p_of_category('train'),uniform_p_of_float('l_slc'),uniform_p_of_float_with_gaps('r_slc')]
apply_df_tfms(get_hdf_files('test_data/'),tfm_lst) 

Unnamed: 0,path,train,l_slc,r_slc,p_sample
0,test_data/train/Sim_RealisticCycle1.hdf5,True,0,1001,0.000620
0,test_data/train/Sim_RealisticCycle1.hdf5,True,1000,2001,0.000620
0,test_data/train/Sim_RealisticCycle1.hdf5,True,2000,3001,0.000620
1,test_data/train/Sim_RealisticCycle2.hdf5,True,0,1001,0.000620
1,test_data/train/Sim_RealisticCycle2.hdf5,True,1000,2001,0.000620
...,...,...,...,...,...
0,test_data/train/Sim_RealisticCycle1.hdf5,True,264000,265001,0.001859
1,test_data/train/Sim_RealisticCycle2.hdf5,True,263000,264001,0.001859
1,test_data/train/Sim_RealisticCycle2.hdf5,True,264000,265001,0.001859
2,test_data/valid/Sim_RealisticCycle3.hdf5,False,263000,264001,0.001859


In [None]:
# same DataBlock as in the tbptt example, the items now come from the transform list with the weight transforms
seq = DataBlock(blocks=(SequenceBlock.from_hdf(['current','voltage'],TensorSequencesInput,clm_shift=[-1,-1]),
                        SequenceBlock.from_hdf(['voltage'],TensorSequencesOutput,clm_shift=[1])),
                 get_items=CreateDict(tfm_lst),
                 splitter=ApplyToDict(ParentSplitter()))
# combine weighted sampling with the tbptt loader by deriving the weighted class from TbpttDl
db = seq.dataloaders(get_hdf_files('test_data/'),dl_type=WeightedDL_Factory(TbpttDl),sub_seq_len=10)

In [None]:
db.train.wgts[:5],db.valid.wgts[:5]

(array([0.00093157, 0.00093157, 0.00093157, 0.00093157, 0.00093157]),
 array([0.00184989, 0.00184989, 0.00184989, 0.00277484, 0.00277484]))

# Mini Batch Limiter Dataloader

A dataloader that limits the number of minibatches per epoch. A factory method receives the base Dataloader class and returns the inherited batch-limited dataloader class.

In [None]:
#export
def BatchLimit_Factory(cls):
    '''
    Batch limited Dataloader that provides an upper limit for the number of mini batches per epoch
    max_batches: upper limit for minibatch count per epoch, None disables the limit

    cls: `TfmdDL` (sub)class to inherit from
    returns: the `BatchLimitDL` class derived from `cls`
    '''
    assert issubclass(cls, TfmdDL)

    class BatchLimitDL(cls):
        def __init__(self, dataset, max_batches=None, **kwargs):
            self.max_batches = max_batches
            super().__init__(dataset=dataset, **kwargs)

        def __len__(self):
            "Length of the parent loader, capped at `max_batches`"
            n_batches = super().__len__()
            if self.max_batches is not None: n_batches = min(n_batches,self.max_batches)
            return n_batches

        def __iter__(self):
            "Yield the batches of the parent loader and stop after `max_batches` of them"
            if self.max_batches is None:
                yield from super().__iter__()
                return
            if self.max_batches <= 0: return
            for n_yielded,b in enumerate(super().__iter__(), start=1):
                yield b
                # stop right after the last permitted batch, so the parent is never asked
                # to produce a batch that would be thrown away
                if n_yielded >= self.max_batches: break

    return BatchLimitDL

In [None]:
# ten items with bs=2 would give five batches, max_batches=3 caps the epoch at three
dl = BatchLimit_Factory(TfmdDL)([1,2]*5,bs=2,max_batches=3)

In [None]:
len(dl)

3

In [None]:
[x for x in dl]

[tensor([1, 2]), tensor([1, 2]), tensor([1, 2])]

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_models.ipynb.
Converted 01a_IndRNN.ipynb.
Converted 02_learner.ipynb.
Converted 03_dataloaders.ipynb.
Converted 11_dualrnn.ipynb.
Converted 12_TensorQuaternions.ipynb.
Converted 13_HPOpt.ipynb.
Converted index.ipynb.
