### Resources
https://medium.com/analytics-vidhya/finding-data-block-nirvana-a-journey-through-the-fastai-data-block-api-part-2-9b23ea5d83ee
    

## Imports

In [None]:
# Create a data set ....
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.init import xavier_normal_ , uniform_
from torch.utils.data import Dataset,DataLoader
import fastai
from fastai.data_block import FloatList
from fastai.basic_train import *
from fastai.metrics import *
import random; 

import os

def model_summary(net) :
    """Print a per-parameter table for a model followed by the total count.

    Args:
        net: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Every parameter yielded by ``named_parameters()`` is counted, regardless
    of its ``requires_grad`` flag.
    """
    row = "{:<30}{:<30}{:<20}"
    rule = "=" * 70
    print(row.format("Layer", "Weight Size", "#Params"))
    print(rule)
    total = 0
    for name, param in net.named_parameters():
        # numel() is an exact int for every shape; np.prod over the empty
        # shape of a 0-dim parameter would give the float 1.0 instead.
        count = param.numel()
        print(row.format(name, str(param.size()), count))
        total += count
    print(rule)
    print("Total params: {}".format(total))

def set_device(MODE,num=0) :
    """Pick the torch device to run on.

    Args:
        MODE: "GPU" requests a CUDA device; any other value selects the CPU.
        num: GPU index exported through ``CUDA_VISIBLE_DEVICES`` (GPU mode only).

    Returns:
        ``torch.device('cuda')`` when GPU mode is requested and CUDA is usable,
        otherwise ``torch.device('cpu')``.
    """
    if MODE != "GPU":
        return torch.device('cpu')

    # NOTE(review): this presumably only takes effect if CUDA has not been
    # initialised yet in this process - confirm when calling it more than once.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(num)
    cuda_ok = torch.cuda.is_available()
    print('CUDA available: {}  Using device {}'.format(cuda_ok, os.environ['CUDA_VISIBLE_DEVICES']))
    if not cuda_ok:
        # Returning a cuda device here would only defer the failure to the
        # first `.to(device)` call, so fall back explicitly.
        print("CUDA is not available, falling back to CPU")
        return torch.device('cpu')
    return torch.device('cuda')


In [None]:
# Module-level device handle; later cells move models and tensors to `d`.
d = set_device("GPU",2)

## Create a Synthetic DataSet

In [None]:
def create_data_set(NP=1000,p=0.05) :
    """Build a synthetic "sine wave with random resets" series.

    Args:
        NP: number of samples.
        p: per-sample probability of drawing a reset (a 0 in the flag array).

    Returns:
        Tuple ``(ii, X, y, columns)`` where
        ``ii`` holds the x positions, ``X`` is an ``(NP, 4)`` array with the
        columns ``a`` (plain sine), ``b`` (reset flag, 0 = reset), ``b2``
        (samples since the last reset) and ``c_prev`` (target delayed by one
        sample), ``y`` is the target ``c`` (the sine replayed from its start
        after every reset) and ``columns`` lists the names of ``ii``, the four
        ``X`` columns and ``c``.
    """
    x_range = int(NP/10)  # one sample roughly every 0.1 along x
    print("Creating Dataset : Num Points = {} Reset prob = {}".format(NP,p))
    print("Sine wave with {} samples over x range of {}".format(NP,x_range))

    ii = np.linspace(0, x_range, num=NP)
    a = np.sin(ii)
    b = np.random.choice([0,1], size=NP, p=[p,1-p])

    b2 = np.zeros(NP)
    c = np.zeros(NP)
    since_reset = 0
    for pos in range(NP):
        # The very first sample always counts as a reset.
        if pos == 0 or b[pos] == 0:
            since_reset = 0
            b[pos] = 0
        b2[pos] = since_reset
        c[pos] = a[since_reset]
        since_reset += 1

    # Target delayed by one step; the first entry stays 0.
    c_prev = np.zeros(NP)
    c_prev[1:] = c[:-1]

    X = np.column_stack((a, b, b2, c_prev))
    columns = ['ii','a','b','b2','c_prev','c']
    return (ii, X, c, columns)
    

In [None]:
class CustomDataset(Dataset):
    """Sliding-window dataset over the synthetic series from ``create_data_set``.

    Item ``idx`` is a window of ``Tx`` consecutive feature rows together with
    ``Tx`` targets that start ``y_offset`` steps later.

    Args:
        NP: number of points to generate.
        Tx: window (sequence) length.
        y_offset: number of steps the targets are shifted into the future.
        num_features: 1 -> reset flag only (column 1); 2 -> reset flag and
            counter (columns 1-2); 4 -> all four columns. Any other value
            also returns all columns unchanged.
    """
    def __init__(self,NP,Tx=70,y_offset=0,num_features=1):
        ii,x,y,columns = create_data_set(NP=NP)
        self.x=x
        self.y=y
        self.columns=columns
        self.NP=NP
        self.Tx=Tx
        self.y_offset=y_offset
        self.num_features=num_features
        if(num_features == 1) : print("Warning only returning one feature (reset).\n  Hack around with CustomDataset to get what you want")
        elif(num_features == 2) : print("Warning only returning two features (reset and counter)\n.  Hack around with CustomDataset to get what you want")
        elif(num_features == 4) : print("Warning only returning 4 features (orig_sin, reset and counter, prev)\n.  Hack around with CustomDataset to get what you want")
        else : print("Warning , verify what you want and add some code here")
        self.c=Tx # fastai requirement
        self.loss_func=nn.MSELoss()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(X, y)`` float tensors for the window starting at ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``; this
                also lets plain ``for item in dataset`` iteration terminate.
        """
        n = len(self.x)
        if idx < 0:
            idx += n
        if not 0 <= idx < n:
            raise IndexError("index {} out of range for dataset of length {}".format(idx, n))

        # Simple fix for windows that would run past the end of the series:
        # shift the start back by a full span (inputs plus target offset) so
        # both slices stay in bounds. Uses instance state, not a global NP.
        span = self.Tx + self.y_offset
        if idx > n - span - 1:
            idx = max(idx - span, 0)

        X=self.x[idx:idx+self.Tx]
        y=self.y[idx+self.y_offset:idx+self.Tx+self.y_offset]
        if(len(X) < self.Tx) :
            print("error idx = {}".format(idx))

        if(self.num_features == 1) : # just return the 'reset signal'
            X=X[:,1:2]
        elif(self.num_features == 2) : # reset signal and counter
            X=X[:,1:3]
        # any other value: keep every column

        X = torch.from_numpy(X).type(torch.FloatTensor)
        y = torch.from_numpy(y).type(torch.FloatTensor)
        return X,y

# Custom Collate function to take a set of tuples (Seqlen x numfeatures) and convert to
# (Seqlen x batch x numfeatures)
def collate_fn(data):
    """Batch ``(X, y)`` samples along a new second axis.

    Args:
        data: list of ``(X, y)`` tuples as produced by the dataset.

    Returns:
        ``(X, y)`` where the per-sample tensors have been stacked on
        dimension 1, i.e. the batch index comes right after the sequence index.
    """
    features, targets = zip(*data)
    batched_features = torch.stack(features, dim=1)
    batched_targets = torch.stack(targets, dim=1)
    return batched_features, batched_targets


In [None]:
print("np.sin is in radians")
# Series length for this exploratory dataset (rebound by later cells).
NP=10000
# All four feature columns, windows of 70 steps.
dataset = CustomDataset(NP=NP,Tx=70,num_features=4)
# First window, kept for the plots and the DataFrame further down.
(x0,y0) = dataset[0]
# collate_fn stacks samples on dimension 1, so batches are (seq, batch, ...).
dl = DataLoader(dataset, collate_fn=collate_fn, batch_size=6,shuffle=True)

In [None]:
# Smoke test: pull 100 batches from the loader.
# NOTE(review): iter(dl) builds a fresh iterator on every pass, so each pass
# takes the first batch of a new iterator rather than walking one epoch.
for _ in range(100) :
    (X,y) = next(iter(dl))
    type(X)
#print(X.size())
# NOTE(review): the result of .to(d) is not assigned, so X itself is not
# rebound here - presumably only used to display the moved tensor.
X.to(d)

### Visualize Data 

In [None]:
# x0[:,0] is column `a`: the plain sine wave.
# x0[:,1] is column `b`: the reset flag, 0 where the target restarts.
plt.figure(figsize=(25,5))
display(plt.plot(x0[:,0],'-bo',markersize=5))
display(plt.plot(x0[:,1],'-bo',markersize=5))

# y0 is the target for the same window (green).
display(plt.plot(y0,'-go',markersize=10))


### Chuck it in a DF


In [None]:
print(x0.shape)
print(y0.shape)
# One row per time step of the first window; column names follow the order
# in which create_data_set stacks the features.
df=pd.DataFrame(x0, columns=['a','b','b2','c_prev'])
df["y"] = y0

df.head(20) # 2*pi 6.28

## RNN Using pytorch RNN library

In [None]:
# Initialize with Xavier normal distribution
def weights_xavier(m):
    """Xavier-normal initialisation hook, meant for ``module.apply(...)``.

    * ``nn.Conv2d``: weight is re-initialised, bias is left untouched.
    * ``nn.Linear``: weight is re-initialised, bias (if present) is zeroed.
    * ``nn.RNN``: every weight matrix of every layer is re-initialised and
      every bias vector (if present) is zeroed.

    Other module types are ignored.
    """
    if isinstance(m, nn.Conv2d):
        xavier_normal_(m.weight.data)
    elif isinstance(m, nn.Linear) :
        xavier_normal_(m.weight.data)
        # Linear(bias=False) has bias set to None.
        if m.bias is not None:
            m.bias.data.fill_(0)
    elif isinstance(m, nn.RNN) :
        # Walk the RNN's own parameters by name so stacked layers and
        # bias-free RNNs are handled, not just weight_*_l0 / bias_*_l0.
        for name, param in m.named_parameters(recurse=False):
            if name.startswith("weight"):
                xavier_normal_(param.data)
            elif name.startswith("bias"):
                param.data.fill_(0)

### Instantiate DVRNN

![LSTM / RNN diagram](https://github.ibm.com/vanstee/aicoc-ai-immersion/raw/master/nb_images/lstm_rnn.png)
![LSTM / RNN diagram](../nb_images/lstm_rnn.png)

In [None]:
# https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/rnn.html

class DVRNN(nn.Module) :
    """``nn.RNN`` followed by a linear read-out to one value per time step.

    Args:
        nf: number of input features per time step.
        tx: sequence length the read-out is applied to.
        hs: hidden state size of the RNN.
        nl: number of stacked RNN layers.
    """
    def __init__(self,nf,tx,hs,nl) :
        super(DVRNN,self).__init__()
        self.name = "DVRNN"
        self.tx=tx
        self.num_features=nf
        self.hidden_size=hs
        self.num_layers=nl
        self.rnn = nn.RNN(nf, hs, nl)
        self.fc = nn.Linear(hs, 1)

    def forward(self, x, all_outputs=True):
        """Run the RNN over ``x`` and project the hidden outputs to scalars.

        Args:
            x: input indexed as (time step, batch, feature); the batch size is
                read from dimension 1.
            all_outputs: when true, return a prediction for each of the first
                ``tx`` time steps; otherwise only for time step ``tx - 1``.

        Returns:
            ``(out, hidden)`` where ``out`` has shape ``(tx, batch, 1)`` (or
            ``(1, batch, 1)`` when ``all_outputs`` is false) and ``hidden`` is
            the final RNN hidden state, detached from the graph.
        """
        # Follow the module's own parameters instead of a notebook global, so
        # the model works wherever it has been moved.
        device = next(self.parameters()).device
        x = x.to(device)
        bs = x.size(1)
        # The hidden state starts from zeros on every call.
        hidden_init = torch.zeros(self.num_layers, bs, self.hidden_size, device=device)

        out1, hidden = self.rnn(x, hidden_init)
        if all_outputs:
            steps = out1[:self.tx]
        else:
            # Indexing (not slicing) keeps the IndexError for too-short input.
            steps = out1[self.tx - 1].unsqueeze(0)
        # nn.Linear acts on the last dimension, so one call covers every
        # (time step, batch) pair without Python-level loops.
        out3 = self.fc(steps)
        return out3,hidden.detach()

    # reset all parameters of model
    def init_params(self) :
        self.apply(weights_xavier)


## [skip] Train RNN

### tests skip

## FastAI Implementation

In [None]:
#https://johaupt.github.io/python/fastai/pytorch/fastai_custom_network_module.html


### Build a fastai databunch from np array


In [None]:
# Run configuration for the custom RNN experiment.
# Features include the time-lagged target (c_prev).
# These are module-level names read by print_run() and the cells below.
torch.manual_seed(0)
NP=3008          # number of points in the generated series
Tx = 70          # sequence length
hidden_size = 50  # number of features of hidden state
num_layers  = 1  # this is for stacked implementations.  Keep 1 for now
batch_size = 16
num_features = 4 # aka input_size, aka number of columns in X
# NOTE(review): reset_prob is only printed; CustomDataset calls
# create_data_set without p, so the generator's default is what is used.
reset_prob = 0.05

#hidden_rnn = torch.zeros(num_layers, batch_size, hidden_size).to(d)
#hidden_lstm = (torch.zeros(num_layers, batch_size, hidden_size).to(d),torch.zeros(num_layers, batch_size, hidden_size).to(d))
#dataset = CustomDataset(NP=NP,Tx=Tx)
#dataloader = DataLoader(dataset, collate_fn=collate_fn, 
#                        batch_size=batch_size,shuffle=True,num_workers=30)
def print_run() :
    """Print the module-level run settings, one ``name : value`` line each."""
    report = (
        ("NP           : {}", NP),
        ("num_features : {}", num_features),
        ("Tx : {}", Tx),
        ("hidden_size : {}", hidden_size),
        ("num_layers : {}", num_layers),
        ("batch_size : {}", batch_size),
        ("reset_prob : {}", reset_prob),
    )
    for template, value in report:
        print(template.format(value))
print_run()


# Re-seed so the model's initial weights are reproducible.
torch.manual_seed(1)

# Build the model on device `d`, then apply the Xavier initialisation hook.
dvrnn = DVRNN(num_features,Tx,hidden_size,num_layers).to(d)
dvrnn.init_params()

#dvlstm = DVLSTM(num_features,Tx,hidden_size,num_layers).to(d)
#dvlstm.init_params()

#dvrnn.train()
#optimizer = optim.SGD(dvrnn.parameters(), lr=0.1)


In [None]:
# DataBunch.create(dataset=,collate_fn=)
from fastai.basic_data import *
# Two separately generated series: one for training, one for validation.
# The window length comes from the run configuration (Tx) so the data
# always matches the sequence length the model was built with.
dataset0 = CustomDataset(NP=NP,Tx=Tx,num_features=num_features)
dataset1 = CustomDataset(NP=NP,Tx=Tx,num_features=num_features)
# Sample from the training set built above, not the stale `dataset`
# left over from the exploratory cell.
(x0,y0) = dataset0[0]
#mdb=DataBunch.create(train_ds=dataset0,valid_ds=dataset1,collate_fn=collate_fn)
dlt = DataLoader(dataset0, collate_fn=collate_fn, batch_size=batch_size,shuffle=True)
dlv = DataLoader(dataset1, collate_fn=collate_fn, batch_size=batch_size,shuffle=True)
mdb=DataBunch(train_dl=dlt,valid_dl=dlv,collate_fn=collate_fn)

x,y = mdb.one_batch()
#mdb.show_batch()


In [None]:
# Smoke test: push one training batch through the untrained model.
dl = DataLoader(dataset0, collate_fn=collate_fn, batch_size=batch_size,shuffle=True)
(X,y) = next(iter(dl))
dvrnn(X)
#print(type(X))

In [None]:

%load_ext autoreload
%autoreload 2
from fastai.basic_train import *
import fastai.train  
from fastai.metrics import *
# NOTE(review): DVRNN.forward returns a tuple (output, hidden) and no
# loss_func is passed here - confirm the Learner's loss accepts that output.
learner = fastai.train.Learner(data=mdb, model=dvrnn, metrics=None) # ,metrics=accuracy)

# def tabular_learner(data:DataBunch, layers:Collection[int], emb_szs:Dict[str,int]=None, metrics=None,
#         ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, **learn_kwargs):
#     "Get a `Learner` using `data`, with `metrics`, including a `TabularModel` created using the remaining params."
#     emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))
#     model = TabularModel(emb_szs, len(data.cont_names), out_sz=data.c, layers=layers, ps=ps, emb_drop=emb_drop,
#                          y_range=y_range, use_bn=use_bn)
#     return Learner(data, model, metrics=metrics, **learn_kwargs)
# 


In [None]:
#learner.lr_find()
# Display the module wrapped by the learner.
learner.model

In [None]:
#learner.fit_one_cycle()
#dir(learner)
# Run fastai's learning-rate finder on the learner.
learner.lr_find()
#learner.data.

In [None]:
# Grab one batch from the learner's training loader for inspection.
x,y = next(iter(learner.data.train_dl))

In [None]:
# Inspect the target batch; only the last expression is echoed by the notebook.
type(y)
y.size()
def xxx(*x) :
    """Debug helper: print the type and size of every positional argument.

    ``*x`` collects the arguments into a tuple, so each element is inspected
    in turn (a tuple itself has no ``size()``).
    """
    for item in x:
        print(type(item))
        print(item.size())

# Try the debug helper on the target batch.
xxx(y)

In [None]:
# Fit call with positional arguments 1 and 0.001, and weight decay wd=0.001.
learner.fit(1, 0.001, callbacks=None, wd=0.001)

In [None]:
# Sample code, not used ...

# split_by_idxs
#  ok, so here split_by_idxs requires 2 collections of indexes ..
# train_idxs = range(0,10)
# val_idxs = range(20,25)
# db1 = FloatList(items=X,ignore_empty=True).\
#      split_by_idxs(train_idx=train_idxs, valid_idx=val_idxs) # .label_from_func(get_float_labels, label_cls=FloatList)

## Databunch.add
# .add(FloatList(items=X))  , add extra data here ...
# ItemList class has all the goodies and methods implemented.  Look there for code examples

# Example how to extend an existing datatype
class NPList(FloatList) :
    """Minimal ``FloatList`` subclass used to hold rows of a numpy array.

    Shows how an existing fastai item list type can be extended.
    """
    def __init__(self, items, classes=None, label_delim=None, **kwargs):
        # label_delim is accepted but deliberately not forwarded to FloatList.
        super().__init__(items, classes=classes, **kwargs)

    def show_xys(self, xs, ys, **kwargs) :
        """Placeholder for batch display; accepts and ignores display kwargs."""
        print("NotImplemented [yet]")

# Numpy Arrays
# NOTE(review): this rebinds NP, X, y and mdb, which earlier cells (and the
# learner built from the previous mdb) also use - rerun those cells afterwards.
NP=1000
ii,X,y,columns = create_data_set(NP=NP)
tv_split=range(700,1000) # Indexes 700-999 will be used for validation ...

# Using Datablocks API
# NOTE(review): _label_from_list is a private fastai method - confirm it
# accepts the full-length y after the split.
db = NPList(items=X).split_by_idx(tv_split)._label_from_list(y, label_cls=NPList)
db.train.get(1)
print(db) # .num_parts
mdb = db.databunch()
mdb.show_batch()