### Dataset Utilities for Algo-Fin Data

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
import sklearn.datasets as skds
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import math
from IPython import display
from time import sleep
import pickle
import pandas as pd

In [None]:
import import_ipynb
from feeds import DataFeed, BackFeed
from featfuncs import USE_COLS_DICT_ORIG,update_use_cols_dict 
from featfuncs import DiscretizeK,discretize_features_feed
from utils import MyDS

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
from datetime import datetime

In [None]:
def get_date_era(d):
    """Return the signed number of days between 31-Oct-2021 and date string *d*.

    *d* must be formatted as ``'%d-%b-%Y'`` (for example ``'05-Nov-2021'``).
    Dates before the reference day yield a negative value.
    """
    fmt = '%d-%b-%Y'
    parsed = datetime.strptime(d, fmt)
    origin = datetime.strptime('31-Oct-2021', fmt)
    elapsed = parsed - origin
    return elapsed.days

In [None]:
# Dataset to be created:
# - per ticker, the last `dwin` dates are test data and the dates before them are train data
# - train uses the even-indexed tickers, test uses the odd-indexed tickers
# - target is the 10-step return (default)
# - feed to use: DATAFEED (default)
# - counters to use per date: range(75,135,size), i.e. 3 samples per date when size=20
# - size per sample (default 20)
# - regression task (for now)
def make_regression_sample(feed,date,ticker,counter,size,retwins=(5,10),suffixes=('_5','_10')):
    """Cut one fixed-size window from a ticker/day frame and attach forward-return targets.

    Args:
        feed: object whose ``ndata[ticker][date]`` is a DataFrame containing a
            ``Close_n`` column.
        date: key of the day inside ``feed.ndata[ticker]``.
        ticker: key of the instrument inside ``feed.ndata``.
        counter: positional row at which the returned window starts.
        size: number of rows in the returned window.
        retwins: look-ahead horizons, in rows, used for the targets.
        suffixes: column-name suffixes paired element-wise with ``retwins``;
            horizon ``r`` with suffix ``s`` yields the column ``'target' + s``.

    Returns:
        A new DataFrame holding rows ``counter`` .. ``counter + size - 1`` of
        the day (fewer if the day is shorter), with one extra ``target*``
        column per horizon equal to ``Close_n[i + r] - Close_n[i]``. The value
        is NaN where row ``i + r`` lies beyond the extracted slice. The frame
        stored in ``feed`` is not modified.
    """
    maxretwin = max(retwins)
    data = feed.ndata[ticker][date]
    # Number of context rows kept in front of the window. For a small
    # non-negative counter the start is clamped to row 0; otherwise
    # ``counter - maxretwin`` would go negative and iloc would wrap around
    # to the end of the frame.
    lead = min(counter, maxretwin) if counter >= 0 else maxretwin
    # Explicit copy: the new columns are written to our own frame, not to a
    # slice of the feed's data (avoids SettingWithCopyWarning).
    batch = data.iloc[counter - lead:counter + size + maxretwin].copy()
    close = batch['Close_n']
    for retwin, suffix in zip(retwins, suffixes):
        batch['target' + suffix] = close.shift(-retwin) - close
    return batch.iloc[lead:lead + size]
def create_algofin_dataset(feed,kind='train',discretize=False,estgiven=False,
                           cols='mincols',batch_size=32,size=20,
                           DkT5=None,DkT10=None,DkD=None,SYN=False):
    """Build the frame, Dataset and DataLoader for one split of a feed.

    Samples are windows of ``size`` rows taken at counters
    ``range(75, 135, size)`` for every selected ticker and date.

    Args:
        feed: object exposing ``tickers`` (indexable) and
            ``ndata[ticker][date]`` frames.
        kind: ``'train'`` (even-indexed tickers, all dates except the first
            and the last ``dwin``) or ``'test'`` (odd-indexed tickers, last
            ``dwin`` dates).
        discretize: when true, run the ``DiscretizeK`` objects over the
            targets and feature columns and use the resulting ``*_val``
            columns.
        estgiven: when true (and ``discretize`` is true), use the supplied
            ``DkT5``/``DkT10``/``DkD`` instead of creating new ones.
        cols: key into the column dictionary selecting the feature columns.
        batch_size: batch size of the returned DataLoader.
        size: rows per sample, also the step between counters.
        DkT5, DkT10: discretizers for the 5- and 10-step targets.
        DkD: dict mapping feature column name to its discretizer.
        SYN: when true, use only ticker index 0 and set ``dwin`` to half the
            number of dates in ``feed.ndata['SYN']``.

    Returns:
        ``(df, ds, dsloader, DkT5, DkT10, DkD)`` where ``df`` holds the
        feature columns followed by the 10-step and 5-step target columns.

    Raises:
        ValueError: if ``kind`` is not ``'train'``/``'test'``, or if
            ``discretize`` and ``estgiven`` are set but a discretizer is
            missing.
    """
    if kind not in ('train', 'test'):
        raise ValueError(f"kind must be 'train' or 'test', got {kind!r}")
    is_train = kind == 'train'

    counters = list(range(75, 135, size))
    USE_COLS_DICT = update_use_cols_dict(USE_COLS_DICT_ORIG)
    COLS = USE_COLS_DICT[cols]

    if SYN:
        tickers = [0]
        dwin = len(list(feed.ndata['SYN'])) // 2
    else:
        # Even positions feed the train split, odd positions the test split.
        tickers = list(range(0 if is_train else 1, len(feed.tickers), 2))
        dwin = 10

    dfL = []
    for ti in tickers:
        t = feed.tickers[ti]
        all_dates = list(feed.ndata[t])
        # Train skips the first date and the trailing dwin dates; test is
        # exactly those trailing dwin dates.
        dates = all_dates[1:-dwin] if is_train else all_dates[-dwin:]
        for d in dates:
            for counter in counters:
                sample = make_regression_sample(feed, d, t, counter, size)
                dfL.append(sample[['ticker'] + COLS + ['target_5', 'target_10', 'Date']])

    df = pd.concat(dfL, axis=0)
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df['era'] = df['Date'].apply(get_date_era)

    # 'row_num' is always appended explicitly below, so keep it out of the
    # per-column feature list (compare by value, not identity).
    feature_cols = [c for c in COLS if c != 'row_num']

    if discretize:
        if not estgiven:
            DkT5 = DiscretizeK()
            DkT10 = DiscretizeK()
            DkD = {c: DiscretizeK() for c in COLS}
        elif DkT5 is None or DkT10 is None or DkD is None:
            raise ValueError("estgiven=True requires DkT5, DkT10 and DkD to be provided")
        DkT5.discretizeK(df, col='target_5')
        DkT10.discretizeK(df, col='target_10')
        for c in feature_cols:
            DkD[c].discretizeK(df, col=c, zeromean=False)
        x_cols = [c + '_val' for c in feature_cols] + ['row_num', 'era']
        t10_col, t5_col = 'target_10_val', 'target_5_val'
    else:
        x_cols = feature_cols + ['row_num', 'era']
        # NOTE(review): no discretizer ran in this branch, so the '*_val'
        # target columns are normally absent; fall back to the raw targets.
        t10_col = 'target_10_val' if 'target_10_val' in df.columns else 'target_10'
        t5_col = 'target_5_val' if 'target_5_val' in df.columns else 'target_5'

    X, y = df[x_cols].values, df[t10_col].values
    ds = MyDS(X, y, task='regression')
    dsloader = torch.utils.data.DataLoader(dataset=ds, batch_size=batch_size, shuffle=False)
    return df[x_cols + [t10_col, t5_col]], ds, dsloader, DkT5, DkT10, DkD

In [None]:
def create_algofin_data(feed,cols='mincols',size=20,batch_size=32,
                        discretize=False,estgiven=False,DkT5=None,DkT10=None,DkD=None,SYN=False):
    """Build matching train and test datasets from one feed.

    The train split is created first; the discretizers it returns are then
    passed to the test split with ``estgiven=True`` so both splits go through
    the same discretizer objects.

    Args:
        feed: data feed handed to ``create_algofin_dataset``.
        cols: key selecting the feature columns.
        size: rows per sample, applied to both splits.
        batch_size: batch size of both DataLoaders.
        discretize: whether to discretize targets and features.
        estgiven: whether the supplied ``DkT5``/``DkT10``/``DkD`` should be
            used for the train split instead of new ones.
        DkT5, DkT10, DkD: optional discretizers used when ``estgiven`` is true.
        SYN: forwarded to ``create_algofin_dataset``.

    Returns:
        ``(df_train, ds_train, train_loader, df_test, ds_test, test_loader,
        DkT5, DkT10, DkD)``.
    """
    # Arguments shared by both splits. ``size`` is included so the train split
    # no longer silently falls back to the default window length.
    shared = dict(cols=cols, size=size, batch_size=batch_size,
                  discretize=discretize, SYN=SYN)
    df_train, ds_train, train_loader, DkT5, DkT10, DkD = create_algofin_dataset(
        feed, kind='train', estgiven=estgiven,
        DkT5=DkT5, DkT10=DkT10, DkD=DkD, **shared)
    df_test, ds_test, test_loader, _, _, _ = create_algofin_dataset(
        feed, kind='test', estgiven=True,
        DkT5=DkT5, DkT10=DkT10, DkD=DkD, **shared)
    return df_train, ds_train, train_loader, df_test, ds_test, test_loader, DkT5, DkT10, DkD