In [10]:
import import_ipynb
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler

In [3]:
from dataclasses import dataclass, fields, asdict, replace

In [11]:
import pickle

In [3]:
# Column names of the technical-analysis indicator features.
# NOTE(review): the names look like generated indicator labels (moving averages,
# RSI, Bollinger bands, MACD) - confirm against the code that builds the frame.
TA_COLS=[
    'SMA_10',
    'SMA_20',
    'VOL_SMA_20',
    'RSI_14',
    'BBL_5_2.0',
    'BBM_5_2.0',
    'BBU_5_2.0',
    'BBB_5_2.0',
    'BBP_5_2.0',
    'MACD_12_26_9',
    'MACDh_12_26_9',
    'MACDs_12_26_9',
]
# Kept as an independent literal; it currently holds the same twelve names as TA_COLS.
TA_COLS_OLD=[
    'SMA_10',
    'SMA_20',
    'VOL_SMA_20',
    'RSI_14',
    'BBL_5_2.0',
    'BBM_5_2.0',
    'BBU_5_2.0',
    'BBB_5_2.0',
    'BBP_5_2.0',
    'MACD_12_26_9',
    'MACDh_12_26_9',
    'MACDs_12_26_9',
]

In [13]:
def get_data(df,index,ticker,date,
             cols=None,outcols=None,latest=False,filter_data=True):
    """Return the leading rows (or a single row) of one ticker/date slice of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Needs 'ticker' and 'Date' columns when ``filter_data`` is true.
    index : int
        Positional row bound. Rows ``0 .. index-1`` are returned, or only row
        ``index-1`` when ``latest`` is true.
    ticker, date
        Values matched against the 'ticker' and 'Date' columns.
    cols : list-like, optional
        Columns to keep. Defaults to every column of ``df``.
    outcols : dict, optional
        Mapping passed to ``DataFrame.rename(columns=...)`` on the result.
    latest : bool
        When true, return only the row at position ``index-1``.
    filter_data : bool
        When false, ``df`` is used as-is and ``ticker``/``date`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is never modified.
    """
    if filter_data:
        df1=df.loc[(df['ticker']==ticker)&(df['Date']==date)]
    else:
        df1=df
    # Identity test, not `==None`: comparing an Index/array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if cols is None: cols=df.columns
    if latest:
        # List indexer keeps the result a one-row DataFrame instead of a Series.
        df2=df1.iloc[[index-1]][cols]
    else:
        df2=df1.iloc[0:index][cols]
    # Non-inplace rename: renaming a derived slice in place triggers pandas'
    # chained-assignment warnings.
    if outcols is not None: df2=df2.rename(columns=outcols)
    return df2

In [11]:
#Create combinations for training/testing data - (Date in dates, index in [first:last])
def generate_data(dft,dates,config,window,ticker,scale=1,stride=1):
    """Build growing-window samples for one ticker over the given dates.

    For every date in ``dates`` that has rows for ``ticker`` and for every
    ``index`` in ``window`` (expected in ascending order), the first ``index``
    rows of that day, taken every ``stride`` rows, form one sample.

    Parameters
    ----------
    dft : pandas.DataFrame
        Needs 'ticker', 'Date', 'Close' and every column named in ``config``.
    dates : iterable
        Values matched against the 'Date' column.
    config : object
        Must expose list attributes ``prev_cols`` (columns divided by 'Close'),
        ``data_cols``, ``tar_cols`` and ``flat_features``.
    window : iterable of int
        Row counts to cut samples at; iteration for a date stops at the first
        value larger than the number of rows available.
    ticker
        Value matched against the 'ticker' column.
    scale : number
        Samples and flat vectors are stored as ``scale*(value-1)``.
    stride : int
        Step of the positional row slice.

    Returns
    -------
    tuple
        ``(samples, labels, flatdata, flatfeatures, dates, tickers, idx, dfs)``.
        ``samples`` holds tensors of shape (1, rows, len(data_cols));
        ``flatdata`` / ``flatfeatures`` hold the last row's data / flat columns
        as (1, n) tensors; ``labels`` is a float tensor stacked over samples
        (an empty list when no sample was produced); the remaining entries are
        parallel Python lists.
    """
    samplesL,labelsL,flatdataL,flatfeaturesL=[],[],[],[]
    tickersL,datesL,idxL,dfL=[],[],[],[]
    keep_cols=config.data_cols+config.tar_cols+config.flat_features
    # The ticker filter does not depend on the date, so apply it once.
    df_ticker=dft.loc[dft['ticker']==ticker]
    for d in dates:
        # Explicit copy: columns are overwritten below and must not be written
        # through a slice of the caller's frame.
        dfd=df_ticker.loc[df_ticker['Date']==d].copy()
        # A date without rows is skipped; it must not discard the later dates.
        if dfd.empty: continue
        # Snapshot the divisor so the result does not depend on where 'Close'
        # sits in prev_cols.
        close=dfd['Close'].copy()
        for c in config.prev_cols: dfd[c]=dfd[c]/close
        dfd=dfd[keep_cols]
        nrows=dfd.shape[0]
        for index in window:
            # Check before slicing: iloc clips at nrows, so a larger index would
            # only repeat the previous sample under a wrong index.
            if index>nrows: break
            df=dfd.iloc[0:index:stride]
            last_row=df.iloc[-1]
            labels=last_row[config.tar_cols].values
            flatdata=scale*(torch.tensor(last_row[config.data_cols].values).unsqueeze(0)-1)
            flatfeatures=scale*(torch.tensor(last_row[config.flat_features].values).unsqueeze(0)-1)
            samples=scale*(torch.tensor(df[config.data_cols].values).unsqueeze(0)-1)
            samplesL.append(samples)
            labelsL.append(labels)
            flatdataL.append(flatdata)
            flatfeaturesL.append(flatfeatures)
            tickersL.append(ticker)
            datesL.append(d)
            idxL.append(index)
            dfL.append(df)
    if len(labelsL)>0:
        # Stack into one ndarray first; building a tensor from a list of
        # ndarrays is slow. squeeze(1) drops the column axis for a single target.
        labelsL=torch.tensor(np.stack([l.astype('float') for l in labelsL])).squeeze(1)
    return samplesL,labelsL,flatdataL,flatfeaturesL,datesL,tickersL,idxL,dfL

In [15]:
class TsDS(Dataset):
    """Dataset over parallel lists of sample tensors and label tensors.

    ``samples[i]`` / ``labels[i]`` are whatever was passed in ``XL[i]`` /
    ``yL[i]`` (a single sample or an already-batched tensor), converted to
    tensors; samples are cast to float32.

    Note: ``__len__`` sums the first dimension of every stored sample tensor,
    while ``__getitem__`` indexes the stored tensors themselves, so ``len(ds)``
    is larger than the number of valid indices whenever a stored tensor has
    more than one row.

    Attributes
    ----------
    flatten : bool
        When true, ``__getitem__`` flattens each sample from dim 1 onwards.
    lno : int or None
        Label column returned by ``__getitem__``; ``None`` returns the label
        tensor untouched.
    long : bool
        With ``lno`` set, cast the selected label to int64 (true) or float32.
    scaler : StandardScaler
        Default scaler used by ``fit`` / ``scale`` / ``unscale``.
    """
    def __init__(self, XL,yL,flatten=False,lno=None,long=True):
        self.flatten=flatten
        self.lno=lno
        self.long=long
        self.scaler = StandardScaler()
        self.samples=[]
        self.labels=[]
        for X,Y in zip(XL,yL):
            self.samples.append(self._to_tensor(X).float())
            self.labels.append(self._to_tensor(Y))

    @staticmethod
    def _to_tensor(value):
        """Copy ``value`` into a new tensor without the copy-construct warning."""
        if isinstance(value,torch.Tensor): return value.detach().clone()
        return torch.tensor(value)

    @staticmethod
    def _as_numpy(tensor):
        """Hand the scaler a plain ndarray instead of a torch tensor."""
        return tensor.detach().cpu().numpy()

    def __len__(self):
        return sum(s.shape[0] for s in self.samples)

    def __getitem__(self, idx):
        sample=self.samples[idx]
        if self.flatten: sample=sample.flatten(start_dim=1)
        label=self.labels[idx]
        if self.lno is not None:
            # Column selection requires the stored label tensor to be 2-D.
            label=label[:,self.lno]
            label=label.long() if self.long else label.float()
        return (sample,label)

    def fit(self,kind='seq'):
        """Fit ``self.scaler``.

        'seq' fits on the last step along dim 1 of every stored sample (kept in
        ``self.lastelems``); 'flat' fits on all stored samples concatenated
        along dim 0. Any other ``kind`` does nothing.
        """
        if kind=='seq':
            self.lastelems=[torch.cat([s[:,-1,:] for s in self.samples],dim=0)]
            self.scaler.fit(self._as_numpy(self.lastelems[0]))
        elif kind=='flat':
            self.scaler.fit(self._as_numpy(torch.cat(list(self.samples),dim=0)))

    def _rescale(self,transform,kind):
        """Replace every stored sample with ``transform`` applied to it."""
        if kind=='seq':
            def apply(s):
                # Collapse the first two dims so the scaler sees 2-D input,
                # then restore the original shape.
                flat=s.reshape(s.shape[0]*s.shape[1],s.shape[2])
                return torch.tensor(transform(self._as_numpy(flat)).reshape(tuple(s.shape))).float()
        elif kind=='flat':
            def apply(s):
                return torch.tensor(transform(self._as_numpy(s))).float()
        else:
            # Unknown kinds are ignored, matching the previous behaviour.
            return
        self.samples=[apply(s) for s in self.samples]

    def scale(self,kind='flat',scaler=None):
        """Transform the stored samples in place with ``scaler`` (default ``self.scaler``).

        Note the default ``kind`` is 'flat' here while ``fit`` defaults to 'seq'.
        """
        if scaler is None: scaler=self.scaler
        self._rescale(scaler.transform,kind)

    def unscale(self,kind='flat',scaler=None):
        """Inverse of ``scale``: apply ``scaler.inverse_transform`` to the stored samples."""
        if scaler is None: scaler=self.scaler
        self._rescale(scaler.inverse_transform,kind)

In [12]:
class TickDS():
    """Builds ``TsDS`` datasets for every ticker in a frame via ``generate_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'ticker' column; passed unchanged to ``generate_data``.
    train_dates, test_dates
        Date lists used for ``kind='train'`` / ``kind='test'``.
    config
        Column configuration forwarded to ``generate_data``.
    start_idx, winlen : int
        Samples are cut at every row count in ``range(start_idx, winlen)``.
    padding : int, optional
        When set, sequence samples are padded/truncated to this length and
        batched with ``trunc_pad_batch``. When ``None`` they are left as
        produced, unbatched.
    batch_size : int
        Batch size handed to ``trunc_pad_batch`` and ``batch``.
    """
    def __init__(self,df,train_dates,test_dates,config,start_idx,winlen,
                 padding=None,batch_size=1):
        self.df=df
        self.train_dates=train_dates
        self.test_dates=test_dates
        self.config=config
        self.start_idx=start_idx
        self.winlen=winlen
        self.padding=padding
        self.tickers=df['ticker'].unique()
        self.batch_size=batch_size

    def create_dataset(self,targetL,kind='train',tickers=None):
        """Generate and batch the data of every ticker for one date split.

        Parameters
        ----------
        targetL
            Not used by this method; kept for call compatibility.
        kind : {'train', 'test'}
            Selects ``self.train_dates`` or ``self.test_dates``.
        tickers : iterable, optional
            Tickers to process. Defaults to every ticker found in ``self.df``.

        Returns
        -------
        tuple
            ``(ds, xs, fs, cs, dL, tL, idsL, dfL)``: four ``TsDS`` objects
            (sequence samples, last-row data features, flat features, and the
            last two concatenated along the final dim), followed by the batched
            dates, tickers and indices, and the unbatched per-sample frames.

        Raises
        ------
        ValueError
            If ``kind`` is neither 'train' nor 'test'.
        """
        # Validate up front; an unknown kind used to leave `dates` unbound and
        # only fail inside the ticker loop.
        if kind=='train': dates=self.train_dates
        elif kind=='test': dates=self.test_dates
        else: raise ValueError("kind must be 'train' or 'test', got %r" % (kind,))
        # Identity test: `tickers` may be an ndarray, for which `==None` is element-wise.
        if tickers is None: tickers=self.tickers
        window=list(range(self.start_idx,self.winlen))
        sL,yL,xL,fL,cL,dL,tL,idsL,dfL=[],[],[],[],[],[],[],[],[]
        for ticker in tqdm(tickers):
            s,y,x,f,d,t,ids,dfs=generate_data(self.df,dates,self.config,
                                              window,ticker=ticker)
            if len(s)==0: continue
            c=[torch.cat((xe,fe),dim=-1) for xe,fe in zip(x,f)]
            if self.padding is not None:
                s,y=trunc_pad_batch(s,y,length=self.padding,batch_size=self.batch_size)
            x=batch(x,batch_size=self.batch_size)
            f=batch(f,batch_size=self.batch_size)
            c=batch(c,batch_size=self.batch_size)
            d=batch(d,batch_size=self.batch_size,tensors=False)
            t=batch(t,batch_size=self.batch_size,tensors=False)
            ids=batch(ids,batch_size=self.batch_size,tensors=False)
            sL+=s
            # When y is still a tensor (no padding) this appends it row by row.
            yL+=y
            xL+=x
            fL+=f
            cL+=c
            dL+=d
            tL+=t
            idsL+=ids
            dfL+=dfs
        ds=TsDS(sL,yL)
        xs=TsDS(xL,yL)
        fs=TsDS(fL,yL)
        cs=TsDS(cL,yL)
        return ds,xs,fs,cs,dL,tL,idsL,dfL

In [17]:
def accuracy_var(Net,X_test,y_test,verbose=True,return_probs=False,batch_size=32):
    """Classification accuracy of ``Net`` over paired input and label batches.

    Parameters
    ----------
    Net : callable with ``.eval()``
        Model; called as ``Net(x)`` and expected to return one score per class
        along dim 1. It is left in eval mode.
    X_test, y_test : iterables
        Paired batches. ``x.shape[0]`` is counted as the number of samples in
        the batch; ``y`` holds the true class indices.
    verbose : bool
        Print the correct and total counts.
    return_probs : bool
        Also return the predictions and the softmax probabilities.
    batch_size : int
        Not used; kept for call compatibility.

    Returns
    -------
    float or tuple
        The accuracy, or ``(accuracy, predictedL, predL)`` when ``return_probs``
        is set. ``predictedL`` is a list of 0-d index tensors and ``predL`` a
        list of per-sample probability lists. The accuracy is NaN when no
        samples were seen.
    """
    Net.eval()
    correct=0
    m=0
    softmax=torch.nn.Softmax(dim=-1)
    predL=[]
    predictedL=[]
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for x,y in zip(X_test,y_test):
            m+=x.shape[0]
            y_pred=Net(x)
            predicted=torch.max(y_pred,1)[1]
            predictedL.extend(predicted)
            predL.extend(softmax(y_pred).tolist())
            correct+=(predicted==y).int().sum().item()
    if verbose: print(correct,m)
    # Accuracy is undefined without samples; report NaN instead of dividing by zero.
    accuracy=correct/m if m>0 else float('nan')
    if return_probs:
        print(accuracy)
        return accuracy,predictedL,predL
    return accuracy

In [18]:
def trunc_pad_batch(samples,labels,length=40,batch_size=32):
    """Bring every sample to ``length`` steps along dim 1, then group into batches.

    Parameters
    ----------
    samples : list of torch.Tensor
        3-D tensors whose dim 1 is the step axis.
    labels : iterable of torch.Tensor
        One label per sample; each gets a leading axis of size 1.
    length : int
        Target size of dim 1. Shorter samples are zero-padded at the front,
        longer ones keep only their last ``length`` steps.
    batch_size : int
        When greater than 1, consecutive samples and labels are concatenated
        along dim 0 in groups of this size (the last group may be smaller).

    Returns
    -------
    (list of torch.Tensor, list of torch.Tensor)
        The processed samples and labels.
    """
    fixed=[]
    for s in samples:
        steps=s.shape[1]
        if steps<length:
            # new_zeros matches the sample's dtype, device and leading dim, so
            # the concatenation never mixes types or fails for multi-row input.
            pad=s.new_zeros((s.shape[0],length-steps,s.shape[2]))
            s=torch.cat((pad,s),dim=1)
        elif steps>length:
            s=s[:,-length:,:]
        fixed.append(s)
    labels=[l.unsqueeze(0) for l in labels]
    if batch_size>1:
        starts=range(0,len(fixed),batch_size)
        labels=[torch.cat(labels[i:i+batch_size],dim=0) for i in starts]
        fixed=[torch.cat(fixed[i:i+batch_size],dim=0) for i in starts]
    return fixed,labels

In [19]:
def batch(samples,batch_size=32,tensors=True):
    """Group ``samples`` into consecutive chunks of ``batch_size`` items.

    With ``batch_size`` of 1 or less the input is returned unchanged. Otherwise
    the result is a list of chunks: each chunk is concatenated along dim 0 when
    ``tensors`` is true, and left as a plain sub-list when it is false.
    """
    if not batch_size>1:
        return samples
    total=len(samples)
    chunks=[]
    for start in range(0,total,batch_size):
        chunks.append(samples[start:start+batch_size])
    if not tensors:
        return chunks
    return [torch.cat(chunk,dim=0) for chunk in chunks if len(chunk)>0]

In [21]:
def set_label(dsL,lno,long=True,flatten=False):
    """Assign the same ``flatten``, ``lno`` and ``long`` attributes to every object in ``dsL``."""
    settings=(('flatten',flatten),('lno',lno),('long',long))
    for dataset in dsL:
        for attr,value in settings:
            setattr(dataset,attr,value)