In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# from matplotlib import pyplot as plt
import yfinance as yf
from pandas.tseries.offsets import BDay
import pickle

In [None]:
import import_ipynb
from featfuncs import add_ta,norm_add_ta

In [None]:
def check_eod(dtstr,eod_hour=15,eod_minute=28):
    """Return True when a timestamp falls on the end-of-day bar.

    Parameters
    ----------
    dtstr : anything accepted by ``pd.to_datetime`` as a single timestamp
        (for example the string ``'2021-11-17 15:28:00'``).
    eod_hour, eod_minute : int
        Hour and minute of the bar treated as end of day (defaults 15:28).

    Returns
    -------
    bool
    """
    dt=pd.to_datetime(dtstr)
    # ``hour`` and ``minute`` are integers; comparing them with the strings
    # '15' / '28' can never be true, so compare against ints.
    return bool(dt.hour==eod_hour and dt.minute==eod_minute)

In [None]:
class LabelFeed():
    """Label each index of a price feed by the fate of a position opened there.

    A position is opened at every index of ``x`` at that index's price. On each
    later step the position is closed with label 2 when the price rises above
    ``entry*(1+t)`` or with label -2 when it falls below ``entry*(1-l)``.
    Positions still open after the last step are labelled 1 when the final
    price divided by the entry price is > 1, and -1 when it is <= 1.

    Attributes
    ----------
    x : indexable feed exposing ``shape[0]`` (e.g. a 1-D numpy array)
    e : stored as given; not read by any method of this class
    t, l : target and loss fractions
    posL : open positions as ``(index, entry_price)`` tuples
    labels : dict mapping label (1, -1, 2, -2) to a list of positions
    records : dict mapping 2 / -2 to ``(index, entry_price, exit_index, exit_price)``
    """
    def __init__(self,x,e=None,t=.02,l=.01):
        self.x,self.e=x,e
        self.t,self.l=t,l
        self.posL=[]
        self.labels={1:[],-1:[],2:[],-2:[]}
        self.records={2:[],-2:[]}

    def add(self,i,pos):
        """Open a position at index ``i`` with entry price ``pos``."""
        self.posL.append((i,pos))

    def check(self,x,pos):
        """Return 2 if price ``x`` is above the target, -2 if below the stop, else 0."""
        target_level=pos+self.t*pos
        if x>target_level:
            return 2
        stop_level=pos-self.l*pos
        if x<stop_level:
            return -2
        return 0

    def exits(self,x,k):
        """Close every open position that hits its target or stop at price ``x`` (step ``k``)."""
        closed={2:[],-2:[]}
        still_open=[]
        # Single pass: each open position lands in exactly one bucket.
        for entry in self.posL:
            outcome=self.check(x,entry[1])
            if outcome==2 or outcome==-2:
                closed[outcome].append(entry)
            else:
                still_open.append(entry)
        self.posL=still_open
        for outcome in (2,-2):
            hits=closed[outcome]
            exit_rows=[(entry[0],entry[1],k,self.x[k]) for entry in hits]
            self.labels[outcome].extend(hits)
            self.records[outcome].extend(exit_rows)

    def run(self):
        """Walk the whole feed, then label the positions that never exited."""
        feed=self.x
        for step in range(feed.shape[0]):
            price=feed[step]
            # Exits are evaluated before the new position at this step is opened.
            self.exits(price,step)
            self.add(step,price)
        losers,winners=[],[]
        for entry in self.posL:
            ratio=self.x[-1]/entry[1]
            if ratio<=1:
                losers.append(entry)
            elif ratio>1:
                winners.append(entry)
        self.labels[-1]=losers
        self.labels[1]=winners

In [None]:
class TopKFeed():
    """For each index of a day's feed, collect the K highest / lowest later values.

    The results are intended as regression targets (e.g. multi-task regression
    on the top-k values) that can inform both entries and exits.

    Algorithm, per day:
      1. Build ``(pos, value)`` pairs for the whole day and sort them by value.
      2. For each index ``i`` in order, drop the pair belonging to ``i`` so that
         only pairs with ``pos > i`` remain, then keep the first ``K`` of them.

    Attributes
    ----------
    K : number of extreme values kept per index
    T : one list per processed day of ``(i, [(pos, value), ...])`` with the
        highest later values first
    B : same layout as ``T`` with the lowest later values first

    Entries near the end of a day hold fewer than ``K`` pairs (the last one is
    empty) because fewer than ``K`` later values exist.
    """
    def __init__(self,K=3):
        self.K=K
        self.T=[]
        self.B=[]

    @staticmethod
    def make_days(x,e):
        """Split feed ``x`` into per-day slices using end-of-day flags ``e``.

        Each slice ends at (and includes) an index where ``e`` equals 1.
        Anything after the last flagged index is not returned.

        Declared static so it can be called both as ``TopKFeed.make_days(x, e)``
        and on an instance; without the decorator an instance call passed the
        instance as ``x`` and failed with a TypeError.
        """
        ed=[i for i,v in enumerate(e) if v==1]
        LL=[]
        bd=0
        for d in ed:
            LL+=[x[bd:d+1]]
            bd=d+1
        return LL

    def _rank_day(self,L,descending):
        """Return ``[(i, first K later (pos, value) pairs)]`` for one day, ordered by value."""
        ranked=sorted(enumerate(L),key=lambda pv:pv[1],reverse=descending)
        out=[]
        for i in range(len(L)):
            # Dropping index i's own pair leaves only pairs with pos > i,
            # because earlier indices were dropped on previous iterations.
            own=(i,L[i])
            if own in ranked: ranked.remove(own)
            out+=[(i,ranked[0:min(len(ranked),self.K)])]
        return out

    def topK_day(self,L):
        """Append to ``self.T`` the K highest later values for every index of day ``L``."""
        self.T+=[self._rank_day(L,True)]

    def botK_day(self,L):
        """Append to ``self.B`` the K lowest later values for every index of day ``L``."""
        self.B+=[self._rank_day(L,False)]

In [None]:
def _padded_rank_values(ranked_day,n_rows,k=3):
    """Stack the values of every entry holding exactly ``k`` pairs into an (n_rows, k) zero-padded array.

    ``ranked_day`` is one day's output of ``TopKFeed`` (``[(i, [(pos, value), ...])]``).
    Returns None when no entry holds ``k`` pairs.
    """
    vals=np.array([[pv[1] for pv in ranked] for _,ranked in ranked_day if len(ranked)==k])
    if vals.shape[0]==0:
        return None
    pad=np.zeros((n_rows,k))
    # Rows are filled from the top using this array's own row count; the
    # remaining rows (indices with fewer than k later values) stay zero.
    pad[0:vals.shape[0],0:k]=vals
    return pad


def compute_labels(df,tickers,
                   riskL=[(0.02,0.01),(0.01,0.005),(0.01,0.02),(0.005,0.01)],
                   drop_dates=[],fromFeed=False,min_bars=200):
    """Add top/bottom-k targets and risk labels to every (ticker, date) slice of ``df``.

    Parameters
    ----------
    df : DataFrame with at least 'Date', 'ticker' and 'Close' columns; 'Open',
        'High' and 'Low' are also read when ``fromFeed`` is False.
    tickers : iterable of ticker values to process.
    riskL : list of ``(target, loss)`` fractions; each yields one label column
        named ``str((target, loss))``.
    drop_dates : dates to exclude; entries not present in ``df['Date']`` are ignored.
    fromFeed : when False, the normalised OHLC columns are added and
        ``norm_add_ta(dft, drop_ta=False)`` is applied to each slice.
    min_bars : a slice is labelled only when it has more than this many rows.

    Returns
    -------
    DataFrame
        Row-wise concatenation of all slices. Slices with ``min_bars`` rows or
        fewer are included without the added columns.
        NOTE(review): confirm that keeping these unlabelled slices is intended.

    Label encoding per row (position opened at that row, see ``LabelFeed``):
    0 = stopped out (label -2), 1 = held to the end with final/entry <= 1,
    2 = held to the end with final/entry > 1, 3 = target hit (label 2).
    """
    datesL=list(df['Date'].unique())
    for d in drop_dates:
        # Dropping a date that is not in the data is a no-op rather than an error.
        if d in datesL:
            datesL.remove(d)
    dates=np.array(datesL)
    print(dates)
    # LabelFeed label -> integer class written to the risk column.
    label_codes=((-2,0),(-1,1),(1,2),(2,3))
    dftL=[]
    for t in tqdm(tickers):
        for d in tqdm(dates):
            # Work on a copy: columns are added below and must not be written
            # through a slice of the caller's frame.
            dft=df.loc[(df['ticker']==t)&(df['Date']==d)].copy()
            r=dft['Close'].values
            if len(r)>min_bars:
                rd=r/r[0]  # closes normalised by the first close of the day
                tK=TopKFeed()
                tK.topK_day(rd)
                tK.botK_day(rd)
                top_pad=_padded_rank_values(tK.T[0],dft.shape[0])
                if top_pad is not None:
                    dft[['top1','top2','top3']]=top_pad
                bot_pad=_padded_rank_values(tK.B[0],dft.shape[0])
                if bot_pad is not None:
                    dft[['bot1','bot2','bot3']]=bot_pad
                if not fromFeed:
                    dft[['Open_n','High_n','Low_n','Close_n']]=dft[['Open','High','Low','Close']]/r[0]
                    # presumably adds normalised technical-analysis features; see featfuncs
                    dft=norm_add_ta(dft,drop_ta=False)
                n_rows=dft.shape[0]
                for risk in riskL:
                    lD=LabelFeed(rd,t=risk[0],l=risk[1])
                    lD.run()
                    # Map entry index -> class code with O(1) lookups.
                    code_at={}
                    for label,code in label_codes:
                        for entry in lD.labels[label]:
                            code_at[entry[0]]=code_at.get(entry[0],0)+code
                    dft[str(risk)]=[code_at.get(i,0) for i in range(n_rows)]
            dftL+=[dft]
    dft_all=pd.concat(dftL,axis=0)
    return dft_all