In [3]:
import numpy as np
import pandas as pd
import zipfile
import os
import glob

from bokeh.plotting import figure
from bokeh.io import show, output_notebook

output_notebook()

In [4]:
if not os.path.exists('data'):
    with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
        zip_ref.extractall('data')
    os.remove('archive.zip')
else:
    print('Exists')

Exists


In [28]:
class NASDAQ_Data():
    def __init__(self, time_range = ['2010-1-1', '2020-1-1']):
        self.etf_files = glob.glob('data/etfs/*.csv')
        self.stock_files = glob.glob('data/stocks/*.csv')
        self.time_range = time_range

    def get_meta(self):
        return pd.read_csv('data/symbols_valid_meta.csv')
    
    def get_ticker(self, ticker):

        def calculate_rsi(prices, period=14):
            delta = prices.diff()
            gain = delta.where(delta > 0, 0)
            loss = -delta.where(delta < 0, 0)

            avg_gain = gain.ewm(com=period - 1, min_periods=period).mean()
            avg_loss = loss.ewm(com=period - 1, min_periods=period).mean()

            rs = avg_gain / avg_loss
            rsi = 100 - (100 / (1 + rs))
            return rsi
        
        def calculate_logReturns(prices):
            return np.log(prices) - np.log(prices.shift(1))
        
        def mfi(high, low, close, volume, n=14):
            typical_price = (high + low + close) / 3
            money_flow = typical_price * volume
            mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
            signed_mf = money_flow * mf_sign

            # Calculate gain and loss using vectorized operations
            positive_mf = np.where(signed_mf > 0, signed_mf, 0)
            negative_mf = np.where(signed_mf < 0, -signed_mf, 0)

            mf_avg_gain = pd.Series(positive_mf).rolling(n, min_periods=1).sum()
            mf_avg_loss = pd.Series(negative_mf).rolling(n, min_periods=1).sum()

            return (100 - 100 / (1 + mf_avg_gain / mf_avg_loss)).to_numpy()
        
        def atr(high, low, close, n=14):
            tr = np.amax(np.vstack(((high - low).to_numpy(), (abs(high - close)).to_numpy(), (abs(low - close)).to_numpy())).T, axis=1)
            return pd.Series(tr).rolling(n).mean().to_numpy()

        def EMV(data, ndays): 
            dm = ((data['High'] + data['Low'])/2) - ((data['High'].shift(1) + data['Low'].shift(1))/2)
            br = (data['Volume'] / 100000000) / ((data['High'] - data['Low']))
            EMV = dm / br 
            EMV_MA = pd.Series(EMV.rolling(ndays).mean(), name = 'EMV') 
            return EMV_MA
        
        if f"data/etfs/{ticker}.csv" in self.etf_files:
            df = pd.read_csv(f"data/etfs/{ticker}.csv").drop(columns = ['Adj Close'])
            df['Date'] = pd.to_datetime(df['Date'], format = "%Y-%m-%d")
            df = df.set_index('Date')
        elif f"data/stocks/{ticker}.csv" in self.stock_files:
            df = pd.read_csv(f"data/stocks/{ticker}.csv").drop(columns = ['Adj Close'])
            df['Date'] = pd.to_datetime(df['Date'], format = "%Y-%m-%d")
            df = df.set_index('Date')
        else:
            print(f'{ticker} not found')
            return None
        
        df.insert(len(df.columns), "RSI14", calculate_rsi(df.Close, period = 14), True)
        df.insert(len(df.columns), "Log_ret", calculate_logReturns(df.Close), True)
        df.insert(len(df.columns), "MFI14", mfi(df.High, df.Low, df.Close, df.Volume, n = 14), True)
        df.insert(len(df.columns), "ATR14", atr(df.High, df.Low, df.Close, n = 14), True)
        df.insert(len(df.columns), "EMV14", EMV(df, ndays = 14), True)

        if (
            len(df[pd.to_datetime(self.time_range[0], format = "%Y-%m-%d") - pd.Timedelta(30, "d") : pd.to_datetime(self.time_range[0], format = "%Y-%m-%d")]) != 0
            ):
            return df[
                pd.to_datetime(self.time_range[0], format = "%Y-%m-%d") : pd.to_datetime(self.time_range[1], format = "%Y-%m-%d")
            ]
        else:
            return None

    def create_dataset(self):
        threshold = 0.05
        
        def rolling_window(arr, w_size, spacing):
            return [arr[i: i+w_size] for i in range(0, arr.shape[0] - w_size, spacing)]
        
        #symbols = self.get_meta().Symbol
        symbols = ['SPY']
        features = [] # shape = (# samples, 200 days, n features)
        target = [] # shape  = (# samples, label) label = [1, 0, 0]; [0, 1, 0]; [0, 0, 1]
        for symbol in symbols:
            data = rolling_window(self.get_ticker(symbol).to_numpy(), 210, 1) # shape = (# samples, 210 days, 5 features)
            for sample in data:
                future_10 = sample[-10:]
                prior_200 = sample[:200]
                last_day = prior_200[-1][3]
                if np.average(future_10[:, 3]) >= last_day + (threshold * last_day):
                    target.append([0, 0, 1])
                elif np.average(future_10[:, 3]) <= last_day - (threshold * last_day):
                    target.append([1, 0, 0])
                else:
                    target.append([0, 1, 0])

                features.append(prior_200[:, [5, 6, 7, 8, 9]])

        print(target)

In [29]:
data = NASDAQ_Data()
# aapl = data.get_ticker('AAPL')
# nvda = data.get_ticker('NVDA')
spy = data.create_dataset()
spy

[[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0]

In [6]:
data.get_meta()

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAAU,Perth Mint Physical Gold ETF,P,,Y,100.0,N,,AAAU,AAAU,N
3,Y,AACG,ATA Creativity Global - American Depositary Sh...,Q,G,N,100.0,N,N,,AACG,N
4,Y,AADR,AdvisorShares Dorsey Wright ADR ETF,P,,Y,100.0,N,,AADR,AADR,N
...,...,...,...,...,...,...,...,...,...,...,...,...
8044,Y,ZUO,"Zuora, Inc. Class A Common Stock",N,,N,100.0,N,,ZUO,ZUO,N
8045,Y,ZVO,Zovio Inc. - Common Stock,Q,Q,N,100.0,N,N,,ZVO,N
8046,Y,ZYME,Zymeworks Inc. Common Shares,N,,N,100.0,N,,ZYME,ZYME,N
8047,Y,ZYNE,"Zynerba Pharmaceuticals, Inc. - Common Stock",Q,G,N,100.0,N,N,,ZYNE,N


In [5]:
p = figure(title="SPY vs AAPL vs NVDA", x_axis_label='Time (days)', y_axis_label='Close Price', x_axis_type='datetime')
p.line(spy.index, spy.Close, legend_label="SPY", line_width=2, color="blue")
p.line(aapl.index, aapl.Close, legend_label="AAPL", line_width=2, color="red")
p.line(nvda.index, nvda.Close, legend_label="NVDA", line_width=2, color="green")
p.legend.location = "top_left"

show(p)