In [None]:
import numpy as np
import pandas as pd
import tables as tb
import zipfile
import os
import glob
import h5py

from bokeh.plotting import figure
from bokeh.io import show, output_notebook

# Configure Bokeh (bokeh.io.output_notebook) so that show() renders inside notebook cells.
output_notebook()

In [None]:
# One-time setup: unpack archive.zip into ./data and delete the zip afterwards.
# When ./data is already present nothing is extracted.
if os.path.exists('data'):
    print('Exists')
else:
    with zipfile.ZipFile('archive.zip', 'r') as archive:
        archive.extractall('data')
    os.remove('archive.zip')

In [None]:
class NASDAQ_Data():
    """Access layer for the extracted archive under ``data/``.

    Reads per-ticker daily CSV files from ``data/etfs`` and ``data/stocks``,
    derives technical indicators from them, and can write a class-balanced
    dataset of indicator windows to an HDF5 file.
    """

    # Indicator columns stored in the HDF5 ``Features`` array, in this order.
    FEATURE_COLUMNS = ['RSI14', 'Log_ret', 'MFI14', 'ATR14', 'EMV14']
    HISTORY_DAYS = 200  # rows of indicators stored per sample
    HORIZON_DAYS = 10   # trailing rows of each window used to derive the label
    WINDOW_STRIDE = 5   # rows between the starts of consecutive windows

    def __init__(self, time_range=('2010-1-1', '2020-1-1')):
        """Index the available CSV files and remember the date range of interest.

        Args:
            time_range: two ``%Y-%m-%d`` date strings ``(start, end)``. The value
                is copied into a fresh list so instances never share (or alias)
                a mutable default.
        """
        self.etf_files = glob.glob('data/etfs/*.csv')
        self.stock_files = glob.glob('data/stocks/*.csv')
        self.time_range = list(time_range)

    def get_meta(self):
        """Return the contents of ``data/symbols_valid_meta.csv`` as a DataFrame."""
        return pd.read_csv('data/symbols_valid_meta.csv')

    # ------------------------------------------------------------------
    # Indicator helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _rsi(prices, period=14):
        """Relative Strength Index from exponentially weighted average gains/losses."""
        delta = prices.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.ewm(com=period - 1, min_periods=period).mean()
        avg_loss = loss.ewm(com=period - 1, min_periods=period).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    @staticmethod
    def _log_returns(prices):
        """Day-over-day log return; NaN wherever either price is not positive.

        Non-positive prices are masked to NaN *before* taking the log. Passing
        ``where=`` to ``np.log`` without ``out=`` leaves the masked positions
        uninitialised, which would leak arbitrary values into the feature.
        """
        log_prices = np.log(prices.where(prices > 0))
        return log_prices - log_prices.shift(1)

    @staticmethod
    def _mfi(high, low, close, volume, n=14):
        """Money Flow Index over a rolling window of ``n`` rows, as a numpy array."""
        typical_price = (high + low + close) / 3
        money_flow = typical_price * volume
        # Rows whose typical price did not rise (the first row included, since
        # it has no predecessor) are counted as negative flow.
        mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
        signed_mf = money_flow * mf_sign

        positive_mf = np.where(signed_mf > 0, signed_mf, 0)
        negative_mf = np.where(signed_mf < 0, -signed_mf, 0)

        mf_gain = pd.Series(positive_mf).rolling(n, min_periods=1).sum()
        mf_loss = pd.Series(negative_mf).rolling(n, min_periods=1).sum()

        return (100 - 100 / (1 + mf_gain / mf_loss)).to_numpy()

    @staticmethod
    def _atr(high, low, close, n=14):
        """Average True Range: rolling mean of the true range over ``n`` rows.

        The true range compares today's high/low with the *previous* close.
        Comparing with the same day's close can never exceed ``high - low``,
        so gaps between sessions would be ignored.
        """
        prev_close = close.shift(1)
        ranges = pd.concat(
            [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
            axis=1,
        )
        # max() skips the NaNs the shift introduces on the first row, so the
        # first true range falls back to high - low.
        true_range = ranges.max(axis=1)
        return true_range.rolling(n).mean().to_numpy()

    @staticmethod
    def _emv(data, ndays):
        """Ease of Movement, averaged over ``ndays`` rows (Series named 'EMV')."""
        midpoint_move = ((data['High'] + data['Low']) / 2) - ((data['High'].shift(1) + data['Low'].shift(1)) / 2)
        box_ratio = (data['Volume'] / 100000000) / (data['High'] - data['Low'])
        emv = midpoint_move / box_ratio
        return pd.Series(emv.rolling(ndays).mean(), name='EMV')

    # ------------------------------------------------------------------
    # Windowing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _rolling_window(arr, w_size, spacing):
        """Return every full ``w_size``-row slice of ``arr``, ``spacing`` rows apart.

        The upper bound is ``len - w_size + 1`` so the window that ends exactly
        on the last row is included.
        """
        return [arr[i: i + w_size] for i in range(0, arr.shape[0] - w_size + 1, spacing)]

    @staticmethod
    def _label_index(last_close, future_mean, threshold):
        """Map a price move to a class index: 0 = down, 1 = flat, 2 = up."""
        if future_mean >= last_close + (threshold * last_close):
            return 2
        if future_mean <= last_close - (threshold * last_close):
            return 0
        return 1

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_ticker(self, ticker):
        """Load one ticker's CSV, add the indicator columns, and clip to ``time_range``.

        ETFs are looked up before stocks. The 'Adj Close' column is dropped and
        'Date' becomes the index.

        Returns:
            The rows between ``time_range[0]`` and ``time_range[1]``, or an
            empty DataFrame when the ticker has no CSV or has no rows in the
            30 days leading up to ``time_range[0]``.
        """
        for folder, known_files in (('etfs', self.etf_files), ('stocks', self.stock_files)):
            csv_path = f"data/{folder}/{ticker}.csv"
            if csv_path in known_files:
                break
        else:
            print(f'{ticker} not found')
            return pd.DataFrame()

        df = pd.read_csv(csv_path).drop(columns=['Adj Close'])
        df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
        df = df.set_index('Date')

        # Introduce normalized features (appended in FEATURE_COLUMNS order).
        df["RSI14"] = self._rsi(df.Close, period=14)
        df["Log_ret"] = self._log_returns(df.Close)
        df["MFI14"] = self._mfi(df.High, df.Low, df.Close, df.Volume, n=14)
        df["ATR14"] = self._atr(df.High, df.Low, df.Close, n=14)
        df["EMV14"] = self._emv(df, ndays=14)

        start = pd.to_datetime(self.time_range[0], format="%Y-%m-%d")
        end = pd.to_datetime(self.time_range[1], format="%Y-%m-%d")

        # Require data in the 30 days up to the start of the range.
        if df.loc[start - pd.Timedelta(30, "d"): start].empty:
            print(f"{ticker} : Doesn't exist before {self.time_range[0]}")
            return pd.DataFrame()
        return df.loc[start:end]

    def create_dataset(self, filename, threshold=0.05):
        """Write class-balanced indicator windows for every symbol to an HDF5 file.

        Each sample is a window of ``HISTORY_DAYS + HORIZON_DAYS`` rows. The
        first ``HISTORY_DAYS`` rows of indicators become the features; the label
        compares the mean close of the last ``HORIZON_DAYS`` rows with the close
        on the final history row: ``[1, 0, 0]`` down, ``[0, 1, 0]`` flat,
        ``[0, 0, 1]`` up. Per symbol, every class is truncated to the size of
        its smallest class; symbols with an empty class are skipped.

        Args:
            filename: path of the HDF5 file to create (overwritten if present).
            threshold: relative move (fraction of the last close) that counts
                as up or down.
        """
        window = self.HISTORY_DAYS + self.HORIZON_DAYS
        # Column 0 is the close used for labelling; the rest are the features.
        columns = ['Close'] + self.FEATURE_COLUMNS
        one_hot = np.eye(3, dtype=np.int8)

        symbols = self.get_meta().Symbol
        totals = [0, 0, 0]
        invalid_symbols = []

        # Both context managers guarantee cleanup on error: the numpy error
        # state is restored and the HDF5 file is closed.
        with np.errstate(divide='ignore'), tb.open_file(filename, mode='w') as fileh:
            hdf5_features = fileh.create_earray(
                fileh.root,
                'Features',
                tb.Float64Atom(shape=()),
                (0, self.HISTORY_DAYS, len(self.FEATURE_COLUMNS)),
                title="Features",
            )
            hdf5_labels = fileh.create_earray(
                fileh.root,
                'Labels',
                tb.Int8Atom(shape=()),
                (0, 3),
                title="Labels",
            )

            for processed, symbol in enumerate(symbols, start=1):
                print(f'{processed}/{len(symbols)} : {symbol}')

                ticker = self.get_ticker(symbol)
                if ticker.empty:
                    invalid_symbols.append(symbol)
                    continue

                # Bucket each window's feature matrix by its class index.
                by_class = [[], [], []]
                samples = self._rolling_window(ticker[columns].to_numpy(), window, self.WINDOW_STRIDE)
                for sample in samples:
                    history = sample[:self.HISTORY_DAYS]
                    future_close = sample[-self.HORIZON_DAYS:, 0]
                    label = self._label_index(history[-1, 0], np.mean(future_close), threshold)
                    by_class[label].append(history[:, 1:])

                # Balance: keep the first `keep` windows of every class.
                keep = min(len(group) for group in by_class)
                if keep == 0:
                    # Scrap the entry
                    invalid_symbols.append(symbol)
                    print(f"{symbol} : Empty Class Present")
                    continue

                balanced_features = [feat for group in by_class for feat in group[:keep]]
                hdf5_features.append(np.array(balanced_features))
                # Rows come out as `keep` x down, `keep` x flat, `keep` x up,
                # matching the order of balanced_features.
                hdf5_labels.append(np.repeat(one_hot, keep, axis=0))
                for label in range(3):
                    totals[label] += keep

        print(f'Class Totals : {totals}')
        print("Didn't Pricess - ")
        print(invalid_symbols)


In [None]:
# Build the balanced indicator dataset for every symbol listed in the
# metadata file and write it to data.h5 (default 2010-2020 date range).
data = NASDAQ_Data()
data.create_dataset('data.h5')

In [None]:
filename = "data.h5"

# Sanity check: list the root-level object names (groups or datasets)
# stored in the file that was just written.
with h5py.File(filename, "r") as f:
    root_keys = f.keys()
    print("Keys: %s" % root_keys)