In [9]:
import pandas as pd
import numpy as np
from numba import jit
from v2realbot.config import DATA_DIR
from appdirs import user_data_dir
from v2realbot.enums.enums import AggType
from datetime import datetime
from v2realbot.loader.aggregator_vectorized import generate_time_bars_nb, aggregate_trades
from v2realbot.loader.fetcher import load_data, fetch_daily_stock_trades, fetch_trades_parallel, prepare_trade_cache
import vectorbtpro as vbt
import pytz
from pathlib import Path
from dotenv import load_dotenv
import os

vbt.settings.set_theme("dark")
vbt.settings['plotting']['layout']['width'] = 1280
vbt.settings.plotting.auto_rangebreaks = True
# Set the option to display with pagination
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 10)  # Number of rows per page

def find_dotenv():
    """
    Searches for a .env file in the given directory or its parents and returns the path.

    Args:
        start_path: The directory to start searching from.

    Returns:
        Path to the .env file if found, otherwise None.
    """
    try:
        start_path = __file__
    except NameError:
        #print("Notebook probably")
        start_path = os.getcwd()  
        #print(start_path)       

    current_path = Path(start_path)
    for _ in range(6):  # Limit search depth to 5 levels
        dotenv_path = current_path / '.env'
        if dotenv_path.exists():
            return dotenv_path
        current_path = current_path.parent
    return None

ENV_FILE = find_dotenv()

#NALOADUJEME DOTENV ENV VARIABLES
if load_dotenv(ENV_FILE, verbose=True) is False:
    print(f"Error loading.env file {ENV_FILE}. Now depending on ENV VARIABLES set externally.")
else:
    print(f"Loaded env variables from file {ENV_FILE}")

ACCOUNT1_LIVE_API_KEY = os.environ.get('ACCOUNT1_LIVE_API_KEY')
ACCOUNT1_LIVE_SECRET_KEY = os.environ.get('ACCOUNT1_LIVE_SECRET_KEY')
DATA_DIR_NAME = os.environ.get('DATA_DIR_NAME', "ttools") #folder in datadir

DATA_DIR = user_data_dir(DATA_DIR_NAME, False)
TRADE_CACHE = Path(DATA_DIR)/"tradecache"
AGG_CACHE = Path(DATA_DIR)/"aggcache"
zoneNY = pytz.timezone('US/Eastern')


Loaded env variables from file /Users/davidbrazda/Documents/Development/python/.env


## ODSUD POKRACOVAT

- upravil jsem v2trading cache and aggcache dirs na path, otestit
- zvazit presunuti fetcheru a dalsich veci do ttools ??
- nechat tento ipynb jako dokumentaci a tool, jeste zbyva loaddotenv zde
- zdokumentovat vsechny zmeny, viz nize
- 


- otestit a zdokumentovat preparetradecache.py jako script
- otestit a zdokumentovat hlavni funkcni
    - get fetch data
    - a prepare trade data
- 
- also document the core principles of cache
- document all in the notebook here
- dobletest everything and deploy on the servers (delete data), ability to invoke preparers maybe by gui or just script, or to quickly see the available cache data (both caches) - maybe on gui
- nasadit zmeny na v2realbot a deployovat
 - a pohnout se dal k stratlab1 ke sve strategii a pokracovat v NI - portfolio analyzer pictures
- udelat hlavni PROJEKT A TASKY (napr. dat tam co jsme domluvili s MArtinem pro pristi schuzku)


**Trade cache**
- daily files keyed by symbol (BAC-2024-01-31.parquet)
- contain all trades for main and extended sessions of that day
- trade cache is stored every time day is requested

Tradedata can be prefetched by incoking `prepare_cache.py` script.
```bash
# Running without forcing remote fetch
python3 prepare_cache.py --symbols BAC AAPL --day_start 2024-10-14 --day_stop 2024-10-18 &

# Running with force_remote set to True
python3 prepare_cache.py --symbols BAC AAPL --day_start 2024-10-14 --day_stop 2024-10-18 --force_remote &
```

**Agg cache**
- contains aggregated data for specific range and conditions (minsize, ext vs main session etc...)
- reused only if the request of exactly the same attributtes are requested
- otherwise automatically agregated from trades 


In [15]:
#This is how to call LOAD function
symbol = ["BAC"]
#datetime in zoneNY 
day_start = datetime(2024, 5, 14, 9, 45, 0)
day_stop = datetime(2024, 10, 16, 15, 1, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)

#requested AGG
resolution = 1
agg_type = AggType.OHLCV
exclude_conditions = ['C','O','4','B','7','V','P','W','U','Z','F','9','M','6'] #None to defaults
minsize = 100
main_session_only = True
force_remote = True

ohlcv_df = load_data(symbol = symbol,
                     agg_type = agg_type,
                     resolution = resolution,
                     start_date = day_start,
                     end_date = day_stop,
                     #exclude_conditions = None,
                     minsize = 100,
                     main_session_only = False,
                     force_remote = False
                     )
bac_df = ohlcv_df["BAC"]

basic_data = vbt.Data.from_data(vbt.symbol_dict(ohlcv_df), tz_convert=zoneNY)
vbt.settings['plotting']['auto_rangebreaks'] = True
basic_data.ohlcv.plot()


Contains 108  market days


Processing market days: 100%|██████████| 108/108 [00:00<00:00, 33984.91it/s]


All 108 split files loaded in 14.326442003250122 seconds
filtrujeme 09:45:00 15:01:00
excluding conditions ['4', '6', '7', '9', 'B', 'C', 'F', 'M', 'O', 'P', 'U', 'V', 'W', 'Z']
minsize 100
local_df filtered
Saved to agg_cache /Users/davidbrazda/Library/Application Support/v2realbot/aggcache/BAC-AggType.OHLCV-1-2024-05-14T09-45-00-2024-10-16T15-01-00-4679BCFMOPUVWZ-100-False.parquet


In [2]:
#This becomes prepare trade cache function
symbols = ["BAC", "AAPL"]
#datetime in zoneNY 
day_start = datetime(2024, 10, 1, 9, 45, 0)
day_stop = datetime(2024, 10, 14, 15, 1, 0)
day_start = zoneNY.localize(day_start)
day_stop = zoneNY.localize(day_stop)
force_remote = False

prepare_trade_cache(symbols, day_start, day_stop, force_remote)

Started for BAC
Contains 10  market days


Processing market days: 100%|██████████| 10/10 [00:00<00:00, 3055.29it/s]

Finished for BAC
Started for AAPL





Contains 10  market days


Processing market days: 100%|██████████| 10/10 [00:00<00:00, 2787.10it/s]
Processing market days to fetch: 100%|██████████| 9/9 [00:00<00:00, 534.34it/s]


2024-10-01 00:00:00-04:00 2024-10-01 23:59:59.999999-04:00
Fetching from remote.
2024-10-02 00:00:00-04:00 2024-10-02 23:59:59.999999-04:00
2024-10-03 00:00:00-04:00 2024-10-03 23:59:59.999999-04:00
Fetching from remote.
Fetching from remote.
2024-10-04 00:00:00-04:00 2024-10-04 23:59:59.999999-04:00
2024-10-07 00:00:00-04:00 2024-10-07 23:59:59.999999-04:00
Fetching from remote.
2024-10-08 00:00:00-04:00 2024-10-08 23:59:59.999999-04:00
2024-10-09 00:00:00-04:00 2024-10-09 23:59:59.999999-04:00
2024-10-10 00:00:00-04:00 2024-10-10 23:59:59.999999-04:00
2024-10-11 00:00:00-04:00 2024-10-11 23:59:59.999999-04:00
Fetching from remote.


Fetching data:   0%|          | 0/9 [00:00<?, ?it/s]

Remote fetched completed. 2024-10-02 2024-10-03
Remote fetched completed. 2024-10-03 2024-10-04
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-02.parquet
Fetching from remote.
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-03.parquet
Fetching from remote.
Remote fetched completed. 2024-10-04 2024-10-05
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-04.parquet
Fetching from remote.
Remote fetched completed. 2024-10-07 2024-10-08
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-07.parquet
Fetching from remote.
Remote fetched completed. 2024-10-01 2024-10-02


Fetching data:  11%|█         | 1/9 [04:04<32:39, 244.99s/it]

Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-01.parquet
Remote fetched completed. 2024-10-08 2024-10-09
Remote fetched completed. 2024-10-10 2024-10-11
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-08.parquet


Fetching data:  67%|██████▋   | 6/9 [04:45<01:51, 37.27s/it] 

Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-10.parquet
Remote fetched completed. 2024-10-09 2024-10-10
Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-09.parquet


Fetching data:  78%|███████▊  | 7/9 [04:59<01:05, 32.52s/it]

Remote fetched completed. 2024-10-11 2024-10-12


Fetching data: 100%|██████████| 9/9 [05:09<00:00, 34.41s/it]

Saved to CACHE /Users/davidbrazda/Library/Application Support/v2realbot/tradecache/AAPL-2024-10-11.parquet
Finished for AAPL





NameError: name 'ohlcv_df' is not defined

In [None]:
import pickle
from v2realbot.config import ACCOUNT1_PAPER_API_KEY, ACCOUNT1_PAPER_SECRET_KEY, DATA_DIR
import gzip

file_path = f"{DATA_DIR}/tradecache/BAC-1709044200-1709067600.cache.gz"

with gzip.open(file_path, 'rb') as fp:
    tradesResponse = pickle.load(fp)

tradesResponse

In [14]:
def convert_dict_to_multiindex_df(tradesResponse):
    # Create a DataFrame for each key and add the key as part of the MultiIndex
    dfs = []
    for key, values in tradesResponse.items():
        df = pd.DataFrame(values)
        # Rename columns
        # Select and order columns explicitly
        #print(df)
        df = df[['t', 'x', 'p', 's', 'i', 'c','z']]
        df.rename(columns={'t': 'timestamp', 'c': 'conditions', 'p': 'price', 's': 'size', 'x': 'exchange', 'z':'tape', 'i':'id'}, inplace=True)
        df['symbol'] = key  # Add ticker as a column
        df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert 't' from string to datetime before setting it as an index
        df.set_index(['symbol', 'timestamp'], inplace=True)  # Set the multi-level index using both 'ticker' and 't'
        df = df.tz_convert(zoneNY, level='timestamp')
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame with MultiIndex
    final_df = pd.concat(dfs)

    return final_df

# Convert and print the DataFrame
df = convert_dict_to_multiindex_df(tradesResponse)
df


Unnamed: 0_level_0,Unnamed: 1_level_0,exchange,price,size,id,conditions,tape
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BAC,2024-02-27 09:30:00.002164736-05:00,P,33.680,638,52983525158993,"[ , F, T]",A
BAC,2024-02-27 09:30:00.128029184-05:00,P,33.690,7,52983525159224,"[ , I]",A
BAC,2024-02-27 09:30:00.128032256-05:00,P,33.690,7,52983525159225,"[ , Q]",A
BAC,2024-02-27 09:30:00.261718272-05:00,K,33.700,3,52983525302111,"[ , F, I]",A
BAC,2024-02-27 09:30:00.349298176-05:00,D,33.695,1,71675256256563,"[ , I]",A
BAC,...,...,...,...,...,...,...
BAC,2024-02-27 15:59:59.996081408-05:00,T,34.270,1,62880189999698,"[ , I]",A
BAC,2024-02-27 15:59:59.996084480-05:00,T,34.270,100,62880189999699,[ ],A
BAC,2024-02-27 15:59:59.997648384-05:00,N,34.270,400,52983576998465,[ ],A
BAC,2024-02-27 15:59:59.998087168-05:00,T,34.270,1,62880189999929,"[ , I]",A


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 169811 entries, (0, 'BAC') to (169810, 'BAC')
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype              
---  ------  --------------   -----              
 0   c       169811 non-null  object             
 1   i       169811 non-null  int64              
 2   p       169811 non-null  float64            
 3   s       169811 non-null  int64              
 4   t       169811 non-null  datetime64[ns, UTC]
 5   x       169811 non-null  object             
 6   z       169811 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(3)
memory usage: 9.9+ MB


In [4]:
ohlcv_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB


In [6]:
ohlcv_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46344 entries, 2024-03-01 09:30:00-05:00 to 2024-03-04 15:59:59-05:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    46344 non-null  float64
 1   high    46344 non-null  float64
 2   low     46344 non-null  float64
 3   close   46344 non-null  float64
 4   volume  46344 non-null  float64
 5   trades  46344 non-null  float64
dtypes: float64(6)
memory usage: 2.5 MB


In [3]:
ohlcv_df = aggregate_trades(symbol=symbol, trades_df=df, resolution=1000, type="dollar")

In [5]:
ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

Index(['2024-03-01 09', '2024-03-01 10', '2024-03-01 11', '2024-03-01 12',
       '2024-03-01 13', '2024-03-01 14', '2024-03-01 15', '2024-03-04 09',
       '2024-03-04 10', '2024-03-04 11', '2024-03-04 12', '2024-03-04 13',
       '2024-03-04 14', '2024-03-04 15'],
      dtype='object', name='time')

In [5]:
#ohlcv_df.groupby(ohlcv_df.index.date).size()
ohlcv_df.head(100)

Unnamed: 0_level_0,open,high,low,close,volume,trades
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-03-01 09:34:00.145446062-05:00,34.555,34.555,34.555,34.555,28.939372,1.0
2024-03-01 09:34:00.145447016-05:00,34.555,34.555,34.555,34.555,28.939372,1.0
2024-03-01 09:34:00.145447016-05:00,34.555,34.555,34.555,34.555,28.939372,1.0
2024-03-01 09:34:00.145447016-05:00,34.555,34.555,34.555,34.555,28.939372,1.0
2024-03-01 09:34:00.145447016-05:00,34.555,34.555,34.555,34.555,28.939372,1.0
...,...,...,...,...,...,...
2024-03-01 09:34:05.011623859-05:00,34.560,34.560,34.560,34.560,28.935185,1.0
2024-03-01 09:34:05.011623859-05:00,34.560,34.560,34.560,34.560,28.935185,1.0
2024-03-01 09:34:05.011623859-05:00,34.560,34.560,34.560,34.560,28.935185,1.0
2024-03-01 09:34:05.011623859-05:00,34.560,34.560,34.560,34.560,28.935185,1.0


In [6]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,exchange,price,size,id,conditions,tape
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BAC,2024-03-01 09:34:00.145446-05:00,D,34.5550,500.0,71675373899865,[ ],A
BAC,2024-03-01 09:34:00.864348-05:00,D,34.5563,157.0,71675373958977,[ ],A
BAC,2024-03-01 09:34:00.960608-05:00,D,34.5500,100.0,71675373961523,[ ],A
BAC,2024-03-01 09:34:01.584619-05:00,D,34.5550,100.0,71675373965623,[ ],A
BAC,2024-03-01 09:34:01.793712-05:00,D,34.5550,108.0,71675373966644,[ ],A
BAC,...,...,...,...,...,...,...
BAC,2024-03-04 15:54:59.940080-05:00,Y,35.1600,63.0,52983525230401,"[ , I]",A
BAC,2024-03-04 15:54:59.940107-05:00,Z,35.1550,65.0,52983526682176,"[ , I]",A
BAC,2024-03-04 15:54:59.940110-05:00,Z,35.1550,200.0,52983526682177,[ ],A
BAC,2024-03-04 15:54:59.940113-05:00,Z,35.1600,100.0,52983526682179,[ ],A


In [None]:
#access just BCA
df_filtered = df.loc["BAC"]

df_filtered.info()

In [None]:
df_filtered= df_filtered.reset_index()
ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()
ticks
timestamps = ticks[:, 0]


In [None]:
df_filtered= df_filtered.reset_index()
ticks = df_filtered[['timestamp', 'price', 'size']].to_numpy()

#timestamp to integer
# Extract the timestamps column (assuming it's the first column)
timestamps = ticks[:, 0]

# Convert the timestamps to Unix timestamps in seconds with microsecond precision
unix_timestamps_s = np.array([ts.timestamp() for ts in timestamps], dtype='float64')

# Replace the original timestamps in the NumPy array with the converted Unix timestamps
ticks[:, 0] = unix_timestamps_s

#ticks[:, 0] = pd.to_datetime(ticks[:, 0]).astype('int64') // 1_000_000_000  # Convert to Unix timestamp
ticks


In [None]:
ticks = ticks.astype(np.float64)
ticks

In [None]:

resolution = 1  # Example resolution of 60 seconds
ohlcv_bars = generate_time_bars_nb(ticks, resolution)

In [None]:
ohlcv_bars

In [None]:
# Convert the resulting array back to a DataFrame
columns = ['time', 'open', 'high', 'low', 'close', 'volume', 'trades']
ohlcv_df = pd.DataFrame(ohlcv_bars, columns=columns)
ohlcv_df['time'] = pd.to_datetime(ohlcv_df['time'], unit='s')
ohlcv_df.set_index('time', inplace=True)
ohlcv_df.index = ohlcv_df.index.tz_localize('UTC').tz_convert(zoneNY)
#ohlcv_df = ohlcv_df.loc["2024-03-1 15:50:00":"2024-03-28 13:40:00"]
#ohlcv_df.index.strftime('%Y-%m-%d %H').unique()

ohlcv_df

In [10]:
import pandas_market_calendars as mcal
import pandas as pd

# Define the NYSE calendar with extended hours
nyse_calendar = mcal.get_calendar('NYSE')
schedule = nyse_calendar.schedule(start_date='2023-01-01', end_date='2023-12-31', extra_sessions=True)

# Example: Get the schedule with pre- and post-market sessions
extended_hours = nyse_calendar.sessions_with_extra(start_date='2023-01-01', end_date='2023-12-31')

TypeError: MarketCalendar.schedule() got an unexpected keyword argument 'extra_sessions'