Below, I install my own implementation of Professor Boonstra's "memoize DataFrame to disk" feature. The source code can be found at [github.com/ethho/memoize](https://github.com/ethho/memoize).

In [7]:
!python3 -m pip install git+https://github.com/ethho/memoize.git

Collecting git+https://github.com/ethho/memoize.git
  Cloning https://github.com/ethho/memoize.git to /tmp/pip-req-build-_gytufy0
  Running command git clone --filter=blob:none --quiet https://github.com/ethho/memoize.git /tmp/pip-req-build-_gytufy0
  Resolved https://github.com/ethho/memoize.git to commit bef633bd22e4acde44cccb63399a176c6cef79b9
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25h

In [8]:
import json
import re
import os
from glob import glob
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import norm, probplot
import quandl
import functools
import plotly.express as px
import plotly.graph_objects as go
from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import Pool
from src.ubacktester import (
    BacktestEngine, StrategyBase, PositionBase, FeedBase,
    PlotlyPlotter, FeedID, PriceFeed, px_plot, ClockBase
)
from memoize.dataframe import memoize_df

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display floats with thousands separators and four decimal places.
pd.options.display.float_format = '{:,.4f}'.format

# 20230202_hw4_ho_ethan_12350006

@mpcs
@finm33550

Ethan Ho 2/2/2023

----

## Configuration & Helper Functions

The following cell contains helper functions and configuration options that I will use in this notebook.

In [9]:
def get_secrets(fp='./secrets.json'):
    """
    Load the JSON document stored at `fp` and return the parsed result.

    The file is expected to hold secret values such as API keys.
    """
    with open(fp, mode='r') as secrets_file:
        return json.load(secrets_file)

def get_quandl_api_key() -> str:
    """
    Look up the Quandl API key stored under the `NASTAQ_DATA_API_KEY`
    field of secrets.json.

    Fails an assertion when that field is missing or empty.
    """
    api_key = get_secrets().get('NASTAQ_DATA_API_KEY')
    assert api_key, f"NASTAQ_DATA_API_KEY field in secrets.json is empty or does not exist"
    return api_key

def strip_str_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip leading/trailing whitespace from string values in `df`.

    Motivated by values such as "AAPL       " (trailing whitespace) showing
    up in ticker columns.

    Only actual `str` elements are touched. Non-string values that live in an
    object-dtype column (numbers, None/NaN, ...) are left unchanged instead of
    being turned into NaN, and object columns holding no strings at all no
    longer raise.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same `df` object, for call chaining.
    """
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column):
            # Object columns may mix types; `.str.strip()` would map every
            # non-string element to NaN, so strip element-wise instead.
            df[col] = column.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        elif pd.api.types.is_string_dtype(column):
            # Dedicated string dtypes only hold strings (or missing values),
            # so the vectorised accessor is safe and keeps the dtype.
            df[col] = column.str.strip()
    return df

@memoize_df(cache_dir='/tmp/memoize')
def fetch_quandl_quotemedia_prices(
    start_date, end_date, ticker
) -> pd.DataFrame:
    """
    Query the Quandl `QUOTEMEDIA/PRICES` table for `ticker`, filtering the
    `date` column with `gte=start_date` and `lte=end_date`.

    The result has its `date` column parsed to datetimes and is sorted by
    date. The function is wrapped by `memoize_df` with
    `cache_dir='/tmp/memoize'`.
    """
    date_filter = {'gte': start_date, 'lte': end_date}
    prices = quandl.get_table(
        'QUOTEMEDIA/PRICES',
        date=date_filter,
        ticker=ticker,
        api_key=get_quandl_api_key(),
        paginate=True,
    )
    prices['date'] = pd.to_datetime(prices['date'])
    return prices.sort_values(by='date')

@memoize_df(cache_dir='/tmp/memoize')
def fetch_quandl_tbill_prices(
    start_date, end_date,
) -> pd.DataFrame:
    """
    Fetch the `USTREASURY/BILLRATES` dataset from Quandl.

    Parameters
    ----------
    start_date, end_date
        Bounds of the requested window, forwarded to `quandl.get`.

    Returns
    -------
    pd.DataFrame
        The dataset with its `Date` index moved into a `date` column
        (parsed to datetimes) and rows sorted by that column.

    The function is wrapped by `memoize_df` with `cache_dir='/tmp/memoize'`.
    """
    # No `ticker` argument here: this function has no such parameter, so
    # forwarding `ticker=ticker` raised NameError on every call.
    df = quandl.get(
        ['USTREASURY/BILLRATES'],
        returns="pandas",
        start_date=start_date,
        end_date=end_date,
        api_key=get_quandl_api_key(),
    )
    df = df.reset_index().rename(columns={'Date': 'date'})
    df['date'] = pd.to_datetime(df['date'])
    return df.sort_values(by='date')

def unique_index_keys(df, level=0) -> List[str]:
    """
    Return the distinct values found at index level `level` of `df`, as a
    list in order of first appearance.
    """
    level_values = df.index.get_level_values(level=level)
    distinct_values = level_values.unique()
    return distinct_values.tolist()

def risk_free_rate(**kw) -> float:
    """
    Estimate the risk-free rate R_f as the mean of the 13-week
    coupon-equivalent T-bill rate.

    All keyword arguments are forwarded to `fetch_quandl_tbill_prices`.
    """
    rate_column = 'USTREASURY/BILLRATES - 13 Wk Coupon Equiv'
    return fetch_quandl_tbill_prices(**kw)[rate_column].mean()

In [16]:
# Date window passed to the Quandl request below.
start_date = '2022-01-01'
end_date = '2022-12-01'
# Earlier experiment against a different table, left disabled:
# quandl.get_table(
#     'BCMGEX/ICM2021',
#     date={'gte': '2021-01-01', 'lte': '2021-02-01'},
#     api_key=get_quandl_api_key(),
# )

# Request the `OWF/CBT_C_PY2_J2022_IVM` dataset for the window above,
# asking for a pandas return type.
df = quandl.get(
    ['OWF/CBT_C_PY2_J2022_IVM'],
    returns="pandas",
    start_date=start_date,
    end_date=end_date,
    api_key=get_quandl_api_key(),
)

In [17]:
df

Unnamed: 0_level_0,OWF/CBT_C_PY2_J2022_IVM - Future,OWF/CBT_C_PY2_J2022_IVM - AtM,OWF/CBT_C_PY2_J2022_IVM - RR25,OWF/CBT_C_PY2_J2022_IVM - RR10,OWF/CBT_C_PY2_J2022_IVM - Fly25,OWF/CBT_C_PY2_J2022_IVM - Fly10,OWF/CBT_C_PY2_J2022_IVM - Beta1,OWF/CBT_C_PY2_J2022_IVM - Beta2,OWF/CBT_C_PY2_J2022_IVM - Beta3,OWF/CBT_C_PY2_J2022_IVM - Beta4,OWF/CBT_C_PY2_J2022_IVM - Beta5,OWF/CBT_C_PY2_J2022_IVM - Beta6,OWF/CBT_C_PY2_J2022_IVM - MinMoney,OWF/CBT_C_PY2_J2022_IVM - MaxMoney,OWF/CBT_C_PY2_J2022_IVM - DtE,OWF/CBT_C_PY2_J2022_IVM - DtT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-03-14,748.25,0.4425,0.0394,0.0945,0.0045,0.0131,0.1973,0.2381,8.1702,-10.5861,-181.325,456.7565,-0.2208,0.2387,25.24,60.0
2022-03-15,758.0,0.4231,0.0487,0.0889,0.001,-0.0004,0.3329,-0.4529,1.1326,-0.8217,-96.5535,478.0851,-0.185,0.2258,24.24,59.0
2022-03-16,730.0,0.4027,0.0544,0.0782,0.002,0.0044,0.4456,-0.3966,-10.8532,39.8816,58.8247,-229.1747,-0.1961,0.2367,23.24,58.0
2022-03-17,754.625,0.3892,0.0299,0.0723,0.0044,0.0054,0.1564,0.8219,17.7846,-95.8042,-556.4358,2797.9549,-0.1648,0.225,22.24,57.0
2022-03-18,741.875,0.4007,0.0374,0.0626,0.0012,0.0055,0.3039,-0.3545,-4.1539,39.9176,-41.6557,-96.7514,-0.1876,0.226,21.24,56.0
2022-03-21,756.25,0.4213,0.0445,0.071,0.0036,0.0054,0.361,0.3842,-4.1878,-25.3991,-62.8963,1037.1299,-0.1669,0.2228,18.24,53.0
2022-03-22,753.0,0.4286,0.0245,0.0567,0.0041,0.007,0.1418,0.8932,13.0715,-80.9108,-481.4728,2711.2333,-0.1704,0.2165,17.24,52.0
2022-03-23,757.875,0.4755,0.0508,0.093,0.0201,0.0826,0.34,3.5298,-9.4448,4.5048,89.4937,-103.3067,-0.2761,0.2259,16.24,51.0
2022-03-24,748.25,0.4468,0.0353,0.0555,0.0177,0.0715,0.2611,4.0081,-10.2867,-4.39,111.6518,-79.6258,-0.2547,0.2387,15.24,50.0
2022-03-25,754.0,0.4243,0.0397,0.074,0.0132,0.0537,0.3274,3.271,-9.5625,27.6678,75.5111,-448.0916,-0.2202,0.2311,14.24,49.0


# Fetch High-Frequency Trading Data

In [4]:
!ls data/Crypto/2021/For_Homework/

book_narrow_BTC-USD_2021.delim	trades_narrow_BTC-USD_2021.delim
book_narrow_ETH-BTC_2021.delim	trades_narrow_ETH-BTC_2021.delim
book_narrow_ETH-USD_2021.delim	trades_narrow_ETH-USD_2021.delim


In [6]:
DATA_DIR = 'data/Crypto'

def read_delim_gz(fp: str) -> pd.DataFrame:
    """
    Read a whitespace-delimited file, optionally gzip-compressed, into a
    DataFrame.

    A path ending in `.gz` is decompressed on the fly by pandas. The file on
    disk is left untouched, so the same path can be read repeatedly and the
    function no longer depends on IPython shell magics or an external `gzip`
    binary.

    Parameters
    ----------
    fp : str
        Path to the `.delim` or `.delim.gz` file.

    Returns
    -------
    pd.DataFrame
        The parsed table, with the `timestamp_utc_nanoseconds` column (if
        present) renamed to `dt`.

    Raises
    ------
    FileNotFoundError
        If `fp` does not exist (raised by `pd.read_csv`).
    """
    # `compression='infer'` (the pandas default, spelled out here) selects
    # gzip from the `.gz` suffix. `sep=r'\s+'` is the documented equivalent
    # of the deprecated `delim_whitespace=True`.
    df = pd.read_csv(fp, sep=r'\s+', compression='infer')
    return df.rename(columns={'timestamp_utc_nanoseconds': 'dt'})

# Build both paths from DATA_DIR so the data location is configured in one place.
HOMEWORK_DIR = os.path.join(DATA_DIR, '2021', 'For_Homework')
book = read_delim_gz(os.path.join(HOMEWORK_DIR, 'book_narrow_BTC-USD_2021.delim'))
trades = read_delim_gz(os.path.join(HOMEWORK_DIR, 'trades_narrow_BTC-USD_2021.delim'))
# Show rows 995-999 of the book rather than the very first rows.
book.head(1000).tail()

Unnamed: 0,Ask1PriceMillionths,Bid1PriceMillionths,Ask1SizeBillionths,Bid1SizeBillionths,Ask2PriceMillionths,Bid2PriceMillionths,Ask2SizeBillionths,Bid2SizeBillionths,received_utc_nanoseconds,dt,Mid
995,61607370000,61607360000,180694490,45000000,61608500000,61607350000,280000000,46621560,1618561363576815100,1618561363622659000,61607365000.0
996,61607370000,61607360000,152106580,45000000,61608500000,61607350000,280000000,46621560,1618561363581388000,1618561363629171000,61607365000.0
997,61607370000,61607360000,219190760,45000000,61608500000,61607350000,280000000,46621560,1618561363585827000,1618561363633613000,61607365000.0
998,61607370000,61607360000,152106580,45000000,61608500000,61607350000,280000000,46621560,1618561363593017000,1618561363641559000,61607365000.0
999,61607370000,61607360000,180694490,45000000,61608500000,61607350000,280000000,46621560,1618561363603331000,1618561363649797000,61607365000.0


In [7]:
trades.head(1000).tail()

Unnamed: 0,received_utc_nanoseconds,dt,PriceMillionths,SizeBillionths,Side
995,1618090477003403000,1618090477019409000,59182390000,16837980,1.0
996,1618090477101050000,1618090477110242000,59182390000,4152600,1.0
997,1618090477418420000,1618090477437993000,59183600000,151460,1.0
998,1618090478217125000,1618090478217639000,59183530000,1008600,1.0
999,1618090479200375000,1618090479216013000,59181260000,151460,1.0


Check that `Side` is integral for all values.

In [None]:
# Compare each Side value with its integer cast; True only if every value is whole.
# Note: `astype(int)` raises if the column contains NaN or infinite values.
(trades['Side'] == trades['Side'].astype(int)).all()

In [None]:
# Ratio of 2**63 to 9e18 (slightly above 1).
# NOTE(review): presumably a sanity check that values up to ~9e18, such as the
# nanosecond timestamps above, stay within signed 64-bit range — confirm.
2**63 / 9e18

In [27]:
@dataclass
class AccumulationStratBase(StrategyBase):
    """Placeholder strategy built on `StrategyBase` whose `step` does nothing."""
    # Plain class attribute (no annotation, so it is not a dataclass field).
    # NOTE(review): presumably tells the backtester that timestamps are
    # nanosecond integers — confirm against src.ubacktester.
    USE_NS_DT = True
    
    def step(self):
        """No-op: this strategy takes no action when stepped."""
        pass


@dataclass
class Trades(FeedBase):
    """
    Feed record whose fields mirror the columns of the `trades` DataFrame
    (`PriceMillionths`, `SizeBillionths`, `Side`).
    """
    # Plain class attribute (no annotation, so it is not a dataclass field).
    # NOTE(review): presumably marks timestamps as nanosecond integers — confirm.
    USE_NS_DT = True
    
    # Disabled field; the loader renames this column to `dt` when reading the file.
    # timestamp_utc_nanoseconds: int
    name: str
    # The three data fields default to 0 when not supplied.
    PriceMillionths: int = 0
    SizeBillionths: int = 0
    # NOTE(review): the displayed trades sample stores Side as floats (1.0)
    # while it is annotated `int` here — confirm whether a cast is needed.
    Side: int = 0
    
@dataclass
class Book(FeedBase):
    """
    Feed record whose fields mirror the price and size columns of the `book`
    DataFrame for the first two levels on each side.
    """
    # Plain class attribute (no annotation, so it is not a dataclass field).
    # NOTE(review): presumably marks timestamps as nanosecond integers — confirm.
    USE_NS_DT = True

    # Disabled field; the loader renames this column to `dt` when reading the file.
    # timestamp_utc_nanoseconds: int
    # Field order defines the positional order of the generated __init__.
    # Prices are listed Ask-then-Bid per level, sizes Bid-then-Ask.
    Ask1PriceMillionths: int
    Bid1PriceMillionths: int
    Ask2PriceMillionths: int
    Bid2PriceMillionths: int
    Bid1SizeBillionths: int
    Ask1SizeBillionths: int
    Bid2SizeBillionths: int
    Ask2SizeBillionths: int

In [18]:
# Merge book and trade timestamps into one sorted array of unique `dt` values.
dti = pd.concat([book['dt'], trades['dt']]).sort_values().unique()

In [22]:
# Build the clock from the merged book/trade timestamps.
clock = ClockBase(dti)

In [28]:
# Wrap the trades DataFrame in a feed object.
# NOTE(review): `from_df` is not defined on Trades in this notebook; presumably inherited from FeedBase.
trades_feed = Trades.from_df(trades)

In [None]:
def test_accumulate():
    """
    Smoke-run a backtest: register the trades feed and one
    `AccumulationStratBase` (cash_equity=1e4) on a fresh engine, then run it.

    Uses the notebook-level `clock` and `trades_feed` objects.
    """
    engine = BacktestEngine(clock=clock)
    engine.add_feed(trades_feed, name='trades')

    strategy = AccumulationStratBase(cash_equity=1e4)
    engine.add_strategy(strategy)

    engine.run()

    # Plotting is currently disabled:
    # strategy.plot(
    #     show=False,
    #     include_cols=['returns', 'nshort', 'nlong', ],
    #     scale_cols={'nshort': 40, 'nlong': 40, }
    # )

test_accumulate()