<a href="https://colab.research.google.com/github/gsanc018/fml/blob/master/FML_Ch2Ex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 2: Financial Data Structures
This notebook works through the Chapter 2 exercises of *Advances in Financial Machine Learning* by Marcos Lopez de Prado. We use Bitcoin tick data for the study because it is easy to get hold of.

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime

def compute_vwap(df):
    """Attach the volume-weighted average price (VWAP) of a chunk of ticks.

    The VWAP is ``sum(price * volume) / sum(volume)`` taken over all rows of
    ``df``; that single value is repeated on every row in a ``vwap`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Ticks with at least ``price`` and ``volume`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``df`` plus ``vwap`` (an existing
        ``vwap`` column is overwritten in the copy). The input frame is left
        untouched: this function is used inside ``groupby(...).apply``, and
        pandas does not support functions that mutate the group they receive.
    """
    price = df['price']
    volume = df['volume']
    vwap = np.sum(price * volume) / np.sum(volume)
    # assign() returns a copy instead of writing into the caller's frame.
    return df.assign(vwap=vwap)
  
def ohlc(df):
    """Collapse a chunk of ticks into a single open/high/low/close bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Ticks in chronological order with at least a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        A one-row frame: the last row of ``df`` (keeping its index label and
        all of its columns) extended with ``open`` (first price), ``high``
        (maximum price), ``low`` (minimum price) and ``close`` (last price).

    Raises
    ------
    IndexError
        If ``df`` has no rows, since there is no first/last price to read.

    Notes
    -----
    The input frame is not modified. Only the last row is copied, rather than
    broadcasting the four values over every row and then discarding all but
    one of them.
    """
    prices = df['price']
    return df.iloc[-1:].assign(
        open=prices.iloc[0],
        high=prices.max(),
        low=prices.min(),
        close=prices.iloc[-1],
    )
  
# Raw Tick Data
# Download the headerless krakenUSD trade history and label its three columns
# while parsing; the timestamp column is then interpreted as seconds since
# the Unix epoch.
data = pd.read_csv(
    "https://api.bitcoincharts.com/v1/csv/krakenUSD.csv.gz",
    header=None,
    names=['timestamp', 'price', 'volume'],
)
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data.tail()

Unnamed: 0,timestamp,price,volume
14756420,2019-06-01 10:20:39,8589.7,0.018661
14756421,2019-06-01 10:20:40,8589.7,0.018661
14756422,2019-06-01 10:20:42,8589.7,0.018661
14756423,2019-06-01 10:20:49,8590.0,0.016518
14756424,2019-06-01 10:22:21,8592.3,0.005818


# Time Bars

In [0]:
# Time bars: bucket the ticks into fixed one-day bins of their timestamp.
freq = '1D'
data_timeidx = data.set_index('timestamp')
data_time_grp = data_timeidx.groupby(pd.Grouper(freq=freq))

# Per-bin traded volume, rounded to a whole number.
vol = data_time_grp['volume'].sum().round()

# Per-bin open/high/low/close of the price, with the volume attached and the
# bin timestamp moved back from the index into a regular column.
data_time_ohlc = (
    data_time_grp['price']
    .ohlc()
    .assign(volume=vol)
    .reset_index()
)

In [0]:
# Show the five most recent daily bars (tail() defaults to five rows).
data_time_ohlc.tail()

In [0]:
import altair as alt

# Candle colour: "#06982d" when the bar closes above its open, "#ae1325" otherwise.
open_close_color = alt.condition(
    "datum.open < datum.close",
    alt.value("#06982d"),
    alt.value("#ae1325"),
)

# Thin vertical rule spanning low..high of each of the last 5000 bars.
rule = (
    alt.Chart(data_time_ohlc.tail(5000))
    .mark_rule()
    .encode(
        x=alt.X('timestamp:T'),
        y=alt.Y('low', title='Price', scale=alt.Scale(zero=False)),
        y2=alt.Y2('high'),
        color=open_close_color,
    )
)

# Candle body spanning open..close; zoom/pan is bound to the x axis only.
bar = (
    alt.Chart(data_time_ohlc.tail(5000))
    .mark_bar()
    .encode(
        x=alt.X('timestamp:T'),
        y=alt.Y('open'),
        y2=alt.Y2('close'),
        color=open_close_color,
    )
    .interactive(bind_y=False)
)

# Layer the wicks and the bodies into one candlestick chart.
rule + bar

In [0]:
# Count the raw ticks falling into each weekly bin (bins labelled by their
# right edge); the length of the result is the number of weekly bins.
time_count = data_timeidx.resample('W', label='right')['price'].count()
time_count.shape[0]

Exercise 2.1(a): we form tick, volume and dollar bars from the raw Bitcoin tick data.

# Tick Bars


In [0]:
# Tick bars: every bar holds the same number of consecutive ticks.
total_ticks = len(data)  # number of ticks in the whole dataset
# Fixed bar size, rounded to the nearest thousand. (The original author's note
# suggests it could instead be derived as total_ticks / num_time_bars.)
num_ticks_per_bar = round(100000, -3)
print('Total ticks:', total_ticks)
print('Ticks per bar:', num_ticks_per_bar)

# reset_index() keeps the previous index as a column and gives the frame a
# fresh positional index; integer-dividing that position by the bar size
# yields the id of the bar each tick belongs to.
data_tick_grp = data.reset_index()
data_tick_grp['grpId'] = data_tick_grp.index // num_ticks_per_bar

tick_groups = data_tick_grp.groupby('grpId')
print('Number of ticks bars:', len(tick_groups))

# One summary row per bar: VWAP plus open/high/low/close.
data_tick_ohlc = tick_groups.apply(lambda chunk: ohlc(compute_vwap(chunk)))
# Drop the grpId level that apply() prepends to the index, then keep only the
# first row for any repeated index label.
data_tick_ohlc = data_tick_ohlc.droplevel(0)
data_tick_ohlc = data_tick_ohlc[~data_tick_ohlc.index.duplicated(keep='first')]


Total ticks: 14756425
Ticks per bar: 100000
Number of ticks bars: 148


In [0]:
# Index the tick bars by the original row number of their closing tick, which
# reset_index() stored in the 'index' column when the bars were built.
# The guard makes the cell safe to re-run: once 'index' has been moved into
# the index, a second unconditional set_index('index') would raise KeyError.
if 'index' in data_tick_ohlc.columns:
    data_tick_ohlc = data_tick_ohlc.set_index('index')
data_tick_ohlc.head()

Unnamed: 0_level_0,timestamp,price,volume,dollar,grpId,vwap,open,high,low,close
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
99999,2016-03-13 03:50:50,406.66,0.0395,16.06307,0,364.147117,874.6704,918.05471,175.0,406.66
199999,2016-06-17 11:24:02,729.384,0.1861,135.738362,1,501.660994,408.717,776.709,405.0,729.384
299999,2016-08-02 18:52:42,578.89,1.046,605.51894,2,668.698676,729.384,781.148,551.494,578.89
399999,2016-10-30 22:09:46,703.987,1.1878,836.195759,3,604.5304,578.89,722.01,479.0,703.987
499999,2016-12-25 03:02:11,877.32,0.442136,387.894615,4,759.510953,703.987,923.869,673.407,877.32


# Volume Bars

In [0]:
# Volume bars: a new bar starts each time another `vol_per_bar` units of
# cumulative traded volume have been reached.
data_cm_vol = data.assign(cmVol=data['volume'].cumsum())
total_vol = data_cm_vol['cmVol'].values[-1]  # cumulative volume after the last tick
vol_per_bar = 2e3
vol_per_bar = round(vol_per_bar, -2)  # round to the nearest hundred

# These quantities are traded volume, not dollar amounts, so label them as
# such (the previous labels were copied from the dollar-bar cell).
print('Total volume:', total_vol)
print('Volume per bar:', vol_per_bar)

# Integer-dividing the running volume by the bar size gives each tick's bar id.
data_vol_grp = data_cm_vol.assign(grpId=lambda row: row.cmVol // vol_per_bar)
vol_groups = data_vol_grp.groupby('grpId')
print('Number of volume bars:', len(vol_groups))

# One summary row per bar: VWAP plus open/high/low/close.
data_vol_ohlc = vol_groups.apply(lambda chunk: ohlc(compute_vwap(chunk)))
# Drop the grpId level that apply() prepends to the index, then keep only the
# first row for any repeated index label.
data_vol_ohlc.index = data_vol_ohlc.index.droplevel()
data_vol_ohlc = data_vol_ohlc[~data_vol_ohlc.index.duplicated(keep='first')]

Total dollars: 5526756.603922224
Dollars per bar: 2000.0
Number of volume bars: 2764


In [0]:
# Show the five most recent volume bars.
data_vol_ohlc.tail(5)

Unnamed: 0,timestamp,price,volume,dollar,cmVol,grpId,vwap,open,high,low,close
14736999,2019-05-31 11:06:41,8232.3,0.011887,97.858832,5520000.0,2759.0,8273.43646,8110.1,8344.9,8110.1,8232.3
14740510,2019-05-31 13:14:19,8394.0,0.027231,228.578273,5522000.0,2760.0,8257.628394,8232.3,8442.1,8165.0,8394.0
14746493,2019-05-31 18:45:25,8395.6,0.047,394.5932,5524000.0,2761.0,8405.045876,8394.0,8474.0,8295.0,8395.6
14754050,2019-06-01 05:01:34,8545.0,0.009281,79.30324,5525999.0,2762.0,8511.345068,8395.0,8591.1,8387.9,8545.0
14756424,2019-06-01 10:22:21,8592.3,0.005818,49.98665,5526757.0,2763.0,8535.371288,8545.0,8622.5,8462.0,8592.3


# Dollar Bars

In [0]:
# Dollar bars: a new bar starts each time another `dollars_per_bar` of
# cumulative traded value (price * volume) has been reached.
# NOTE(review): this adds a 'dollar' column to the shared `data` frame, so
# cells that run afterwards (including re-runs of earlier ones) see it too.
data['dollar'] = data['price'] * data['volume']
data_cm_dollar = data.assign(cmDollar=data['dollar'].cumsum())
total_dollars = data_cm_dollar['cmDollar'].values[-1]  # running total after the last tick
dollars_per_bar = 3e7

print('Total dollars:', total_dollars)
print('Dollars per bar:', dollars_per_bar)

# Integer-dividing the running dollar value by the bar size gives each tick's bar id.
data_dollar_grp = data_cm_dollar.assign(
    grpId=lambda ticks: ticks['cmDollar'] // dollars_per_bar
)
dollar_groups = data_dollar_grp.groupby('grpId')
print('Number of dollar bars:', len(dollar_groups))

# One summary row per bar: VWAP plus open/high/low/close.
data_dollar_ohlc = dollar_groups.apply(lambda chunk: ohlc(compute_vwap(chunk)))
# Drop the grpId level that apply() prepends to the index, then keep only the
# first row for any repeated index label.
data_dollar_ohlc = data_dollar_ohlc.droplevel(0)
data_dollar_ohlc = data_dollar_ohlc[~data_dollar_ohlc.index.duplicated(keep='first')]

Total dollars: 29045334203.553448
Dollars per bar: 30000000.0
Number of dollar bars: 969


In [0]:
# Show the first five dollar bars.
data_dollar_ohlc.head(5)

Unnamed: 0,timestamp,price,volume,dollar,cmDollar,grpId,vwap,open,high,low,close
118743,2016-03-25 08:50:20,415.46,8.438,3505.65148,29994400.0,0.0,377.707229,874.6704,918.05471,175.0,415.46
160998,2016-05-27 01:21:21,457.99,10.192,4667.83408,59998930.0,1.0,439.290361,414.738,470.0,405.0,457.99
194388,2016-06-16 11:26:50,746.71,0.01,7.4671,89999950.0,2.0,590.94605,458.0,747.46,450.55,746.71
230280,2016-06-24 02:49:43,654.654,0.15,98.1981,119998000.0,3.0,699.642116,747.0,781.148,551.494,654.654
279409,2016-07-24 15:59:41,661.18,0.104966,69.401169,149999600.0,4.0,661.482101,654.99,762.24,610.699,661.18


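Exercise 2.1(b): count the number of bars produced per week by each bar type. For comparison, time bars produce a constant number of bars per week, i.e.

- 7 daily bars per week, or
- 168 hourly bars per week, or
- 672 15-minute bars per week, or
- 2,016 5-minute bars per week, or
- 10,080 1-minute bars per week.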

# Statistical Properties of Different Bar Types

In [0]:
# Weekly number of dollar bars: index the bars by their closing timestamp and
# count how many fall into each weekly bin (bins labelled by their right edge).
dollar_ohlc_idx = data_dollar_ohlc.set_index('timestamp')
dollar_count = dollar_ohlc_idx.resample('W', label='right')['close'].count()

In [0]:
# Weekly bar counts for each bar type: index the bars by their closing
# timestamp and count how many fall into each weekly bin (right-labelled).
#
# set_index(..., inplace=True) returns None, which left tick_ohlc_idx and
# vol_ohlc_idx bound to None and also made this cell fail on a second run
# (the 'timestamp' column was already gone). Use the returned, re-indexed
# copy instead and leave the bar frames themselves untouched.
tick_ohlc_idx = data_tick_ohlc.set_index('timestamp')
tick_count = tick_ohlc_idx['close'].resample('W', label='right').count()

vol_ohlc_idx = data_vol_ohlc.set_index('timestamp')
vol_count = vol_ohlc_idx['close'].resample('W', label='right').count()

dollar_ohlc_idx = data_dollar_ohlc.set_index('timestamp')
dollar_count = dollar_ohlc_idx['close'].resample('W', label='right').count()
dollar_count.tail(5)

In [0]:
# Weekly number of volume bars over time.
# Altair only reads DataFrame columns, not the index, so move the weekly
# timestamp out of the index with reset_index() before charting; 'close'
# holds the per-week bar count.
alt.Chart(vol_count.reset_index()).mark_line().encode(
    x='timestamp:T',
    y=alt.Y('close', title='Bars per week')
).properties(title='Volume bars per week')

In [0]:
# Weekly number of dollar bars over time.
# Altair only reads DataFrame columns, not the index, so move the weekly
# timestamp out of the index with reset_index() before charting; 'close'
# holds the per-week bar count.
alt.Chart(dollar_count.reset_index()).mark_line().encode(
    x='timestamp:T',
    y=alt.Y('close', title='Bars per week')
).properties(title='Dollar bars per week')

Here we plot the weekly bar count of each of the bar types.

In [0]:
# Normalise the three weekly counts to DataFrames (a no-op if they already are).
tick_count = pd.DataFrame(tick_count)
vol_count = pd.DataFrame(vol_count)
dollar_count = pd.DataFrame(dollar_count)

# Combine them side by side into one frame with one column per bar type.
# pd.DataFrame(a, b, c) would treat b and c as the `index` and `columns`
# arguments rather than as more data, so concatenate along the columns
# instead. Weeks missing from one of the inputs show up as NaN.
source = pd.concat(
    [tick_count.iloc[:, 0], vol_count.iloc[:, 0], dollar_count.iloc[:, 0]],
    axis=1,
    keys=['tick', 'volume', 'dollar'],
)

In [0]:
# Raw ticks per weekly bin (right-labelled); the length is the number of
# weekly bins. The result is bound to `time_count`, the name read on the next
# line; binding it to `tick_count` would overwrite the weekly tick-bar counts
# computed earlier.
time_count = data_timeidx['price'].resample('W', label='right').count()
len(time_count)

Next we compute the serial correlation of returns for the three bar types. Which bar type has the **lowest** serial correlation?