In [34]:
def add_cme_expiry(df, resolution="hour"):
    """Flag rows that fall on a CME futures expiry in ``df['cme_expiry']``.

    Expiry is modelled as the last Friday of the month. With
    ``resolution="hour"`` only the 15:00 bar of that Friday is flagged; with
    ``resolution="day"`` every row on that Friday is. Any other resolution
    leaves ``df`` untouched. The column is added in place and ``df`` is
    returned.
    """
    if resolution not in ("hour", "day"):
        return df

    stamps = df.datestamp.dt
    # A Friday (weekday 4) inside the final seven days of the month is the last Friday.
    in_final_week = stamps.day > stamps.days_in_month - 7
    expiry = in_final_week & (stamps.weekday == 4)

    if resolution == "hour":
        # The final settlement value is based on the CME CF Bitcoin Reference Rate (BRR)
        # at 4:00 p.m. London time on the expiration day of the futures contract.
        # 17:00 SAST, 15:00 UTC
        expiry = expiry & (stamps.hour == 15)

    df['cme_expiry'] = expiry
    return df

def add_quarterly_expiry(df):
    """Flag quarterly expiry days in ``df['quarterly_expiry']``.

    A row is flagged when its ``datestamp`` is a Friday within the last seven
    days of March, June, September or December, i.e. the last Friday of a
    quarter-end month. The column is added in place and ``df`` is returned.
    """
    stamps = df.datestamp.dt
    in_final_week = stamps.day > stamps.days_in_month - 7
    is_friday = stamps.weekday == 4
    is_quarter_end_month = stamps.month.isin([3, 6, 9, 12])
    df['quarterly_expiry'] = in_final_week & is_friday & is_quarter_end_month
    return df

def add_funding(df):
    """Mark funding-payout bars and attach funding rates from ``data/bitmex_funding.csv``.

    Adds ``is_funding_payout`` (hour in 6/14/22 and minute 0) and
    ``local_timestamp`` (``datestamp`` localized to Africa/Johannesburg) to
    ``df`` in place, then left-joins the funding file on that timestamp.

    Returns the merged frame, which additionally carries the funding file's
    columns plus ``next_funding`` (rate back-filled from the next payout) and
    ``last_funding`` (rate forward-filled from the previous payout).

    NOTE(review): ``datestamp`` is presumably naive local (SAST) time and the
    CSV timestamps tz-aware; ``tz_localize`` / ``tz_convert`` fail otherwise.
    """
    stamps = df.datestamp
    payout_hour = stamps.dt.hour.isin([6, 14, 22])
    df["is_funding_payout"] = (payout_hour & (stamps.dt.minute == 0))

    funding = pd.read_csv("data/bitmex_funding.csv", parse_dates=["timestamp"])
    df["local_timestamp"] = df.datestamp.dt.tz_localize('Africa/Johannesburg')
    funding["timestamp"] = funding.timestamp.dt.tz_convert('Africa/Johannesburg')

    merged = df.merge(funding, left_on="local_timestamp", right_on="timestamp", how="left")
    rates = merged["fundingRate"]
    merged["next_funding"] = rates.bfill()
    merged["last_funding"] = rates.ffill()
    return merged

def marker_elapsed(series, cap=None):
    """Count the rows elapsed since the most recent truthy marker.

    The counter starts at zero before the first row, is reset to zero on every
    truthy entry of ``series`` and otherwise increases by one per row. When a
    truthy ``cap`` is given the counter also wraps back to zero as soon as it
    would exceed ``cap``.

    Returns a numpy array with one count per row of ``series``.
    """
    elapsed = []
    counter = 0
    for is_marked in series.tolist():
        if is_marked:
            counter = 0
        else:
            counter += 1
        # Wrap-around: applied after the marker reset, exactly one row past the cap.
        if cap and counter > cap:
            counter = 0
        elapsed.append(counter)
    return pd.Series(elapsed).values

# Number of blocks between two difficulty adjustments.
DIF_ADJ_BLOCKS = 2016
def add_dif_adj_h(df):
    """Flag the rows on which a difficulty adjustment happened (hourly frames).

    If ``df`` has a ``dif`` column, two boolean columns are added in place:

    * ``DiffChanged`` - ``dif`` differs from the previous row.
    * ``blockadjustment`` - the height in ``hgt`` sits exactly on a multiple of
      ``DIF_ADJ_BLOCKS`` *or* has crossed into a new ``DIF_ADJ_BLOCKS``-sized
      period since the previous row. The crossing test matters when rows are
      sampled less often than once per block: the sampled height then usually
      jumps over the exact multiple and the exact test alone misses it.

    Otherwise the adjustment dates are read from ``./data/btc.com_diff.csv``,
    truncated to the hour, left-joined on ``datestamp`` and
    ``blockadjustment`` is set where the join found a row.

    Returns the (possibly new, merged) frame.
    """
    if 'dif' in df.columns:
        df['DiffChanged'] = abs(df['dif'] - df['dif'].shift(1)) > 0.0

        period = df['hgt'] // DIF_ADJ_BLOCKS
        previous_period = period.shift(1)
        # Only compare rows that both have a height; NaN != NaN would flag spuriously.
        crossed = period.notna() & previous_period.notna() & (period != previous_period)
        on_boundary = df['hgt'] % DIF_ADJ_BLOCKS == 0
        df['blockadjustment'] = crossed | on_boundary

        # Diagnostic: compare observed difficulty changes with the height-based flag.
        print (df.loc[df.DiffChanged, ['DiffChanged', 'blockadjustment', 'time']].tail(500))
    else:
        df_dif = pd.read_csv("./data/btc.com_diff.csv") # , parse_dates=["timestamp"]
        # is currently to the minute, CDD hourly is AM/PM format
        # df_dif["adj_date"] = df_dif.adj_date.dt.strftime("%Y-%m-%d %I:00 %s")
        df_dif["adj_date"] = pd.to_datetime(df_dif.timestamp, unit="s")
        # Truncate to the start of the hour so it can match hourly datestamps.
        df_dif["adj_date"] = pd.to_datetime(df_dif.adj_date.dt.strftime("%Y-%m-%d %H:00:00"))
        df = df.merge(df_dif, how="left", left_on="datestamp", right_on="adj_date")
        # `change` comes from the CSV; it is only non-null on rows that matched an adjustment.
        df["blockadjustment"] = pd.notnull(df.change)
        # df["adj_period"] = df["blockadjustment"].cumsum()
    return df

# prep daily bytetree
def add_dif_adj(df):
    """Flag the rows on which a difficulty adjustment happened (daily frames).

    If ``df`` has a ``dif`` column, two boolean columns are added in place:

    * ``DiffChanged`` - ``dif`` differs from the previous row.
    * ``blockadjustment`` - the height in ``hgt`` has crossed into a new
      ``DIF_ADJ_BLOCKS``-sized period since the previous row. This flags each
      adjustment exactly once, however many blocks a day contains. Rows with
      no usable predecessor (e.g. the first row) fall back to the old
      heuristic "fewer than 144 blocks past the boundary".

    Otherwise the adjustment dates are read from ``./data/btc.com_diff.csv``,
    left-joined on ``datestamp`` and ``blockadjustment`` is set where the join
    found a row.

    Returns the (possibly new, merged) frame.
    """
    if 'dif' in df.columns:
        df['DiffChanged'] = abs(df['dif'] - df['dif'].shift(1)) > 0.0

        period = df['hgt'] // DIF_ADJ_BLOCKS
        previous_period = period.shift(1)
        crossed = period.notna() & previous_period.notna() & (period != previous_period)
        # 144 should be one days worth of blocks
        no_history = period.notna() & previous_period.isna()
        guessed = no_history & (df['hgt'] % DIF_ADJ_BLOCKS < 144)
        df['blockadjustment'] = crossed | guessed
    else:
        df_dif = pd.read_csv("./data/btc.com_diff.csv") # , parse_dates=["timestamp"]
        # is currently to the minute, CDD hourly is AM/PM format
        # df_dif["adj_date"] = df_dif.adj_date.dt.strftime("%Y-%m-%d %I:00 %s")
        df_dif["adj_date"] = pd.to_datetime(df_dif.timestamp, unit="s")
        # NOTE(review): truncates to the hour, not the day; this only matches daily
        # datestamps that carry the same hour - confirm against the daily frames.
        df_dif["adj_date"] = pd.to_datetime(df_dif.adj_date.dt.strftime("%Y-%m-%d %H:00:00"))
        df = df.merge(df_dif, how="left", left_on="datestamp", right_on="adj_date")
        # `change` comes from the CSV; it is only non-null on rows that matched an adjustment.
        df["blockadjustment"] = pd.notnull(df.change)
        # df["adj_period"] = df["blockadjustment"].cumsum()
    return df

# df_m = add_funding(df)
# df_m[df_m.is_funding_payout]
# df_m[["is_funding_payout", "next_funding", "fundingRate"]].head(200)


In [26]:
# trying to consolidate colab FFT_Norm notebook locally
# !pip install scaleogram
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
from preprocess import normalise_data, detrend, preprocess_frame, load_frame
## from preprocess import add_cme_expiry

# Load the working frame and eyeball its most recent rows.
# NOTE(review): load_frame is imported from the project-local `preprocess` module;
# presumably it reads whatever compile_dataset last wrote - confirm there.
# df = pd.read_csv("DATASET.CSV")
df = load_frame()
# df = preprocess_frame(df)
print (df.tail(5))
# print (df.columns)

# Log-scale line chart of the `value` column against the frame's index; the bare
# `fig` expression on the last line is what makes the notebook render it.
fig = px.line(df, x=df.index, y="value", log_y=True)
fig


close            txv           dtv       mtv  \
datestampindex                                                            
2020-06-03 02:00:00  9579.919229  299873.619993  2.868318e+09  0.054278   
2020-06-04 02:00:00  9724.521692  290777.898263  2.807577e+09  0.007634   
2020-06-05 02:00:00  9664.842772  325165.868376  3.161815e+09  0.007395   
2020-06-06 02:00:00  9624.578125  112470.553108  1.083564e+09  0.005221   
2020-06-07 02:00:00  9652.640582  104604.067195  1.002824e+09  0.005720   

                            dmv           xtv           dxt          t1v  \
datestampindex                                                             
2020-06-03 02:00:00  521.119209  17564.070619  1.672899e+08   659.765731   
2020-06-04 02:00:00   73.585497  44435.331572  4.321438e+08   930.908139   
2020-06-05 02:00:00   71.886313  44235.331360  4.317897e+08  1009.564603   
2020-06-06 02:00:00   50.310561   2017.302354  1.941867e+07   728.465052   
2020-06-07 02:00:00   54.845094   2677.373004

In [None]:
from make_dataset import compile_dataset
# Rebuild the working dataset from one raw source; the commented lines are the
# alternative sources tried so far, only one call is meant to be active.
# NOTE(review): compile_dataset lives in the project-local `make_dataset` module;
# its output location is not visible here.
# compile_dataset("ccxt_binance_1m")
# compile_dataset("Bitstamp_BTCUSD_d.csv")
# compile_dataset("Bitstamp_BTCUSD_1h.csv")
compile_dataset("bytetree_1d_bitcoin.csv")

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots

def seasonal_violin(df, x_col, y_col):
    """Return a plotly figure with the distribution of ``y_col`` per ``x_col`` value.

    Despite the name this currently draws a box plot; the violin variant is
    kept below as a comment.
    """
    # violin alternative: px.violin(df, y=y_col, x=x_col, box=True, points='all')
    return px.box(df, x=x_col, y=y_col)  # , points='all'

def p25(g):
    """Lower quartile (25th percentile) of ``g``; used as a pivot-table aggregate."""
    return np.quantile(g, 0.25)

def p75(g):
    """Upper quartile (75th percentile) of ``g``; used as a pivot-table aggregate."""
    return np.quantile(g, 0.75)

def index_plot(df, x_col, y_col=None, lines=None):
    """Plot per-``x_col`` aggregates of one or more columns on stacked subplots.

    Parameters
    ----------
    df : DataFrame containing ``x_col`` and every value column that is plotted.
    x_col : column to group by; its distinct values form the shared x axis.
    y_col : value column used to build the default ``lines`` when ``lines``
        is not given.
    lines : optional list of ``(row, stat, column)`` entries. ``row`` is the
        1-based subplot row, ``stat`` one of ``'mean'``, ``'median'``,
        ``'std'``, ``'p25'``, ``'p75'`` and ``column`` the value column.
        ``'std'`` draws two traces at mean +/- half a standard deviation.
        The default is the mean on row 1 and p25 / median / p75 on row 2.

    Returns
    -------
    The plotly figure (not shown).

    Raises
    ------
    ValueError
        If neither ``y_col`` nor ``lines`` is supplied.
    """
    if not lines:
        if y_col is None:
            raise ValueError("index_plot needs either y_col or lines")
        lines = [
            (1, 'mean', y_col),
            # (2, 'mean', y_col),
            # (2, 'std', y_col),
            (2, 'p25', y_col),
            (2, 'median', y_col),
            (2, 'p75', y_col),
            # (3, 'mean', 'volume'),
            # (3, 'mean', 'volume usd'),
        ]

    total_rows = max(line[0] for line in lines)
    fig = make_subplots(rows=total_rows, cols=1, shared_xaxes=True)
    # De-duplicate while keeping first-seen order so the pivot is deterministic.
    value_columns = list(dict.fromkeys(line[2] for line in lines))

    # String aliases select the same pandas reductions the numpy callables were
    # mapped to, without the deprecation warning newer pandas emits for them.
    df_pv = pd.pivot_table(df, index=[x_col], values=value_columns,
                           aggfunc=['mean', 'median', 'std', p25, p75])
    for line in lines:
        row, avg_type, cur_y_col = line[0], line[1], line[2]
        if avg_type == 'std':
            mean_line = df_pv[('mean', cur_y_col)]
            half_std = df_pv[('std', cur_y_col)] / 2
            fig.add_scatter(x=df_pv.index, y=(mean_line + half_std).values, name="upper_std", row=row, col=1)
            fig.add_scatter(x=df_pv.index, y=(mean_line - half_std).values, name="lower_std", row=row, col=1)
        else:
            avg_line = df_pv[(avg_type, cur_y_col)].values
            fig.add_scatter(x=df_pv.index, y=avg_line, name=avg_type + "_" + cur_y_col, row=row, col=1)
    return fig

'''
def seasonal_index_plot(df, x_col, y_col, use_violin=False):
    import numpy as np

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
    # fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

    # df_pv = pd.pivot_table(df, index=[x_col], values=[y_col], aggfunc=['mean', 'median', 'std'])
    # https://stackoverflow.com/questions/34260003/pandas-pivot-table-percentile
    def my25(g):
        return np.percentile(g, 25)

    def my75(g):
        return np.percentile(g, 75)

    df_pv = pd.pivot_table(df, index=[x_col], values=[y_col], aggfunc=[np.mean, np.median, np.std, my25, my75])
    # df_pv = pd.pivot_table(df, index=[x_col], values=[y_col, "volume"], aggfunc=[np.mean, np.median, np.std, my25, my75])

    mean_line = df_pv[[('median', y_col)]].values
    upper_band = df_pv[[('my75', y_col)]].values
    lower_band = df_pv[[('my25', y_col)]].values

    # mean_line = df_pv[[('mean', y_col)]].values
    # upper_band = mean_line + (df_pv[[('std', y_col)]].values / 2)
    # lower_band = mean_line - (df_pv[[('std', y_col)]].values / 2)
    
    fig.add_scatter(x=df_pv.index, y=mean_line[:, 0], name="mean", row=2, col=1)
    fig.add_scatter(x=df_pv.index, y=upper_band[:, 0], name="upper", row=2, col=1)
    fig.add_scatter(x=df_pv.index, y=lower_band[:, 0], name="lower", row=2, col=1)
    
    # can hide legend in BB, but nice
    fig.add_scatter(x=df_pv.index, y=mean_line[:, 0], name="mean", row=1, col=1)

    # mean_line = df_pv[[('mean', 'volume')]].values
    # fig.add_scatter(x=df_pv.index, y=mean_line[:, 0], name="volume", row=3, col=1)
    
    # fig.show()
    return fig
'''

def doy_to_mmdd(doy_series):
    """Map day-of-year numbers onto calendar dates of the reference year 2020.

    The numbers are combined with the year into ``YYYYDDD`` integers and parsed
    with the ``%Y%j`` format, so the result can be used as a date axis label.
    """
    # https://stackoverflow.com/questions/34258892/converting-year-and-day-of-year-into-datetime-index-in-pandas
    year_and_doy = 2020 * 1000 + doy_series
    return pd.to_datetime(year_and_doy, format='%Y%j')

def period_plot(period_col, x_col, y_col, label_fn=None, data=None):
    """Overlay one trace per period (row 1) above the cross-period mean (row 2).

    Parameters
    ----------
    period_col : column whose distinct values define the overlaid traces.
    x_col : column used as x position inside each period.
    y_col : column plotted on the y axis.
    label_fn : optional callable turning the aggregated x index into x labels
        for the mean trace (e.g. ``doy_to_mmdd``).
    data : DataFrame to plot. When omitted the notebook-global ``df`` is used,
        which is what this function always did before the parameter existed.

    Returns the plotly figure (not shown).
    """
    frame = df if data is None else data

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
    # Group by the bare column name: a one-element list makes newer pandas
    # yield 1-tuples as group keys, which are not valid trace names.
    for group_name, group_df in frame.groupby(period_col):
        # hovertext=group_df["datestamp"].astype(str) + ' ' + group_df["dow"].astype(str)
        fig.add_scatter(x=group_df[x_col], y=group_df[y_col], name=str(group_name), row=1, col=1)
        # fig.add_scatter(x=group_df.index, y=group_df["close"], name=group_name, row=2, col=1)

    df_agg = frame.groupby(x_col)[[y_col]].mean()
    if label_fn:
        df_agg["xlabel"] = label_fn(df_agg.index)
    else:
        df_agg["xlabel"] = df_agg.index
    fig.add_scatter(x=df_agg["xlabel"], y=df_agg[y_col], name="agg", row=2, col=1)
    # Applies to the first subplot's y axis only.
    fig.update_layout(yaxis_type="log")
    return fig

# period_plot("year", "doy", "close", label_fn=doy_to_mmdd).show()
# period_plot("year", "doy", "percentchange").show()


In [None]:
# https://www.datacamp.com/community/tutorials/pandas-multi-index

def cluster_plot(df, instance_col, series_col, value_col, total_clusters=2, show_index_plot=False):
    """K-means cluster per-instance series and plot every cluster.

    ``df`` is reshaped to one row per ``instance_col`` value and one column per
    ``series_col`` value holding ``value_col`` (missing cells become 0); the
    rows are then clustered with K-means.

    Parameters
    ----------
    df : source frame; gains a ``truncdate`` column (date part of
        ``datestamp``) and, with ``show_index_plot``, a ``cluster`` column.
    instance_col, series_col, value_col : column names as described above.
    total_clusters : number of K-means clusters (default 2).
    show_index_plot : when true, label the original rows with their cluster and
        draw an ``index_plot`` per cluster; otherwise draw the mean profile of
        each cluster as a line chart (default).

    Figures are shown directly; nothing is returned.
    """
    from sklearn.cluster import KMeans

    df["truncdate"] = pd.to_datetime(df["datestamp"]).dt.date

    piv = df.set_index([instance_col, series_col])
    # piv[["cme_expiry"]].to_csv("test.csv")
    piv = piv[[value_col]].unstack()
    piv.columns = piv.columns.droplevel(0)
    piv = piv.fillna(0)

    kmeans = KMeans(n_clusters=total_clusters, random_state=42).fit(piv)
    piv["cluster"] = kmeans.labels_

    if show_index_plot:
        # Label every original row with the cluster of its instance.
        df["cluster"] = df[instance_col].map(piv["cluster"])
        for cluster in range(total_clusters):
            cl_df = df[df.cluster == cluster]
            index_plot(cl_df, series_col, value_col).show()
    else:
        # Mean profile per cluster, reshaped to long form for px.line.
        cl = pd.pivot_table(piv, index="cluster")
        cl = cl.stack().reset_index()
        cl = cl.rename(columns={0: "val"})
        for cluster in range(total_clusters):
            cl_df = cl[cl.cluster == cluster]
            px.line(cl_df, x=series_col, y="val").show()

# Cluster intraday profiles: one instance per calendar date, one point per hour.
# Alternative grouping (one instance per marker period):
# instance_col = "period"
# series_col = "bars_elapsed"
instance_col = "truncdate"
series_col = "hour"
# value_col = "value"
value_col = "percentchange"

# NOTE(review): df_m is only assigned by the event-study cells further down, so
# this cell does not run on a fresh kernel in notebook order.
cluster_plot(df_m, instance_col, series_col, value_col)

In [None]:
def show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP):
    """Run the event study around a boolean marker column and show its figures.

    Parameters
    ----------
    df_m : frame with a ``close`` column and the marker column; it is modified
        in place (``bars_elapsed``, ``period`` and, without a cap,
        ``tminus_bars_elapsed`` are added).
    REC_EVENT_COL : name of the boolean marker column.
    VALUE_COL : column aggregated by the index plots.
    CAP : if truthy, the bars-since-marker counter wraps to zero after ``CAP``
        bars and the wrap rows are marked as events as well; if falsy, the
        counter is unbounded and a backward (bars-until-next-marker, negative)
        counter is computed too.

    Shows the marker locations on the close price, the aggregate of
    ``VALUE_COL`` by bars elapsed (forward and, when available, backward) and
    the per-period overlay. Returns nothing.
    """
    # CAP = None
    if CAP:
        # forward version
        df_m["bars_elapsed"] = marker_elapsed(df_m[REC_EVENT_COL], cap=CAP)
        # backward version
        # df_m["tminus_bars_elapsed"] = -marker_elapsed(df_m[REC_EVENT_COL][::-1], cap=CAP)[::-1]

        # df_m["bars_elapsed"] = time_since_marker(df_m, REC_EVENT_COL, cap=CAP) # , cap=DIF_ADJ_BLOCKS, cap=15, cap=31
        # set the boolean marker where we are capped as well marked
        df_m.loc[df_m.bars_elapsed == 0, REC_EVENT_COL] = True
    else:
        # forward version
        df_m["bars_elapsed"] = marker_elapsed(df_m[REC_EVENT_COL])
        # backward version: run the counter over the reversed series, flip it back and negate
        df_m["tminus_bars_elapsed"] = -marker_elapsed(df_m[REC_EVENT_COL][::-1])[::-1]
        # print (df_m[["bars_elapsed", "tminus_bars_elapsed"]])
    # Running count of markers seen so far = id of the period each row belongs to.
    df_m["period"] = df_m[REC_EVENT_COL].cumsum()

    # show marker locations
    fig = px.line(x=df_m.index, y=df_m.close, log_y=True)
    marks = df_m[df_m[REC_EVENT_COL]==True]
    fig.add_scatter(x=marks.index, y=marks.close, mode="markers", marker=dict(size=10, symbol=4, color="orange"), name="Marker")
    fig.show()

    # df = df[df['bars_elapsed'] < CAP]
    # df_m = df_m[pd.notnull(df_m['bars_elapsed'])]

    # print (df_m.next_funding.describe())
    # df_m = df_m[df_m.next_funding >= 0.0003]
    # Optional +/- window around each marker; disabled while window_size is 0.
    window_size = 0
    if window_size > 0:
        df_m = df_m[((df_m.bars_elapsed < window_size) | (df_m.tminus_bars_elapsed > -window_size))]
        df_m.loc[df_m["bars_elapsed"] >= window_size, ["bars_elapsed"]] = df_m["tminus_bars_elapsed"]

    index_plot(df_m, x_col="bars_elapsed", y_col=VALUE_COL).show()
    # Check the frame being plotted, not the notebook-global `df`.
    if "tminus_bars_elapsed" in df_m.columns:
        index_plot(df_m, x_col="tminus_bars_elapsed", y_col=VALUE_COL).show()

    # NOTE(review): period_plot reads the notebook-global `df`, not df_m; this only
    # shows the columns added above when df_m is that same object.
    period_plot(period_col="period", x_col="bars_elapsed", y_col="close").show()

    # seasonal_index_plot(df_m, x_col="bars_elapsed", y_col=VALUE_COL).show()
    # seasonal_index_plot(df_m, x_col="tminus_bars_elapsed", y_col=VALUE_COL).show()

In [None]:
# compile_dataset("Bitstamp_BTCUSD_d.csv")
# df = load_frame()

# Event study around quarterly expiries (last Friday of Mar/Jun/Sep/Dec, as
# flagged by add_quarterly_expiry).
# NOTE(review): uses whatever `df` an earlier cell left in the kernel - the
# load lines at the top of this cell are commented out.
REC_EVENT_COL = "quarterly_expiry"
# VALUE_COL = "percentchange"
# VALUE_COL = "returns"
VALUE_COL = "close"
# add_quarterly_expiry adds the marker column to `df` in place and returns that same frame.
df_m = add_quarterly_expiry(df)
# CAP = 3 * 30
# No cap: the bars-since-marker counter never wraps and show_marker_plot also
# builds the backward counter.
CAP = None

show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP)

In [None]:
# Rebuild the dataset from the daily Bitstamp file and reload it.
compile_dataset("Bitstamp_BTCUSD_d.csv")
df = load_frame()

# daily cme expiry
REC_EVENT_COL = "cme_expiry"
VALUE_COL = "close"
# VALUE_COL = "percentchange"
# VALUE_COL = "returns"
resolution = "day"
# add_cme_expiry adds the marker column to `df` in place and returns that same frame.
df_m = add_cme_expiry(df, resolution)
# Wrap the bars-since-marker counter after roughly one month of bars.
if resolution == "hour":
    CAP = 30 * 24
elif resolution == "day":
    CAP = 30

show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP)

In [None]:
# Rebuild the dataset from the hourly Bitstamp file and reload it.
compile_dataset("Bitstamp_BTCUSD_1h.csv")
df = load_frame()

# hourly cme expiry
REC_EVENT_COL = "cme_expiry"
VALUE_COL = "close"
# VALUE_COL = "percentchange"
# VALUE_COL = "returns"
resolution = "hour"
# add_cme_expiry adds the marker column to `df` in place and returns that same frame.
df_m = add_cme_expiry(df, resolution)
# Roughly one month of bars at the chosen resolution ...
if resolution == "hour":
    CAP = 30 * 24
elif resolution == "day":
    CAP = 30

# ... but the cap is switched off again here, so the value computed above is unused.
CAP = None
show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP)

In [33]:
# Event study around difficulty adjustments, at either resolution.
resolution = "day"
if resolution == "hour":
    # join on btc.com
    # compile_dataset("Bitstamp_BTCUSD_1h.csv")
    compile_dataset("bytetree_1h_bitcoin.csv")
    df = load_frame()
    
    df_m = add_dif_adj_h(df)
    CAP = DIF_ADJ_BLOCKS
    # Cap switched off: the assignment above is immediately overridden.
    CAP = None
elif resolution == "day":
    compile_dataset("Bitstamp_BTCUSD_d.csv")
    # compile_dataset("bytetree_1d_bitcoin.csv")
    df = load_frame()

    # NOTE(review): add_dif_adj only uses block heights when the frame has a `dif`
    # column; presumably the Bitstamp file has none, so the btc.com CSV join is used.
    df_m = add_dif_adj(df)
    CAP = 14*24
    # Cap switched off: the assignment above is immediately overridden.
    CAP = None

# block adjustment marker (resolution chosen above)
REC_EVENT_COL = "blockadjustment"
# VALUE_COL = "percentchange"
# VALUE_COL = "returns"
VALUE_COL = "value"

show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP)

KeyError: 'dif'

In [None]:
# event based, TODO split into cells
# Manual selector: exactly one branch condition is edited to True; every branch
# sets REC_EVENT_COL, VALUE_COL, df_m and CAP for the show_marker_plot call below.
# NOTE(review): all branches use the `df` left in the kernel by an earlier cell.
if False:
    # funding payouts (add_funding returns a new, merged frame)
    REC_EVENT_COL = "is_funding_payout"
    VALUE_COL = "close"
    df_m = add_funding(df)
    CAP = None
elif False:
    # daily block adjustment
    REC_EVENT_COL = "blockadjustment"
    # VALUE_COL = "percentchange"
    VALUE_COL = "close"
    df_m = add_dif_adj(df)
    CAP = 14
elif False:
    # hourly block adjustment
    REC_EVENT_COL = "blockadjustment"
    # VALUE_COL = "percentchange"
    # VALUE_COL = "returns"
    VALUE_COL = "value"
    df_m = add_dif_adj_h(df)
    # Check this could be hourly/date based or bytree block based
    CAP = DIF_ADJ_BLOCKS
    # The second assignment wins: 14 days of hourly bars.
    CAP = 14*24
elif False:
    # daily cme expiry
    REC_EVENT_COL = "cme_expiry"
    VALUE_COL = "close"
    # VALUE_COL = "percentchange"
    # VALUE_COL = "returns"
    resolution = "day"
    df_m = add_cme_expiry(df, resolution)
    if resolution == "hour":
        CAP = 30 * 24
    elif resolution == "day":
        CAP = 30
elif True:
    # quarterly expiry (currently active branch)
    REC_EVENT_COL = "quarterly_expiry"
    # VALUE_COL = "percentchange"
    # VALUE_COL = "returns"
    VALUE_COL = "close"
    df_m = add_quarterly_expiry(df)
    CAP = 3 * 30

show_marker_plot(df_m, REC_EVENT_COL, VALUE_COL, CAP)

In [None]:
# use markers in other notebooks
# Writes the most recently built marker frame, without its index, to the working directory.
# NOTE(review): the file name is lower-case here while the commented-out read in
# the load cell uses "DATASET.CSV" - confirm which one downstream notebooks expect.
df_m.to_csv("dataset.csv", index=False)

In [None]:
# hourly kill zones
# VALUE_COL = "percentchange"
# VALUE_COL = "returns"
VALUE_COL = "close"
# index_plot replaces seasonal_index_plot, whose definition is disabled (it only
# exists inside a string literal), so calling it raised NameError on a fresh kernel.
index_plot(df, x_col="hour", y_col=VALUE_COL).show()
seasonal_violin(df, x_col="hour", y_col=VALUE_COL).show()

# TODO: K-Means groups, df.T inside of the pivot_table?
# cluster_groups(df, x_col="Hour", y_col=VALUE_COL).show()

In [None]:
# minute resolution kill zones
VALUE_COL = "close"
# Preview only the last rows instead of dumping the whole frame.
print(df.tail(5))
# Time of day as "HH:MM", used as the grouping key.
df["timestamp"] = df["datestamp"].dt.strftime("%H:%M")
# index_plot replaces seasonal_index_plot, whose definition is disabled (it only
# exists inside a string literal), so calling it raised NameError on a fresh kernel.
index_plot(df, x_col="timestamp", y_col=VALUE_COL).show()

In [None]:
# annual seasonal
# Aggregates `value` per `doy` (mean on row 1; p25 / median / p75 on row 2).
# NOTE(review): `doy` is presumably a day-of-year column added by the loading
# pipeline - it is not created anywhere in this notebook.
index_plot(df, x_col="doy", y_col="value").show()

In [None]:
# monthly seasonal
# Aggregates `value` per `dom` (mean on row 1; p25 / median / p75 on row 2).
# NOTE(review): `dom` is presumably a day-of-month column added by the loading
# pipeline - it is not created anywhere in this notebook.
index_plot(df, x_col="dom", y_col="value").show()

In [None]:
# weekday seasonal
# index_plot replaces seasonal_index_plot, whose definition is disabled (it only
# exists inside a string literal), so calling it raised NameError on a fresh kernel.
# index_plot(df, x_col="dayname", y_col="close").show()
# Monday=0, Sunday=6
index_plot(df, x_col="dow", y_col="close").show()
# seasonal_index_plot(df, x_col="dom", y_col="close").show()


In [None]:
# Date Formatting or Changing Tick labels
# https://plotly.com/python/tick-formatting/
# tickmode = 'array', but still tricky for yearly
# plotting one sample year with date ticks and averages on shared xaxis?

# Build the day-of-year figure explicitly instead of relying on whichever `fig`
# an earlier cell left behind.
fig = index_plot(df, x_col="doy", y_col="value")

# Put each label on the first day-of-year of its month. 2020 is used as the
# calendar because doy_to_mmdd uses the same reference year.
month_starts = pd.date_range("2020-01-01", periods=12, freq="MS")
# update_xaxes reaches every subplot's x axis, including the bottom one that
# actually shows the tick labels when the x axis is shared.
fig.update_xaxes(
    tickmode='array',
    tickvals=month_starts.dayofyear.tolist(),
    ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
)
fig.show()