In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
%matplotlib notebook
import tensorflow

In [71]:
# --- Pipeline configuration -------------------------------------------------
# Width of one resampling bucket (pandas offset alias); used by every
# `.resample(INTERVAL)` call in this notebook.
INTERVAL = '10s'

# Look-ahead window used when building the PnL targets: 30 buckets.
HORIZON = 30 * pd.Timedelta(INTERVAL)

# Input template: {type} is 'trades' or 'quotes', {date} is the day to load.
# NOTE(review): absolute, user-specific paths -- consider a configurable base dir.
DATA_PATH = '/Users/felipe/bitcoin/{type}/{date}.csv.gz'

# Output template for the generated training set of a given {date}.
OUTPUT = '/Users/felipe/bitcoin/data/{date}-training.csv'

# Spans (in rows, i.e. INTERVAL buckets once resampled) intended for `ema`.
SPANS = [2, 3, 6, 15, 30, 60]

In [60]:
# Day to process; substituted for {date} in DATA_PATH and OUTPUT.
# (looks like YYYYMMDD -- confirm against the file names on disk)
FOR_DATE = '20190515'

In [66]:
def read_trades(date):
    """Load one day's XBTUSD trades from the compressed CSV dump.

    Parameters
    ----------
    date : str or anything `str()`-able
        Substituted for ``{date}`` in ``DATA_PATH`` (with ``type='trades'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``price``, ``side``, ``size`` indexed by the parsed
        ``timestamp`` column (index renamed to ``time``). Only rows with
        ``symbol == 'XBTUSD'`` and ``price > 1`` are kept, and rows with any
        missing value are dropped.
    """
    source = DATA_PATH.format(type='trades', date=str(date))
    wanted = ['timestamp', 'symbol', 'price', 'side', 'size']
    raw = pd.read_csv(source,
                      usecols=wanted,
                      index_col='timestamp',
                      parse_dates=True,
                      infer_datetime_format=True)
    raw.index.name = 'time'

    # Keep only the instrument of interest and discard obviously bogus prices.
    keep = raw['symbol'].eq('XBTUSD') & raw['price'].gt(1)
    trades = raw[keep].dropna()

    # Selecting the output columns also gets rid of `symbol`.
    return trades[['price', 'side', 'size']]

def read_quotes(date):
    """Load one day's XBTUSD quotes from the compressed CSV dump.

    Parameters
    ----------
    date : str or anything `str()`-able
        Substituted for ``{date}`` in ``DATA_PATH`` (with ``type='quotes'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``bidPrice``, ``askPrice``, ``bidSize``, ``askSize`` indexed
        by the parsed ``timestamp`` column (index renamed to ``time``). Only
        rows with ``symbol == 'XBTUSD'``, both prices above 1 and
        ``bidPrice < askPrice`` are kept; rows with any missing value are
        dropped.
    """
    source = DATA_PATH.format(type='quotes', date=str(date))
    wanted = ['timestamp', 'symbol', 'bidPrice', 'askPrice', 'bidSize', 'askSize']
    raw = pd.read_csv(source,
                      usecols=wanted,
                      index_col='timestamp',
                      parse_dates=True,
                      infer_datetime_format=True)
    raw.index.name = 'time'

    # Right instrument, sane prices, and a book that is not crossed or locked.
    keep = (
        raw['symbol'].eq('XBTUSD')
        & raw['bidPrice'].gt(1)
        & raw['askPrice'].gt(1)
        & raw['bidPrice'].lt(raw['askPrice'])
    )
    quotes = raw[keep].dropna()

    # Selecting the output columns also gets rid of `symbol`.
    return quotes[['bidPrice', 'askPrice', 'bidSize', 'askSize']]

In [67]:
def ema(df, spans, columns):
    """Append exponentially weighted means of `columns` to `df`.

    For every span in `spans` and every column name in `columns`, a column
    named ``'E<span><column>'`` holding ``df[column].ewm(span=span).mean()``
    is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    spans : iterable of numbers
        Spans passed to ``DataFrame.ewm``.
    columns : list of str
        Columns of `df` to smooth.

    Returns
    -------
    pandas.DataFrame
        `df`'s columns followed by the smoothed columns, grouped by span in
        the order given.
    """
    def _smoothed(span):
        renamed = {name: 'E{}{}'.format(span, name) for name in columns}
        return df[columns].ewm(span=span).mean().rename(columns=renamed)

    return pd.concat([df] + [_smoothed(span) for span in spans], axis=1)

In [68]:
# Load the day's XBTUSD trades; read_trades returns columns
# 'price', 'side', 'size' indexed by 'time'.
t = read_trades(FOR_DATE)

In [None]:
# Split traded size into bought / sold volume and total it per INTERVAL bucket.
#
# Rows whose side is 'Sell' count as sold volume, every other row as bought
# volume. The signed-size intermediate mirrors the original computation, but
# uses `Series.clip(lower=..., upper=...)`: `clip_lower` / `clip_upper` were
# removed in pandas 1.0, and calling them with `inplace=True` on a column
# pulled out of the frame was chained assignment.
signed_size = t['size'].where(t['side'] != 'Sell', -t['size'])
t = t.assign(
    boughtSum=signed_size.clip(lower=0),
    soldSum=-signed_size.clip(upper=0),
)[['boughtSum', 'soldSum']]

# `sum` yields 0 for buckets without trades, so no forward-fill is needed.
t = t.resample(INTERVAL).sum()

In [None]:
# Preview the first rows of the per-bucket bought/sold volume frame.
t.head()

In [None]:
# Number of INTERVAL buckets in the resampled trade frame.
len(t)

In [None]:
# Load the day's XBTUSD quotes with the loader defined in this notebook.
# (The previous call used `read_timeseries` / `QUOTES_PATH`, neither of which
# is defined here.) read_quotes already returns exactly the columns
# 'bidPrice', 'askPrice', 'bidSize', 'askSize', indexed by 'time'.
q = read_quotes(FOR_DATE)

In [None]:
def _rolling_ahead(series, horizon, how):
    """Aggregate `series` with `how` ('max' or 'min') over [t, t + horizon) for each row."""
    if series.empty:
        return series.astype(float)
    # pandas time-based rolling windows are trailing: (t - horizon, t].
    # Mirror the time axis (distance to the last timestamp, in reverse row
    # order) so that a trailing window there is a leading window here, then
    # flip the result back onto the original index.
    mirrored = pd.Series(series.values[::-1],
                         index=series.index[-1] - series.index[::-1])
    rolled = getattr(mirrored.rolling(horizon), how)()
    return pd.Series(rolled.values[::-1], index=series.index)


def add_fcst(q, horizon=HORIZON):
    """Add look-ahead PnL target columns to the quote frame `q`, in place.

    For each row at time ``t`` the window ``[t, t + horizon)`` is scanned:

    * ``longPnl``  = max ``bidPrice`` in the window - ``askPrice`` at ``t``
    * ``shortPnl`` = ``bidPrice`` at ``t`` - min ``askPrice`` in the window

    The previous implementation used ``q[col].rolling(horizon)`` directly,
    which looks *backwards* in time, so the "forecast" targets were computed
    from past prices.

    Parameters
    ----------
    q : pandas.DataFrame
        Must have ``bidPrice`` and ``askPrice`` columns and a datetime index
        sorted in increasing order.
    horizon : pandas.Timedelta or offset string
        Length of the look-ahead window.

    Returns
    -------
    pandas.DataFrame
        The same object `q`, with ``longPnl`` and ``shortPnl`` added.
    """
    # `q` may be a slice of another frame; silence the chained-assignment
    # warning for the two column writes below.
    with pd.option_context('mode.chained_assignment', None):
        bidMax = _rolling_ahead(q['bidPrice'], horizon, 'max')
        askMin = _rolling_ahead(q['askPrice'], horizon, 'min')
        q['longPnl'] = bidMax - q['askPrice']
        q['shortPnl'] = q['bidPrice'] - askMin
    return q

# add_fcst adds the columns in place and returns the same frame; binding the
# result keeps the cell from rendering the whole tick-level frame as output.
q = add_fcst(q)

In [None]:
# Price increment used to express the bid/ask spread in ticks.
TICK_SIZE = 0.5

# Add the spread, average every column per INTERVAL bucket, carry the last
# value into buckets without quotes, and tag the columns with an 'Avg' suffix.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`; `add_suffix`
# replaces the in-place rename.
q = (
    q.assign(spread=(q['askPrice'] - q['bidPrice']) / TICK_SIZE)
     .resample(INTERVAL)
     .mean()
     .ffill()
     .add_suffix('Avg')
)

In [None]:
# Preview the first rows of the resampled quote features.
q.head()

In [None]:
# Number of INTERVAL buckets in the resampled quote frame.
len(q)

In [None]:
# Target columns (what the model should predict) ...
Y_cols = ['longPnlAvg', 'shortPnlAvg']
# ... and the raw per-bucket features.
X_cols = [
    'boughtSum', 'soldSum',
    'bidPriceAvg', 'askPriceAvg',
    'bidSizeAvg', 'askSizeAvg',
    'spreadAvg',
]

# Join trades and quotes side by side on the time index, targets first.
df = pd.concat([t, q], axis='columns')[Y_cols + X_cols]

In [None]:
# Row count after joining trades and quotes on the time index.
len(df)

In [None]:
# Reuse the spans from the configuration cell instead of repeating the
# literal list, so the two cannot drift apart.
spans = list(SPANS)

In [None]:
# Append an 'E<span><column>' exponentially weighted mean for every feature
# column and every span.
# NOTE(review): rebinding `df` makes this cell non-idempotent -- running it
# twice appends the same column names again.
df = ema(df, spans, X_cols)

In [None]:
# Preview the first rows of the frame with the EMA columns added.
df.head()

In [72]:
# Write the training set for FOR_DATE to the configured output location.
output_path = OUTPUT.format(date=FOR_DATE)
df.to_csv(output_path)

NameError: name 'df' is not defined

In [None]:
# Row count of the final frame.
len(df)