In [1]:
from __future__ import division
from more_itertools import peekable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import pickle
import magenta
import os, time, re, json
%matplotlib inline
# `display` and `HTML` belong to the public IPython.display API; importing them
# from IPython.core.display is deprecated, so use the public module instead.
from IPython.display import display, HTML
### change width of notebook display
# Injects a <style> rule into the page so elements with class "container" take 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# for exposing API
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from flask import Flask
from flask_restful import Resource, Api

# Default location of the pickle file used to share values between notebooks.
JUPYTER_PICKLE_FILE = "config/shared_jupyter_data.pkl"
def write_shared_jupyter(key, value, path=JUPYTER_PICKLE_FILE, overwrite=False):
    """Store ``value`` under ``key`` in the pickled dict kept at ``path``.

    Args:
        key: dictionary key to write.
        value: any picklable object.
        path: pickle file holding the shared dict (created if missing,
            together with its parent directory).
        overwrite: when True, the stored dict is replaced by ``{key: value}``;
            when False, ``key`` is added to (or updated in) the existing dict.

    Returns:
        None. The file at ``path`` is rewritten as a side effect.
    """
    shared_jupyter_data = {}
    # Only read the existing store when its contents will be kept. With
    # overwrite=True the old dict is discarded anyway, so a truncated or
    # otherwise unreadable file must not block replacing it.
    if not overwrite and os.path.exists(path):
        # NOTE: pickle.load can execute arbitrary code; only point `path`
        # at files written by this helper.
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
    shared_jupyter_data[key] = value

    # open(..., 'wb') does not create missing directories (e.g. "config/").
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as fp:
        pickle.dump(shared_jupyter_data, fp)

def read_shared_jupyter(key=None, path=JUPYTER_PICKLE_FILE):
    """Read the pickled dict at ``path``, or a single entry of it.

    Args:
        key: entry to look up; ``None`` returns the whole stored object.
        path: pickle file to read.

    Returns:
        The stored object (``key is None``), the value stored under ``key``,
        or ``None`` when the file or the key is missing (a short notice is
        printed in both of those cases).
    """
    if not os.path.exists(path):
        print("No data")
        return None

    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(path, "rb") as handle:
        stored = pickle.load(handle)

    if key is None:
        return stored
    if key not in stored:
        print("Not found!")
        return None
    return stored[key]

def pandasToJson(df):
    """Serialize ``df`` to a JSON string in pandas' "split" orientation."""
    payload = df.to_json(orient="split")
    return payload
def jsonToPandas(json):
    """Rebuild a DataFrame from JSON in pandas' "split" orientation.

    Args:
        json: a JSON document as a string, or anything else ``pd.read_json``
            accepts (path, URL, file-like object). The parameter name shadows
            the ``json`` module inside this function; it is kept for callers.

    Returns:
        pandas.DataFrame parsed with ``orient="split"``.
    """
    from io import StringIO  # function-scope import; the module name `json` is shadowed here

    source = json
    # Passing a literal JSON string to read_json is deprecated in newer pandas;
    # wrap it in a file-like object. Strings that do not look like a JSON
    # document (e.g. a file path) are passed through untouched.
    if isinstance(json, str) and json.lstrip()[:1] in ("{", "["):
        source = StringIO(json)
    return pd.read_json(source, orient="split")

# Let pandas render up to 500 rows / 500 columns and use a 1000-character line width.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Figure dimensions.
FIG_WIDTH, FIG_HEIGHT = 1200, 800

# Value bounds; PITCH_MIN / PITCH_MAX are used as the y-axis range of the pitch plot.
PITCH_MIN, PITCH_MAX = 20, 120
VELOCITY_MIN, VELOCITY_MAX = 0, 120

def hheader(x):
    """Print ``x`` as a banner: a rule of '#', then '### <x>', then the rule again."""
    rule = "#########################################"
    banner_lines = [rule, "### {}".format(x), rule]
    print("\n".join(banner_lines))

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


# Read in music stream and make decisions

Each agent:
- Read in music stream
- Update predictive model
- Probabilistic decision rule: create order and submit

In [2]:
""" Each agent reads in music stream and responds
    (start with a single agent)
"""

""" Kalman filter parameters """

from scipy.stats import multivariate_normal
from statsmodels.tsa.stattools import acovf
from scipy.ndimage.interpolation import shift
# Seed NumPy's global RNG so the np.random.gamma draws for Qt and Rt below are repeatable.
np.random.seed(42)
# suppress=True: print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True) 

def pandas_fill(arr):
    """Forward-fill NaNs along each row of a 2-D array-like.

    Every NaN is replaced by the closest non-NaN value to its left in the
    same row; NaNs at the start of a row have nothing to copy from and stay NaN.

    Args:
        arr: 2-D array-like accepted by ``pd.DataFrame``.

    Returns:
        numpy.ndarray of the same shape with the row-wise forward fill applied.
    """
    # DataFrame.ffill is the supported spelling; fillna(method='ffill') is
    # deprecated in recent pandas. The rest of this notebook already uses .ffill().
    return pd.DataFrame(arr).ffill(axis=1).to_numpy()

# Noise matrices are fixed once here (can be improved later).

# Observation containers; they are filled in by the streaming cell further down.
# Note that the observation x has a different dimension from the state z.
df_X = x_obs = x_obs_nonna = None
n = d = None

# number of lags (minus one)
L = 14

### Latent state equation - vector z_t
# z_t = A_t z_{t-1} + <other regressors> + w_t
# A_t is the (L x L) identity, i.e. the state follows a random walk.
At = np.identity(L)

### Observation equation - scalar x_t
# x_t = C_t z_t + <other regressors> + v_t
# C_t (1 x L) holds the lagged observations; it is assigned at the end of this block.

# Simulated, plausible noise levels (Rt could be made self-updating later).
# Qt: (L x L) diagonal state-noise covariance - TUNE THIS.
# Rt: (1 x 1) observation-noise variance.
# The two draws are made in this order so the random stream is consumed identically.
Qt = np.diag(np.random.gamma(shape=1, scale=0.1, size=L))
Rt = np.random.gamma(shape=3, scale=4, size=(1, 1))

# Initial state: equal weight on every lag. The state vector holds the
# WEIGHTS on the lags, not the lagged values themselves.
z0 = np.full((L, 1), 1 / L)
P0 = Qt

# Storage for filtered states and predictions (allocated later).
Z = Zpred = Xpred = None

# Running estimates: z is the current state estimate, P its (L x L) covariance.
z, P = z0, P0
Ct = Ct0 = None

""" Iteration parameters """

from collections import OrderedDict
import requests

# --- Iteration settings -----------------------------------------------------
# Pause (seconds) taken at the end of each pass of the streaming loop.
STREAM_SLEEP_SEC = 1

# --- Agent settings ---------------------------------------------------------
WEALTH = 10000

# --- Plotly / Dash settings (keep it simple: refresh on every tick) ---------
DASH_PORT = 8122
PLOTLY_REFRESH_SEC = 0.5
PLOTLY_STREAM_SLEEP_SEC = 5 * PLOTLY_REFRESH_SEC

### Dash app (backed by an explicit Flask server) for visualizing the data so far
server = Flask(import_name='my_app')
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = JupyterDash(
    name=__name__,
    server=server,
    external_stylesheets=external_stylesheets,
)

### Sketch of a REST endpoint, kept for sending trade signals later:
#     api = Api(server)
#     class vizStreamSoFar(Resource):
#         """ Get current snapshot of the music stream.
#             $ curl "http://localhost:8122/read"
#         """
#         def get(self):
#             return({"currTime": currTime, "lastBar": pandasToJson(lastBar)})
#     api.add_resource(vizStreamSoFar, '/read')

# Page layout: a heading, the pitch graph, and a timer component whose
# n_intervals property drives the graph callback. PLOTLY_REFRESH_SEC is
# multiplied by 1000 to express the refresh period in milliseconds.
app.layout = html.Div(
    children=[
        html.H1(children="Agent sample statistics"),
        dcc.Graph(id='pitch_graph'),
        dcc.Interval(
            id='interval-component',
            interval=PLOTLY_REFRESH_SEC * 1000,
            n_intervals=0,
        ),
    ],
    style={'width': '80%', 'float': 'left', 'height': '4.5rem'},
)
        
@app.callback(Output('pitch_graph', 'figure'), Input("interval-component", "n_intervals"))
def update_pitch_figure(n=0):
    """
    Rebuild the pitch figure on every tick of the interval component.

    Reads the module-level ``df_preds`` DataFrame (columns
    ``streaming_start_sec``, ``raw``, ``preds``) that the streaming loop
    rebinds after each pass, and plots observed vs. predicted values.

    Args:
        n: tick counter supplied by the interval component (unused).

    Returns:
        A plotly figure, or an empty placeholder figure while ``df_preds``
        has not been produced yet.
    """
    # The interval starts firing as soon as the app is served, but df_preds is
    # only created once the streaming loop has finished its warm-up. Look the
    # name up defensively instead of raising NameError on every tick.
    preds = globals().get("df_preds")
    if preds is None or len(preds) == 0:
        return {"data": [], "layout": {"title": {"text": "Waiting for predictions ..."}}}

    fig = px.line(preds, x="streaming_start_sec",
                  y=['raw', "preds"],
                  render_mode="webgl", template="plotly_dark",
                  title="Observed pitch statistics", range_y=[PITCH_MIN, PITCH_MAX])
    fig.update_traces(mode='markers+lines')
    return fig

### Run app locally (inline cuts off output)
# Started with mode='external' on DASH_PORT; the recorded output of this cell
# reports the app at http://127.0.0.1:8122/.
app.run_server(mode='external', port=DASH_PORT)

Dash is running on http://127.0.0.1:8122/

Dash app running on http://127.0.0.1:8122/


In [None]:
# Iteration = whether to ping for more data
# extra level of sampling on top of the raw music stream (which itself is sampled)
iterations = 1
assetDataSoFar = OrderedDict()  # streaming_start_sec -> one-row DataFrame, in arrival order
assetDataSoFar_df = None # always-concatenated copy of all historical data
modelInitialized = False
MAX_ARRAY_LEN = 10**4           # rows allocated at a time for the result arrays
WARMUP_ITERATIONS = 10          # polls to collect before any filtering is attempted
ASSET_STREAM_URL = "http://localhost:8100/read"
REQUEST_TIMEOUT_SEC = 10        # do not block forever if the stream server stalls

# Index (into assetDataSoFar_df) of the next row to run through the filter.
# Rows are only ever appended, so a single moving pointer guarantees that each
# observation is filtered exactly once, even when the same bar is returned by
# several consecutive polls. Filtering starts at L because every step needs
# the L preceding observations as regressors.
nextFilterIx = L

while True:
    print("Iteration {} ...".format(iterations))

    ### Read in music stream (assume perpetual)
    currAssetData = requests.get(ASSET_STREAM_URL, timeout=REQUEST_TIMEOUT_SEC)
    if (currAssetData.status_code != 200):
        raise Exception("GET request failed.")
    currAsset = currAssetData.json()
    currAssetTime = currAsset['currTime']
    currAssetBar = jsonToPandas(currAsset['lastBar'])

    ### add to historical data if hasn't been seen yet
    for sx in range(currAssetBar.shape[0]):
        currAssetObs = currAssetBar.iloc[[sx]]
        currStreamingIx = currAssetObs.streaming_start_sec.values[0]
        if (currStreamingIx not in assetDataSoFar):
            assetDataSoFar[currStreamingIx] = currAssetObs
        else:
            # this row was already stored by an earlier poll
            print("Observation stored")

    ### concatenate all historical data so far
    assetDataSoFar_df = pd.concat(assetDataSoFar.values())

    ### Need enough data, otherwise won't have enough
    ### to use lags (for Kalman model) or train historically (generally).
    ### Besides the poll count, require more than L rows: a filter step at
    ### index i needs rows i-L .. i-1 as its lag window.
    if (iterations < WARMUP_ITERATIONS) or (assetDataSoFar_df.shape[0] <= L):
        iterations += 1
        time.sleep(STREAM_SLEEP_SEC)  # pace the warm-up polls like the regular ones
        continue

    """ MODEL TRAINING AND ANALYSIS #######################################################
    """

    # observations x: note, different dimension from Z
    df_X = assetDataSoFar_df[['pitch_mean']]
    x_obs = np.c_[df_X]
    [n,d] = x_obs.shape
    # forward-filled copy of the observations, used to build the lag windows
    x_obs_nonna = pd.DataFrame(x_obs.copy()).ffill()[0].to_numpy().reshape(-1, 1)

    if not modelInitialized:
        print(">>> initializing model and training on historical data until current ...")
        # Start from the prior defined with the Kalman parameters so that
        # re-running this cell does not continue from a stale state.
        z, P = z0, P0
        nextFilterIx = L
        allocRows = max(MAX_ARRAY_LEN, n)
        # Objects to store predictions and filtering locations
        Z = np.zeros((allocRows, L))
        Zpred = np.zeros((allocRows, L))
        # NaN (not 0) marks rows that never received a prediction, so the
        # first L rows show up as gaps in the plot rather than as zeros.
        Xpred = np.full((allocRows, 1), np.nan)
        Ct0 = x_obs_nonna[0:(L)].T
        modelInitialized = True
    elif n > Xpred.shape[0]:
        # The stream is perpetual: grow the result arrays instead of
        # running off their end.
        extraRows = max(MAX_ARRAY_LEN, n - Xpred.shape[0])
        Z = np.vstack([Z, np.zeros((extraRows, L))])
        Zpred = np.vstack([Zpred, np.zeros((extraRows, L))])
        Xpred = np.vstack([Xpred, np.full((extraRows, 1), np.nan)])

    """ Train on every row not yet filtered (historical backlog on the first pass,
        newly arrived rows afterwards) """
    print(">>> updating model on current bar of data ...")
    for i in range(nextFilterIx, n):
        ### Get current observation x_t
        # https://stats.stackexchange.com/questions/140990/using-kalman-filters-to-impute-missing-values-in-time-series
        x = np.array(x_obs[i])
        missing = np.isnan(x).any()

        ### embed ARIMA within the Kalman filter:
        ### the observation matrix for step i is the window of the L
        ### preceding observations. It is built BEFORE the prediction so that
        ### prediction and update use the same regressors.
        Ct = x_obs_nonna[(i-L):(i)].T

        ### Prediction step ---------------------------------------------------
        # zhat, t|t-1
        z = At.dot(z)
        # Phat, t|t-1
        P = At.dot(P).dot(At.T) + Qt
        Zpred[i,:] = z.T
        xhat = Ct.dot(z)
        Xpred[i,:] = xhat

        ### Measurement update ------------------------------------------------
        # Skip the update when the observation is missing, or when the lag
        # window still contains NaNs (leading NaNs cannot be forward-filled);
        # updating with NaNs would turn the whole state into NaN for good.
        if missing or np.isnan(Ct).any():
            Z[i,:] = np.nan
            continue

        # Kalman gain Kt
        S = Ct.dot(P).dot(Ct.T) + Rt
        Kt = P.dot(Ct.T).dot(np.linalg.inv(S))
        # zhat, t|t
        z = z + Kt.dot(x - xhat)
        Z[i,:] = z.T
        # P, t|t
        P = P - Kt.dot(Ct).dot(P)
    nextFilterIx = n

    ### Update dataframe of predictions (read by the Dash callback)
    df_preds = pd.DataFrame({
        'streaming_start_sec': assetDataSoFar_df['streaming_start_sec'].to_numpy(),
        'raw': assetDataSoFar_df['pitch_mean'].to_numpy(),
        'preds': Xpred[:n, 0],
    })

    ### Take a short break between analyses (so plotly can catch up)
    ### should be >> plot auto-update interval so that all plots
    ### update basically at the same time.
    time.sleep(STREAM_SLEEP_SEC) # 1 second is comfortable for nice UI
    iterations += 1

Iteration 1 ...
Iteration 2 ...
Iteration 3 ...
Iteration 4 ...
Iteration 5 ...
Observation stored
Observation stored
Observation stored
Observation stored
Observation stored
Iteration 6 ...
Iteration 7 ...
Iteration 8 ...
Iteration 9 ...
Iteration 10 ...
Observation stored
Observation stored
Observation stored
Observation stored
Observation stored
>>> initializing model and training on historical data until current ...
>>> updating model on current bar of data ...
Iteration 11 ...
>>> updating model on current bar of data ...
Iteration 12 ...
>>> updating model on current bar of data ...
Iteration 13 ...
>>> updating model on current bar of data ...
Iteration 14 ...
>>> updating model on current bar of data ...
Iteration 15 ...
>>> updating model on current bar of data ...
Iteration 16 ...
>>> updating model on current bar of data ...
Iteration 17 ...
>>> updating model on current bar of data ...
Iteration 18 ...
>>> updating model on current bar of data ...
Iteration 19 ...
>>> updat

In [None]:
# NOTE(review): raises unconditionally with no message. Presumably a deliberate
# barrier so that "Run All" stops before the offline-analysis cells below — confirm.
raise Exception()

In [None]:
# # t=timerange # start from when have enough lags
# # t=range(0, n-1) # plot full time range
# t = range(0, assetDataSoFar_df.shape[0])

# ### Plot raw vs. predictions (on separate plots)
# df_plt = pd.DataFrame({
#     "time": t,
#     "raw": x_obs[t, 0],
#     "pred": Xpred[t, 0]
# #     "predicted": Zpred[timerange,0],
# #     "filtered": Z[timerange, 0]
# })
# df_plt.head()
# fig1 = px.line(df_plt, x="time", y=["raw"], template="plotly_dark",range_y=[0, PITCH_MAX])
# fig2 = px.line(df_plt, x="time", y=['pred'], template="plotly_dark",range_y=[0, PITCH_MAX])
# fig3 = px.line(df_plt, x="time", y=["raw", 'pred'], template="plotly_dark",range_y=[0, PITCH_MAX])

# ### Plot prediction errors
# df_plt = pd.DataFrame({
#     "time": t,
#     "error": (x_obs[t, 0] - Xpred[t, 0]),
#     "error_sqr": (x_obs[t,0] - Xpred[t, 0])**2
# })
# df_plt.head()
# fig4 = px.line(df_plt, x="time", y=["error"], template="plotly_dark")

# ### Plot Kalman-estimated coefficient paths (coefs on lags)
# df_coefs = pd.DataFrame(Zpred)
# df_coefs.columns = ["l{}".format(l) for l in range(df_coefs.shape[1])]
# df_coefs = df_coefs.reset_index()

# fig5 = px.line(df_coefs.loc[df_coefs['index'].isin(t),:], x="index",
#                y=[c for c in df_coefs.columns if c != "index"], template="plotly_dark")

# print("Error stats for {} lags: ")
# print("MSE: {}\n\n".format(df_plt[['error_sqr']].mean()))
# print(df_plt[['error']].describe())
# # display(fig1)
# # display(fig2)
# display(fig3)
# display(fig4)
# display(fig5)

# Offline results: analysis on the whole time series

In [None]:
# Rebuild the combined frame from everything collected so far, then show it
# together with the most recently fetched bar.
assetDataSoFar_df = pd.concat(list(assetDataSoFar.values()))
display(assetDataSoFar_df, currAssetBar)

In [None]:
# Plot the time series of interest: pitch_mean against streaming_start_sec.
import seaborn as sns
sns.lineplot(x="streaming_start_sec", y="pitch_mean", data=assetDataSoFar_df)

In [None]:
"""
OFFLINE NEED TO MOVE ONLINE

Find structural breaks: Kalman filter + CUSUM or some other online test
Inspo: file:///C:/Users/echow/AppData/Local/Temp/remotesensing-12-03135-v2.pdf
"""
from scipy.stats import multivariate_normal
from statsmodels.tsa.stattools import acovf
from scipy.ndimage.interpolation import shift
# Re-seed NumPy's global RNG so the np.random.gamma draws for Qt and Rt in this cell are repeatable.
np.random.seed(42)

# suppress=True: print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True) 

def pandas_fill(arr):
    """Forward-fill NaNs along each row of a 2-D array-like.

    Every NaN is replaced by the closest non-NaN value to its left in the
    same row; NaNs at the start of a row have nothing to copy from and stay NaN.
    (An identically named helper is also defined in the earlier Kalman
    parameter cell; this definition replaces it when the cell runs.)

    Args:
        arr: 2-D array-like accepted by ``pd.DataFrame``.

    Returns:
        numpy.ndarray of the same shape with the row-wise forward fill applied.
    """
    # DataFrame.ffill is the supported spelling; fillna(method='ffill') is
    # deprecated in recent pandas.
    return pd.DataFrame(arr).ffill(axis=1).to_numpy()

# Noise matrices are fixed once here (can be improved later).

# Observations x (different dimension from the state z): the pitch_mean
# column of everything collected so far, as an (n x d) array.
df_X = assetDataSoFar_df[['pitch_mean']]
x_obs = np.c_[df_X]
n, d = x_obs.shape

# Forward-filled copy of the observations, reshaped to a single column.
x_obs_nonna = pd.DataFrame(x_obs.copy())[0].ffill().to_numpy().reshape(-1, 1)

# number of lags (minus one)
L = 14

### Latent state equation - vector z_t
# z_t = A_t z_{t-1} + <other regressors> + w_t
# A_t is the (L x L) identity, i.e. the state follows a random walk.
At = np.identity(L)

### Observation equation - scalar x_t
# x_t = C_t z_t + <other regressors> + v_t
# (1 x L) placeholder for the ARIMA lags - WILL BE OVERRIDDEN by the filter.
Ct = np.ones((1, L))

# Earlier idea, kept for reference: tune Qt with the historical covariance of the lags.
# df_x_obs_nonna_lags = []
# for l in range(0, L):
#     curr = shift(x_obs_nonna.reshape(-1,), l, cval=np.nan)
#     df_curr = pd.DataFrame(curr, columns=["l{}".format(l)])
#     df_x_obs_nonna_lags.append(df_curr)
# df_x_obs_nonna_lags = pd.concat(df_x_obs_nonna_lags, axis=1).dropna()
# cov_mat_lags = df_x_obs_nonna_lags.cov()
# Qt = cov_mat_lags

# Simulated, plausible noise levels (Rt could be made self-updating later).
# Qt: (L x L) diagonal state-noise covariance (diagonal, hence symmetric) - TUNE THIS.
# Rt: (1 x 1) observation-noise variance.
# The two draws are made in this order so the random stream is consumed identically.
Qt = np.diag(np.random.gamma(shape=1, scale=0.1, size=L))
Rt = np.random.gamma(shape=3, scale=4, size=(1, 1))
# At, Qt, Ct, Rt

In [None]:
# Initial conditions (for the state variable z, stored in mu vars)
# The state vector holds the WEIGHTS on the lags, NOT the lags themselves;
# start with equal weight on every lag.
z0 = np.repeat(1/L, L).reshape(-1, 1)
P0 = Qt

# Objects to store predictions and filtering locations
Z = np.zeros((n,L))      # filtered state  z_{t|t}
Zpred = np.zeros((n,L))  # predicted state z_{t|t-1}
Xpred = np.zeros((n,1))  # one-step-ahead prediction of the scalar observation

# running estimates
z = z0 # state estimate, (L x 1)
P = P0 # covariance of the state estimate, (L x L)
Ct0 = x_obs_nonna[0:(L)].T
# fill forward if NAs at the start (edge case); the check must look at the
# window that was just built (Ct0), not at the Ct left over from a previous cell
if np.isnan(Ct0).any():
    Ct0 = pandas_fill(Ct0)
Ct = Ct0

# Data starts at 0, so the first row with L lags available is row L; run
# through the last row (index n-1) so that no observation is left unfiltered.
timerange = range(L, n)
for i in timerange:

    ### Get current observation x_t
    # if missing value NA (resulting in missing Kalman predictions) then
    # just run the prediction step and continue
    # https://stats.stackexchange.com/questions/140990/using-kalman-filters-to-impute-missing-values-in-time-series
    x = np.array(x_obs[i])
    missing = np.isnan(x).any()

    ### embed ARIMA within the Kalman filter:
    ### the observation matrix for step i is the window of the L preceding
    ### observations. Build it BEFORE predicting so that the prediction and
    ### the update below use the same regressors.
    Ct = x_obs_nonna[(i-L):(i)].T

    ### Prediction step using previous data against new data ---------------------------------------
    ### (can update variables directly bc don't require t-1|t-1 thereafter)
    # zhat, t|t-1
    z = At.dot(z)
    # Phat, t|t-1
    P = At.dot(P).dot(At.T) + Qt
    Zpred[i,:] = z.T
    xhat = Ct.dot(z)
    Xpred[i,:] = xhat

    if not missing:
        # .item() extracts the scalar from the one-element arrays
        print("Predicted: {:5f}. Actual: {:5f}. (Error: {:5f})".format(
            xhat.item(), x.item(), (x - xhat).item()))
    else:
        print("... NaN value in x data, this is normal. Continue.")

    ### Implement CUSUM for change point detection - later
    ### (use low p-value and update every time)

    ### Measurement update incorporating new data ---------------------------------------------------

    # handle missing: keep the predicted state, record NaN as the filtered one
    if (missing):
        Z[i,:] = np.nan
        continue

    ### Calculate Kalman gain
    # Kalman gain Kt
    S = Ct.dot(P).dot(Ct.T) + Rt
    Kt = P.dot(Ct.T).dot( np.linalg.inv(S))

    ### Measurement update step
    # update zhat, t|t
    z = z + Kt.dot(x - xhat)
    Z[i,:] = z.T
    # update P, t|t
    P = P - Kt.dot(Ct).dot(P)

In [None]:
### Plot predictions
t=timerange # start from when have enough lags
# t=range(0, n-1) # plot full time range

# observed series vs. one-step-ahead prediction
df_plt = pd.DataFrame({
    "time": t,
    "raw": x_obs[t, 0],
    "pred": Xpred[t, 0]
#     "predicted": Zpred[timerange,0],
#     "filtered": Z[timerange, 0]
})
fig1 = px.line(df_plt, x="time", y=["raw", "pred"], template="plotly_dark")

### Plot prediction errors
# df_plt is rebound here; the statistics printed below refer to this error frame
df_plt = pd.DataFrame({
    "time": t,
    "error": (x_obs[t, 0] - Xpred[t, 0]),
    "error_sqr": (x_obs[t,0] - Xpred[t, 0])**2
})
fig2 = px.line(df_plt, x="time", y=["error"], template="plotly_dark")

### Plot Kalman-estimated coefficient paths (coefs on lags)
df_coefs = pd.DataFrame(Zpred)
df_coefs.columns = ["l{}".format(l) for l in range(df_coefs.shape[1])]
df_coefs = df_coefs.reset_index()

fig3 = px.line(df_coefs, x="index",
               y=[c for c in df_coefs.columns if c != "index"], template="plotly_dark")

# fill the placeholder with the number of lags
print("Error stats for {} lags: ".format(L))
print("MSE: {}\n\n".format(df_plt[['error_sqr']].mean()))
print(df_plt[['error']].describe())
display(fig1)
display(fig2)
display(fig3)

In [None]:
"""
start with one single time series - e.g. trading pitch_mean
build Bayesian forecasting model based on that pitch_mean
build trading rules on top of that, e.g. if think will go up with high confidence or rather rapidly, then submit buy signal
"""