In [1]:
#!/usr/bin/env python3
import pandas as pd
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
from glob import glob
from plumbum.cmd import rm
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import TimeDistributed
from keras.models import Sequential
from keras import regularizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

  (fname, cnt))
  (fname, cnt))
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def plotline(data):
    plt.figure()
    plt.plot(data)
    plt.legend()
    plt.show()

def event_count(time_series, data_name):
    time_series = time_series[['Fill Price (USD)']].values
    upevents = 0
    downevents = 0
    sameprice = 0
    prev_obv = time_series[0]
    for obv in time_series[1:]:
        if obv > prev_obv:
            upevents += 1
        elif obv < prev_obv:
            downevents += 1
        elif obv == prev_obv:
            sameprice += 1
        prev_obv = obv
    print('=== Event counts on %s ===' % data_name)
    print('upevents')
    print(upevents)
    print('downevents')
    print(downevents)
    print('sameprice')
    print(sameprice)
    print()

def mse(time_series, data_name):
    time_series = time_series[['Fill Price (USD)']].values
    total_squared_error = 0
    total_absolute_error = 0
    prev_obv = time_series[0]
    for obv in time_series[1:]:
        total_squared_error += (obv - prev_obv)**2
        total_absolute_error += abs(obv - prev_obv)
        prev_obv = obv
    num_predictions = len(time_series) - 1
    mean_squared_error = total_squared_error / num_predictions
    mean_absolute_error = total_absolute_error / num_predictions
    root_mean_squared_error = np.sqrt(mean_squared_error)
    print('=== baseline on %s ===' % data_name)
    print('total squared error')
    print(total_squared_error)
    print('total absolute error')
    print(total_absolute_error)
    print('mean squared error')
    print(mean_squared_error)
    print('mean absolute error')
    print(mean_absolute_error) 
    print('root mean squared error')
    print(root_mean_squared_error) 
    print()

In [3]:
def show_summary_statistics():
    #event_count(small_set, 'small')
    train_set = df.iloc[0:num_samples_training]
    dev_set = df.iloc[num_samples_training:num_samples_training+num_samples_dev]
    test_set = df.iloc[num_samples_training+num_samples_dev:]
    event_count(train_set, 'train')
    event_count(dev_set, 'dev')
    event_count(test_set, 'test')
    mse(train_set, 'train')
    mse(dev_set, 'dev')
    mse(test_set, 'test')
#show_summary_statistics()

In [4]:
def preprocess(data):
    values = np.array(data)
    values = values.reshape(-1,1)
    values = values.astype('float32') 
    return values

In [5]:
def plot_losses(model_history, title):
    plt.figure()
    plt.plot(model_history.history['loss'], label='Train')
    plt.plot(model_history.history['val_loss'], label='Dev')
    plt.xlabel('Epochs'); plt.ylabel('Loss (mse)')
    plt.title(title)
    plt.legend(); plt.show()

In [6]:
def inverse_transform_pricescaler(data, Y_prevrawprice, fitted_scaler):
    return fitted_scaler.inverse_transform(preprocess(data))

def inverse_transform_percentdiff(data, Y_prevrawprice, fitted_scaler=None):
    orig_prices = Y_prevrawprice
    change = orig_prices * data
    return orig_prices + change
    #return fitted_scaler.inverse_transform(preprocess(data))

#print(Y_test_prevrawprice)
#print(inverse_transform_percentdiff(Y_test, Y_test_prevrawprice))

inverse_transform = inverse_transform_percentdiff

In [7]:
def plot_predictions(model, X_test, Y_test, Y_prevrawprice, title, inverse=False, scaler=None):
    y_hat = model.predict(X_test)

    if inverse:
        y_hat = inverse_transform(y_hat, Y_prevrawprice, scaler)
        Y_test = inverse_transform(Y_test, Y_prevrawprice, scaler)

    plt.plot(y_hat, label='Predicted')
    plt.plot(Y_test, label='True')
    plt.xlabel('Time'); 

    if inverse:
        plt.ylabel('Price')
    else:
        plt.ylabel('RESCALED Price')

    plt.title(title)
    plt.legend(); plt.show()

In [8]:
def calculate_MSE_RMSE(model, scaler, X_test, Y_test, Y_prevrawprice, model_name):
    y_hat = model.predict(X_test)
    y_hat_inverse = inverse_transform(y_hat, Y_prevrawprice, scaler)
    Y_test_inverse = inverse_transform(Y_test, Y_prevrawprice, scaler)
    mse = mean_squared_error(Y_test_inverse, y_hat_inverse)
    rmse = np.sqrt(mean_squared_error(Y_test_inverse, y_hat_inverse))
    print('%s:' % model_name)
    print('Test MSE: %.3f' % mse)
    print('Test RMSE: %.3f' % rmse)
    print()

In [9]:
def train_evaluate(model, model_name, 
                   X_train, Y_train, Y_train_prevrawprice, X_dev, Y_dev, Y_dev_prevrawprice, X_test, Y_test, Y_test_prevrawprice,
                   lag=10, batch_size=100, epochs=10, verbose=1):

    # Train model
    history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                      validation_split=0.05, verbose=verbose, shuffle=False)
    #train_evaluate_showresults(history, model, model_name, 
    #                 X_train, Y_train, X_dev, Y_dev, X_test, Y_test,
    #                 lag, batch_size, epochs, verbose)
    return history

In [10]:
def train_evaluate_showresults(history, model, model_name, 
                   X_train, Y_train, Y_train_prevrawprice, X_dev, Y_dev, Y_dev_prevrawprice, X_test, Y_test, Y_test_prevrawprice,
                   lag=10, batch_size=100, epochs=10, verbose=1):
    # Plot losses, predictions, and calculate MSE and RMSE
    plot_losses(history, 'Loss\n(%s)' % model_name)
    plot_predictions(model, X_dev, Y_dev, Y_dev_prevrawprice, 'Test Predictions\n(%s)' % model_name)
    plot_predictions(model, X_dev, Y_dev, Y_dev_prevrawprice, 'Test Predictions\n(%s)' % model_name, inverse=True, scaler=price_scaler)
    calculate_MSE_RMSE(model, price_scaler, X_dev, Y_dev, Y_dev_prevrawprice, '%s' % model_name)

In [11]:
def evaluate_test(model, model_name, 
                   X_train, Y_train, Y_train_prevrawprice, X_dev, Y_dev, Y_dev_prevrawprice, X_test, Y_test, Y_test_prevrawprice,
                   lag=10, batch_size=100, epochs=10, verbose=1):
    # Plot losses, predictions, and calculate MSE and RMSE
    #plot_losses(history, 'Loss\n(%s)' % model_name)
    plot_predictions(model, X_test, Y_test, Y_test_prevrawprice, 'Test Predictions\n(%s)' % model_name)
    plot_predictions(model, X_test, Y_test, Y_test_prevrawprice, 'Test Predictions\n(%s)' % model_name, inverse=True, scaler=price_scaler)
    calculate_MSE_RMSE(model, price_scaler, X_test, Y_test, Y_test_prevrawprice, '%s' % model_name)

In [12]:
def initialize_model(X_train, loss, optimizer, num_LSTMs, num_units, dropout):
    
    LSTM_input_shape = [X_train.shape[1], X_train.shape[2]]
    print('input shape is')
    print(LSTM_input_shape)

    # DEFINE MODEL
    model = Sequential()

    if num_LSTMs == 2:
            model.add(LSTM(num_units[0], input_shape=LSTM_input_shape, return_sequences=True))
            model.add(Dropout(dropout))

            model.add(LSTM(num_units[1], return_sequences=True))
        
    if num_LSTMs == 3:
            model.add(LSTM(num_units[0], input_shape=LSTM_input_shape, return_sequences=True))
            model.add(Dropout(dropout))

            model.add(LSTM(num_units[1], return_sequences=True))
            model.add(Dropout(dropout))
            
            model.add(LSTM(num_units[2], return_sequences=True))

    model.add(TimeDistributed(Dense(1)))
    model.add(Activation('linear'))

    
    model.compile(loss=loss, optimizer=optimizer)
    
    return model

In [13]:
import os.path

def load_data():
    if not os.path.isfile('cboe/parquet_preprocessed_subset_only_BTCUSD_merged.parquet'):
        files = sorted(glob('cboe/parquet_preprocessed_BTCUSD_merged/*.parquet'))
        all_dataframes = []
        for file in files:
            print(file)
            df = pq.read_table(file).to_pandas()
            all_dataframes.append(df)
        result = pd.concat(all_dataframes)
        pq.write_table(pa.Table.from_pandas(result), 'cboe/parquet_preprocessed_subset_only_BTCUSD_merged.parquet', compression='snappy')
    df = pq.read_table('cboe/parquet_preprocessed_subset_only_BTCUSD_merged.parquet').to_pandas();
    print(df.dtypes)
    print(df.shape)
    return df

In [14]:
def split_X(df):
    n_all = df.shape[0]
    n_train = round(n_all * 0.9)
    n_dev   = round(n_all * 0.05)
    n_test  = round(n_all * 0.05)
    print('n_all:  ', n_all)
    print('n_train:', n_train)
    print('n_dev:  ', n_dev)
    print('n_test: ', n_test)

    X_train = df.iloc[:n_train, 1:16].values.astype('float32')
    X_dev   = df.iloc[n_train:n_train+n_dev, 1:16].values.astype('float32')
    X_test  = df.iloc[n_train+n_dev:, 1:16].values.astype('float32')
    print(X_train.shape)
    print(X_dev.shape)
    print(X_test.shape)

    return X_train, X_dev, X_test

In [15]:
def split_Y(df):
    n_all = df.shape[0]
    n_train = round(n_all * 0.9)
    n_dev   = round(n_all * 0.05)
    n_test  = round(n_all * 0.05)
    Y_train = df.iloc[:n_train, -1:].values.astype('float32')
    Y_dev   = df.iloc[n_train:n_train+n_dev, -1:].values.astype('float32')
    Y_test  = df.iloc[n_train+n_dev:, -1:].values.astype('float32')
    print(Y_train.shape)
    print(Y_dev.shape)
    print(Y_test.shape)
    
    return Y_train, Y_dev, Y_test

In [16]:
def df_to_parquet(df, outfile):
    pq.write_table(pa.Table.from_pandas(df), outfile, compression='snappy')

In [23]:
def evaluate_model(model, history, X_train, X_dev, X_test, Y_train, Y_dev, Y_test):
    train_loss = history.history['loss'][-1]
    dev_loss = history.history['val_loss'][-1]
    test_loss = model.evaluate(X_test, Y_test, verbose=0)
    
    y_hat_train = model.predict(X_train)
    y_hat_dev   = model.predict(X_dev)
    y_hat_test  = model.predict(X_test)
    
    train_prop_correct = np.sum(np.sign(y_hat_train) == np.sign(Y_train)) / (Y_train_final.shape[0] * Y_train_final.shape[1])
    dev_prop_correct   = np.sum(np.sign(y_hat_dev)   == np.sign(Y_dev))   / (Y_dev_final.shape[0]   * Y_dev_final.shape[1])
    test_prop_correct  = np.sum(np.sign(y_hat_test)  == np.sign(Y_test))  / (Y_test_final.shape[0]  * Y_test_final.shape[1])
    
    evaluation = [train_loss, dev_loss, test_loss, train_prop_correct, dev_prop_correct, test_prop_correct]
    return evaluation

In [18]:
def create_sequenced_data(data, window, step, y=True):
    sequenced = []
    for minute in range(0, len(data) - window, step):
        chunk = data[minute:minute+window]
        sequenced.append(chunk)
    sequenced = np.array(sequenced)
    return sequenced

In [21]:
batch_size = 8192 #16384 #32768 #4096
epochs = 100
verbose = 2
loss = 'mean_squared_error'
optimizer = 'adagrad' #'adam'
#num_LSTM = 2
#n_units = [256, 256]
num_LSTM = 3
n_units = [256, 256, 256]
dropout = 0.1

model = initialize_model(X_train_final, loss, optimizer, num_LSTM, n_units, dropout)

history = model.fit(X_train_final, Y_train_final, batch_size=batch_size, epochs=epochs,
                      validation_data=(X_dev_final, Y_dev_final), verbose=verbose, shuffle=False) 

input shape is
[30, 15]
Train on 35264 samples, validate on 1958 samples
Epoch 1/200
 - 4s - loss: 10.9643 - val_loss: 0.0129
Epoch 2/200
 - 2s - loss: 0.0088 - val_loss: 0.0063
Epoch 3/200
 - 2s - loss: 0.0072 - val_loss: 0.0055
Epoch 4/200
 - 2s - loss: 0.0063 - val_loss: 0.0049
Epoch 5/200
 - 2s - loss: 0.0056 - val_loss: 0.0044
Epoch 6/200
 - 2s - loss: 0.0051 - val_loss: 0.0040
Epoch 7/200
 - 2s - loss: 0.0046 - val_loss: 0.0036
Epoch 8/200
 - 2s - loss: 0.0042 - val_loss: 0.0033
Epoch 9/200
 - 2s - loss: 0.0039 - val_loss: 0.0031
Epoch 10/200
 - 2s - loss: 0.0036 - val_loss: 0.0029
Epoch 11/200
 - 2s - loss: 0.0034 - val_loss: 0.0027
Epoch 12/200
 - 2s - loss: 0.0032 - val_loss: 0.0025
Epoch 13/200
 - 2s - loss: 0.0030 - val_loss: 0.0023
Epoch 14/200
 - 2s - loss: 0.0028 - val_loss: 0.0022
Epoch 15/200
 - 2s - loss: 0.0026 - val_loss: 0.0020
Epoch 16/200
 - 2s - loss: 0.0025 - val_loss: 0.0019
Epoch 17/200
 - 2s - loss: 0.0024 - val_loss: 0.0018
Epoch 18/200
 - 2s - loss: 0.0023 

 - 2s - loss: 3.1950e-04 - val_loss: 1.4374e-04
Epoch 139/200
 - 2s - loss: 3.1782e-04 - val_loss: 1.4244e-04
Epoch 140/200
 - 2s - loss: 3.1589e-04 - val_loss: 1.4140e-04
Epoch 141/200
 - 2s - loss: 3.1440e-04 - val_loss: 1.4040e-04
Epoch 142/200
 - 2s - loss: 3.1248e-04 - val_loss: 1.3931e-04
Epoch 143/200
 - 2s - loss: 3.1099e-04 - val_loss: 1.3822e-04
Epoch 144/200
 - 2s - loss: 3.0949e-04 - val_loss: 1.3721e-04
Epoch 145/200
 - 2s - loss: 3.0781e-04 - val_loss: 1.3633e-04
Epoch 146/200
 - 2s - loss: 3.0670e-04 - val_loss: 1.3522e-04
Epoch 147/200
 - 2s - loss: 3.0435e-04 - val_loss: 1.3438e-04
Epoch 148/200
 - 2s - loss: 3.0225e-04 - val_loss: 1.3345e-04
Epoch 149/200
 - 2s - loss: 3.0128e-04 - val_loss: 1.3260e-04
Epoch 150/200
 - 2s - loss: 3.0004e-04 - val_loss: 1.3161e-04
Epoch 151/200
 - 2s - loss: 2.9847e-04 - val_loss: 1.3069e-04
Epoch 152/200
 - 2s - loss: 2.9711e-04 - val_loss: 1.2989e-04
Epoch 153/200
 - 2s - loss: 2.9591e-04 - val_loss: 1.2906e-04
Epoch 154/200
 - 2s - 

In [24]:
a = evaluate_model(model, history, X_train_final, X_dev_final, X_test_final, Y_train_final, Y_dev_final, Y_test_final)

In [42]:
# Concatenate dataframes
files = sorted(glob('cboe/parquet_preprocessed_BTCUSD_merged/*.parquet'))[451:]
all_dataframes = []
for file in files:
    df = pq.read_table(file).to_pandas()
    all_dataframes.append(df)
df = pd.concat(all_dataframes)

#
X_train, X_dev, X_test = split_X(df)
Y_train, Y_dev, Y_test = split_Y(df)

window_size = 30
step = 30

X_train = create_sequenced_data(X_train, window=window_size, step=step, y=False)
X_dev   = create_sequenced_data(X_dev,   window=window_size, step=step, y=False)
X_test  = create_sequenced_data(X_test,  window=window_size, step=step, y=False)

Y_train = create_sequenced_data(Y_train, window=window_size, step=step, y=True)
Y_dev   = create_sequenced_data(Y_dev,   window=window_size, step=step, y=True)
Y_test  = create_sequenced_data(Y_test,  window=window_size, step=step, y=True)

print('Train, dev, test shapes:')
print(X_train_final.shape)
print(X_dev_final.shape)
print(X_test_final.shape)
print(Y_train_final.shape)
print(Y_dev_final.shape)
print(Y_test_final.shape)

n_all:   587753
n_train: 528978
n_dev:   29388
n_test:  29388
(528978, 15)
(29388, 15)
(29387, 15)
(528978, 1)
(29388, 1)
(29387, 1)
Train, dev, test shapes:
(17632, 30, 15)
(979, 30, 15)
(979, 30, 15)
(17632, 30, 1)
(979, 30, 1)
(979, 30, 1)


In [48]:
# Initialize output dataframe
outfile = 'cboe/grid_search.parquet'
columns = ['num_epochs', 'loss', 'optimizer', 'batch_size', 'num_LSTMs', 'num_units',
           'train_loss', 'dev_loss', 'test_loss', 'train_prop_correct', 'dev_prop_correct', 'test_prop_correct']
df_output = pd.DataFrame(columns=columns)
pq.write_table(pa.Table.from_pandas(df_output), outfile, compression='snappy')

In [None]:
batch_size = 8192
num_epochs = 100
verbose = 1
loss = 'mean_squared_error'
optimizers = ['adagrad', 'adam', 'rmsprop']
num_LSTMs = [2,3]
num_units_2 = [[128, 256], [256, 256]]
num_units_3 = [[128, 256, 256], [256, 256, 256], [256, 512, 512]]
dropout = 0.1

count = 0
for optimizer in optimizers:
    for num_LSTM in num_LSTMs:
        if num_LSTM == 2:
            num_units = num_units_2
        elif num_LSTM == 3:
            num_units = num_units_3
        for n_units in num_units:
            # Load output dataframe
            df_output = pq.read_table(outfile).to_pandas()

            # Initialize model
            model = initialize_model(X_train, loss, optimizer, num_LSTM, n_units, dropout)

            # Train model
            if verbose:
                verbose=1
                print(count, '/', 15)
            else:
                verbose=0
            history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=num_epochs,
              validation_data=(X_dev, Y_dev), verbose=0, shuffle=False) 

            # Evaluate model
            evaluate = evaluate_model(model, history, X_train, X_dev, X_test, Y_train, Y_dev, Y_test)

            # Write to dataframe and save
            row = [num_epochs, loss, optimizer, batch_size, num_LSTM, str(n_units)]
            row.extend(evaluate)
            df_output.loc[len(df_output)] = row
            df_to_parquet(df_output, outfile)

            count += 1

input shape is
[30, 15]
0 / 15
input shape is
[30, 15]
1 / 15
input shape is
[30, 15]
2 / 15
input shape is
[30, 15]
3 / 15
