In [1]:
import pandas as pd
import numpy as np

from keras import Input
from keras.engine import Model
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Concatenate, concatenate
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [46]:

# features is a list of strings of feature names 

def build_model(features, data_length):
    
    inputs_list = [] 
    for feature_name in features:
        inputs_list.append((Input(shape=(data_length,1), name=feature_name)))
    
    layers = [] 
    for i, input_name in enumerate(inputs_list): 
        layers.append(LSTM(64, return_sequences=False)(inputs_list[i]) )
        
    output = concatenate(layers) 
    output = Dense(3, activation='softmax', name='IsSpike')(output)
    
    model = Model(
        inputs = inputs_list,
        outputs = [output]
    )
    
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    
    return model    

data_length = 10


In [18]:
from sklearn.preprocessing import MinMaxScaler

master_df = pd.read_csv('C:/Users/Shoya/surf/data/master_df.csv', encoding='latin1')
df = master_df[['Timestamp', 'Close', 'Volume_(BTC)', 'Volume_(Currency)', 'Date(UTC)', 'Bitcoin (Adj.Overlap)', 
               'Close Price % Change', 'Close Price % Change (Abs)', 'Is Spike']]

# lag inputs depending on data_length 
df['Price_lagged'] = df['Close']#.shift(data_length)
df['Volume_BTC'] = df['Volume_(BTC)']#.shift(data_length)
df['Bitcoin_Adj'] = df['Bitcoin (Adj.Overlap)']#.shift(data_length)

df = df.dropna()
cols = ['Volume_BTC','Bitcoin_Adj', 'Close', 'Price_lagged']

# Stationalize Data by taking log differences
data_array = np.diff(np.log(df[cols]), axis=0)

# Min-Max Scale 

scalers = {}
datas = [] 

df_scaled = pd.DataFrame(columns=cols)

for i in range(len(cols)): 
    scalers[cols[i]] = MinMaxScaler()
    #print('data', data_array[:,i])
    
    col_data = data_array[:,i]
    col_data = np.reshape(col_data, (len(col_data), 1))
    
    data = scalers[cols[i]].fit_transform( col_data )  #:, np.newaxis
    #print('scaled', data)
    data = np.reshape(data, (1, len(data)))
    df_scaled[cols[i]] = data[0]
    
df_scaled['Is Spike'] = df['Is Spike']
df_scaled.dropna(inplace=True)
display(df_scaled.head())
display(df_scaled.tail())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Volume_BTC,Bitcoin_Adj,Close,Price_lagged,Is Spike
1,0.776791,0.47197,0.484557,0.484557,1.0
2,0.463316,0.439996,0.538331,0.538331,0.0
3,0.725079,0.529463,0.520715,0.520715,-1.0
4,0.210661,0.416611,0.566098,0.566098,0.0
5,0.594148,0.445509,0.568881,0.568881,0.0


Unnamed: 0,Volume_BTC,Bitcoin_Adj,Close,Price_lagged,Is Spike
28073,0.458128,0.453625,0.551945,0.551945,0.0
28074,0.598396,0.445234,0.530895,0.530895,0.0
28075,0.488,0.48561,0.539689,0.539689,0.0
28076,0.553633,0.461015,0.682808,0.682808,-1.0
28077,0.632874,0.453625,0.53215,0.53215,0.0


In [59]:
from keras.utils.np_utils import to_categorical

# split and reshape data to feed into RNN

# X_timestamp = df_scaled['Timestamp'].values
X_volume = df_scaled['Volume_BTC'].values
X_trends = df_scaled['Bitcoin_Adj'].values
X_lagged_price = df_scaled['Price_lagged'].values

Y_is_spike = df_scaled['Is Spike'].values 

train_size = int(len(X_volume) * 0.85)
train_size = int(train_size/data_length) * data_length

test_size_index = int(len(X_volume)/data_length)*data_length

X_train_volume = []
X_test_volume = [] 
X_train_trends = []
X_test_trends = []
X_train_lagged_price = []
X_test_lagged_price = []
Y_train_is_spike = [] 
Y_test_is_spike = [] 

for i in range(train_size-data_length):
    vol_temp = []
    trends_temp = []
    price_temp = []
    for j in range(data_length):
        vol_temp.append(X_volume[i+j])
        trends_temp.append(X_trends[i+j])
        price_temp.append(X_lagged_price[i+j])
    X_train_volume.append(vol_temp)
    X_train_trends.append(trends_temp)
    X_train_lagged_price.append(price_temp)
    
    Y_train_is_spike.append(Y_is_spike[i+data_length])

for i in range(test_size_index-train_size-data_length):
    vol_temp = []
    trends_temp = [] 
    price_temp = [] 
    for j in range(data_length):
        vol_temp.append(X_volume[train_size+i+j])
        trends_temp.append(X_trends[train_size+i+j])
        price_temp.append(X_lagged_price[train_size+i+j])
    X_test_volume.append(vol_temp)
    X_test_trends.append(trends_temp)
    X_test_lagged_price.append(price_temp)
    
    Y_test_is_spike.append(Y_is_spike[train_size+i+data_length])
    
X_train_volume = np.array(X_train_volume)
X_test_volume =  np.array(X_test_volume)
X_train_trends = np.array(X_train_trends)
X_test_trends = np.array(X_test_trends)
X_train_lagged_price = np.array(X_train_lagged_price)
X_test_lagged_price = np.array(X_test_lagged_price)
Y_train_is_spike =  np.array(Y_train_is_spike)
Y_test_is_spike = np.array(Y_test_is_spike)
    
    
Y_train_is_spike_onehot = to_categorical(Y_train_is_spike, num_classes=3)
Y_test_is_spike_onehot = to_categorical(Y_test_is_spike,num_classes=3)
display(Y_train_is_spike)

# y = pd.DataFrame(Y_train_is_spike_onehot)
# y['actual'] = Y_train_is_spike
# display(y.head(25))
    
# display(X_train_trends.shape)
# display(Y_train_is_spike.shape)

#display(X_train_lagged_price)
#display(Y_train_is_spike)

# df_train = pd.DataFrame(X_train_lagged_price)
# df_train['label'] = Y_train_is_spike
# display(df_train.tail(20))
# display(df_scaled.head(30))
# display(df_train.head(30))

#--------------------------------

# # X_train_timestamp, X_test_timestamp = X_timestamp[:train_size], X_timestamp[train_size:test_size_index ]
# X_train_volume, X_test_volume = X_volume[:train_size], X_volume[train_size:test_size_index ]
# X_train_trends, X_test_trends = X_trends[:train_size], X_trends[train_size:test_size_index ]
# X_train_lagged_price, X_test_lagged_price = X_lagged_price[:train_size], X_lagged_price[train_size:test_size_index ]

# # becasue I lagged the x inputs, I should forward the Y's by the data_length as well 
# Y_train_is_spike, Y_test_is_spike = Y_is_spike[data_length:train_size], Y_is_spike[train_size+data_length:test_size_index ]


# # X.shape is (samples, timesteps, dimension) 
# # timestemps is 15, samples is just however many nobs there are (but it doesn't matter, so it should be None)


X_train_volume = np.reshape(X_train_volume, (X_train_volume.shape[0],data_length,1) ) 
X_train_trends = np.reshape(X_train_trends, (X_train_trends.shape[0],data_length,1) ) 
X_train_lagged_price = np.reshape(X_train_lagged_price, (X_train_lagged_price.shape[0], data_length, 1))

X_test_volume = np.reshape(X_test_volume, (X_test_volume.shape[0],data_length,1) ) 
X_test_trends = np.reshape(X_test_trends, (X_test_trends.shape[0],data_length,1) )  
X_test_lagged_price = np.reshape(X_test_lagged_price, (X_test_lagged_price.shape[0],data_length,1))


# # X_train_timestamp = np.reshape(X_train_timestamp, (int(X_train_timestamp.shape[0]/data_length),data_length,1) ) 
# X_train_volume = np.reshape(X_train_volume, (int(X_train_volume.shape[0]/data_length),data_length,1) ) 
# X_train_trends = np.reshape(X_train_trends, (int(X_train_trends.shape[0]/data_length),data_length,1) ) 
# X_train_lagged_price = np.reshape(X_train_lagged_price, (int(X_train_lagged_price.shape[0]/data_length), data_length, 1))

# # X_test_timestamp = np.reshape(X_test_timestamp, (int(X_test_timestamp.shape[0]/data_length),data_length,1) ) 
# X_test_volume = np.reshape(X_test_volume, (int(X_test_volume.shape[0]/data_length),data_length,1) ) 
# X_test_trends = np.reshape(X_test_trends, (int(X_test_trends.shape[0]/data_length),data_length,1) )  
# X_test_lagged_price = np.reshape(X_test_lagged_price, (int(X_test_lagged_price.shape[0]/data_length),data_length,1))


# # Don't need the 1 for the third dimension for Y's??


# Y_train_is_spike = np.reshape(Y_train_is_spike, (int(Y_train_is_spike.shape[0]/data_length),  data_length) ) 
# Y_test_is_spike = np.reshape(Y_test_is_spike, (int(Y_test_is_spike.shape[0]/data_length),  data_length) )

#-----------------------------------


# instead of using input 1,2,3,4,5,6,7,8,9,10 to predict output for 11,12,13,14,15,16,17,18,19,20
# I want to use input 1,2,3,4,5,6,7,8,9,10 to predict output for 11, then 2,3,4,5,6,7,8,9,10,11 to predict output for 12 

# right now I am actually feeding input 1,2,3,4,5,6,7,8,9,10 to predict output for 1,2,3,4,5,6,7,8,9,10. 
# instead I should at least feed 1,2,3..8,9,10 to predict 11,12,13,14,15,16,17,18,19,20 -> lag everything by data_length! 

array([-1.,  0.,  0., ...,  0.,  0.,  0.])

In [56]:
features = ['Volume_BTC', 'Bitcoin_Adj', 'Price_lagged']

rnn = build_model(features, 10) 

tensorboard_callback = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

history = rnn.fit(
    [
        #X_train_timestamp,
        X_train_volume,
        X_train_trends,
        X_train_lagged_price
    ],
    [
        Y_train_is_spike_onehot
    ]
    ,
    validation_data=(
        [
            #X_test_timestamp,
            X_test_volume,
            X_test_trends,
            X_test_lagged_price
        ],
        [
            Y_test_is_spike_onehot
        ]),
    epochs=20,
    batch_size=32,
    callbacks=[
      tensorboard_callback
    ],
    verbose=1
)

Train on 23850 samples, validate on 4200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [57]:
score = rnn.evaluate(
    [
        #X_test_timestamp,
        X_test_volume,
        X_test_trends,
        X_test_lagged_price
    ],
    [
        Y_test_is_spike_onehot
    ])

print('\n')
print("Accuracy: %.2f%%" % (score[1]*100))


Accuracy: 59.71%


In [65]:
yhat = rnn.predict( 
    [
        #X_test_timestamp,
        X_test_volume,
        X_test_trends,
        X_test_lagged_price
    ],
    verbose=0
)

display(yhat)

inverted_yhat = np.argmax(yhat,axis=1) #returns INDICES of max 
onehot_to_val_dict = {0: 0, 1: 1, 2:-1 }

inverted_yhat_arr = np.asarray(inverted_yhat)
predicted = [onehot_to_val_dict[i] for i in inverted_yhat_arr]


df_pred_output = pd.DataFrame(predicted, columns=['predicted'])
df_pred_output['actual'] = Y_test_is_spike
#df_pred_output['index_output'] = inverted_yhat
display(df_pred_output)

# correct = (df_pred_output['actual'].values == df_pred_output['predicted'].values)
# accuracy = correct.sum() / correct.size
# display(accuracy)

array([[ 0.89756489,  0.08310197,  0.01933317],
       [ 0.84383726,  0.09476137,  0.0614014 ],
       [ 0.95636821,  0.01410425,  0.02952744],
       ..., 
       [ 0.51034886,  0.39668763,  0.09296344],
       [ 0.41741222,  0.50381827,  0.07876941],
       [ 0.39037132,  0.52225763,  0.08737103]], dtype=float32)

Unnamed: 0,predicted,actual
0,0,-1.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
5,0,0.0
6,0,0.0
7,0,0.0
8,0,0.0
9,0,0.0


0.5971428571428572

In [66]:
# serialize model to JSON# serial 
model_json = rnn.to_json()
with open("model_classification.json", "w") as json_file:
    json_file.write(model_json)
    
# serialize weights to HDF5
rnn.save_weights("model_classification.h5")

In [68]:
from sklearn import metrics

print(metrics.confusion_matrix(df_pred_output['actual'], df_pred_output['predicted']))

[[ 222  586  151]
 [  64 1908  330]
 [  34  527  378]]
