### Data Import / Feature Engineering / Preprocessing / Batching

In [2]:
import pandas as pd
import numpy as np
import quandl
import math
import time
import h5py
import threading

In [3]:

# Load minutely_data from a csv file

minutely_data = pd.read_csv('coinbase_minute.csv')
minutely_data.set_index('Timestamp', inplace = True)

In [4]:
# Get lagged returns

parameter_lagged_return_1_window = 15
parameter_lagged_return_2_window = 30

minutely_data['return_answer'] = np.log(minutely_data.Close /  minutely_data.Close.shift(1))

minutely_data['return'] = np.log(minutely_data.Close.shift(1) /  minutely_data.Close.shift(2))
minutely_data['lagged_return_1'] = np.log(minutely_data.Close.shift(1) / minutely_data.Close.shift(parameter_lagged_return_1_window+1))
minutely_data['lagged_return_2'] = np.log(minutely_data.Close.shift(1) / minutely_data.Close.shift(parameter_lagged_return_2_window+1))

In [5]:
# Define what the zscore is

def zscore(x, window):
    r = x.rolling(window=window)
    m = r.mean().shift(1)
    s = r.std(ddof=0).shift(1)
    z = (x-m)/s
    return z

In [6]:
# Zscore on return ranges

parameter_zscore_return_window = 1*60*24
parameter_zscore_return_1_window = 1*60*24
parameter_zscore_return_2_window = 5*60*24

minutely_data['zscore_return'] = zscore(minutely_data['return'], parameter_zscore_return_window)
minutely_data['zscore_return_1'] = zscore(minutely_data['return'], parameter_zscore_return_1_window)
minutely_data['zscore_return_2'] = zscore(minutely_data['return'], parameter_zscore_return_2_window)

parameter_zscore_lagged_return_1_window = 1*60*24
parameter_zscore_lagged_return_2_window = 1*60*24

minutely_data['zscore_lagged_return_1'] = zscore(minutely_data['lagged_return_1'], parameter_zscore_lagged_return_1_window)
minutely_data['zscore_lagged_return_2'] = zscore(minutely_data['lagged_return_2'], parameter_zscore_lagged_return_2_window)

In [7]:
# Price moving averages

parameter_price_to_moving_average_return_1_window = 15
parameter_price_to_moving_average_return_2_window = 60
parameter_price_to_moving_average_return_3_window = 240

minutely_data['price_to_moving_average_price_1'] = minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_price_to_moving_average_return_1_window).mean().shift(1)
minutely_data['price_to_moving_average_price_2'] = minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_price_to_moving_average_return_2_window).mean().shift(1)
minutely_data['price_to_moving_average_price_3'] = minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_price_to_moving_average_return_3_window).mean().shift(1)

parameter_zscore_price_to_moving_average_return_1_window = 5*60*24
parameter_zscore_price_to_moving_average_return_2_window = 5*60*24
parameter_zscore_price_to_moving_average_return_3_window = 5*60*24

minutely_data['zscore_price_to_moving_average_price_1'] = zscore(minutely_data['price_to_moving_average_price_1'],parameter_zscore_price_to_moving_average_return_1_window)
minutely_data['zscore_price_to_moving_average_price_2'] = zscore(minutely_data['price_to_moving_average_price_2'],parameter_zscore_price_to_moving_average_return_2_window)
minutely_data['zscore_price_to_moving_average_price_3'] = zscore(minutely_data['price_to_moving_average_price_3'],parameter_zscore_price_to_moving_average_return_3_window)

In [8]:
# Price moving average crossovers

parameter_crossover_top_moving_average_1_window = 15
parameter_crossover_bottom_moving_average_1_window = 30

parameter_crossover_top_moving_average_2_window = 60
parameter_crossover_bottom_moving_average_2_window = 240

minutely_data['moving_average_crossover_1'] = minutely_data.Close.rolling(parameter_crossover_top_moving_average_1_window).mean().shift(1) / minutely_data.Close.rolling(parameter_crossover_bottom_moving_average_1_window).mean().shift(1) 
minutely_data['moving_average_crossover_2'] = minutely_data.Close.rolling(parameter_crossover_top_moving_average_2_window).mean().shift(1) / minutely_data.Close.rolling(parameter_crossover_bottom_moving_average_2_window).mean().shift(1)

parameter_zscore_moving_average_crossover_1_window = 5*60*24
parameter_zscore_moving_average_crossover_2_window = 5*60*24

minutely_data['zscore_moving_average_crossover_1'] = zscore(minutely_data['moving_average_crossover_1'], parameter_zscore_moving_average_crossover_1_window)
minutely_data['zscore_moving_average_crossover_2'] = zscore(minutely_data['moving_average_crossover_2'], parameter_zscore_moving_average_crossover_2_window)

In [9]:
# Price acceleration

parameter_acceleration_top_price_to_moving_average_1_window = 15
parameter_acceleration_top_price_to_moving_average_2_window = 60

parameter_acceleration_bottom_price_to_moving_average_1_window = 15
parameter_acceleration_bottom_price_to_moving_average_2_window = 60

parameter_acceleration_average_bottom_price_to_moving_average_1_window = 15
parameter_acceleration_average_bottom_price_to_moving_average_2_window = 15

minutely_data['temp_1'] = minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_acceleration_bottom_price_to_moving_average_1_window).mean().shift(1)
minutely_data['temp_2'] = minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_acceleration_bottom_price_to_moving_average_2_window).mean().shift(1)

minutely_data['acceleration_1'] = (minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_acceleration_top_price_to_moving_average_1_window).mean().shift(1)) / (minutely_data.temp_1.rolling(parameter_acceleration_average_bottom_price_to_moving_average_1_window).mean())
minutely_data['acceleration_2'] = (minutely_data.Close.shift(1) / minutely_data.Close.rolling(parameter_acceleration_top_price_to_moving_average_2_window).mean().shift(1)) / (minutely_data.temp_2.rolling(parameter_acceleration_average_bottom_price_to_moving_average_2_window).mean())

minutely_data = minutely_data.drop(['temp_1','temp_2'], axis=1)

parameter_zscore_acceleration_1_window = 15*60*24
parameter_zscore_acceleration_2_window = 15*60*24

minutely_data['zscore_acceleration_1'] = zscore(minutely_data['acceleration_1'], parameter_zscore_acceleration_1_window)
minutely_data['zscore_acceleration_2'] = zscore(minutely_data['acceleration_2'], parameter_zscore_acceleration_2_window)

In [10]:
# Zscore on trading volume

parameter_zscore_volume_1_window = 1*60*24
parameter_zscore_volume_2_window = 5*60*24
parameter_zscore_volume_3_window = 15*60*24

minutely_data['zscore_volume_1'] = zscore(minutely_data['Volume_(Currency)'].shift(1), parameter_zscore_volume_1_window)
minutely_data['zscore_volume_2'] = zscore(minutely_data['Volume_(Currency)'].shift(1), parameter_zscore_volume_2_window)
minutely_data['zscore_volume_3'] = zscore(minutely_data['Volume_(Currency)'].shift(1), parameter_zscore_volume_3_window)

In [11]:
# Volume moving average

parameter_volume_moving_average_1_window = 15
parameter_volume_moving_average_2_window = 60
parameter_volume_moving_average_3_window = 240

minutely_data['volume_moving_average_1'] = minutely_data['Volume_(Currency)'].shift(1) / minutely_data['Volume_(Currency)'].rolling(parameter_volume_moving_average_1_window).mean().shift(1)
minutely_data['volume_moving_average_2'] = minutely_data['Volume_(Currency)'].shift(1) / minutely_data['Volume_(Currency)'].rolling(parameter_volume_moving_average_2_window).mean().shift(1)
minutely_data['volume_moving_average_3'] = minutely_data['Volume_(Currency)'].shift(1) / minutely_data['Volume_(Currency)'].rolling(parameter_volume_moving_average_3_window).mean().shift(1)

parameter_zscore_volume_moving_average_1_window = 15*24*60
parameter_zscore_volume_moving_average_2_window = 15*24*60
parameter_zscore_volume_moving_average_3_window = 15*24*60

minutely_data['zscore_volume_moving_average_1'] = zscore(minutely_data['volume_moving_average_1'], parameter_zscore_volume_moving_average_1_window)
minutely_data['zscore_volume_moving_average_2'] = zscore(minutely_data['volume_moving_average_2'], parameter_zscore_volume_moving_average_2_window)
minutely_data['zscore_volume_moving_average_3'] = zscore(minutely_data['volume_moving_average_3'], parameter_zscore_volume_moving_average_3_window)

In [12]:
# Price volatility

parameter_price_volatility_1_window = 15
parameter_price_volatility_2_window = 60
parameter_price_volatility_3_window = 240

minutely_data['price_volatility_1'] = minutely_data['return'].rolling(parameter_price_volatility_1_window).std(0).shift(1)
minutely_data['price_volatility_2'] = minutely_data['return'].rolling(parameter_price_volatility_2_window).std(0).shift(1)
minutely_data['price_volatility_3'] = minutely_data['return'].rolling(parameter_price_volatility_3_window).std(0).shift(1)

parameter_zscore_price_volatility_1_window = 15*60*24
parameter_zscore_price_volatility_2_window = 15*60*24
parameter_zscore_price_volatility_3_window = 15*60*24

minutely_data['zscore_price_volatility_1'] = zscore(minutely_data['price_volatility_1'], parameter_zscore_price_volatility_1_window)
minutely_data['zscore_price_volatility_2'] = zscore(minutely_data['price_volatility_2'], parameter_zscore_price_volatility_2_window)
minutely_data['zscore_price_volatility_3'] = zscore(minutely_data['price_volatility_3'], parameter_zscore_price_volatility_3_window)

In [13]:
# Change in volatility

parameter_change_in_volatility_top_standard_deviation_1_window = 15
parameter_change_in_volatility_top_standard_deviation_2_window = 60

parameter_change_in_volatility_bottom_standard_deviation_1_window = 15
parameter_change_in_volatility_bottom_standard_deviation_2_window = 60

parameter_change_in_volatility_average_bottom_standard_deviation_1_window = 60
parameter_change_in_volatility_average_bottom_standard_deviation_2_window = 240

minutely_data['temp_1'] = minutely_data['return'].rolling(parameter_change_in_volatility_bottom_standard_deviation_1_window).std(0).shift(1)
minutely_data['temp_2'] = minutely_data['return'].rolling(parameter_change_in_volatility_bottom_standard_deviation_2_window).std(0).shift(1)

minutely_data['change_in_volatility_1'] = minutely_data['return'].rolling(parameter_change_in_volatility_top_standard_deviation_1_window).std(0).shift(1) / minutely_data.temp_1.rolling(parameter_change_in_volatility_average_bottom_standard_deviation_1_window).mean()
minutely_data['change_in_volatility_2'] = minutely_data['return'].rolling(parameter_change_in_volatility_top_standard_deviation_2_window).std(0).shift(1) / minutely_data.temp_2.rolling(parameter_change_in_volatility_average_bottom_standard_deviation_2_window).mean()

minutely_data = minutely_data.drop(['temp_1','temp_2'], axis=1)

parameter_zscore_change_in_volatility_1_window = 15*60*24
parameter_zscore_change_in_volatility_2_window = 15*60*24

minutely_data['zscore_change_in_volatility_1'] = zscore(minutely_data['change_in_volatility_1'], parameter_zscore_change_in_volatility_1_window)
minutely_data['zscore_change_in_volatility_2'] = zscore(minutely_data['change_in_volatility_2'], parameter_zscore_change_in_volatility_2_window)

In [14]:
# Moving average convergence divergence

parameter_fast_exponential_moving_average_window = 3*60*24
parameter_slow_exponential_moving_average_window = 5*60*24
parameter_signal_moving_average_convergence_divergence_window = 1*60*24

minutely_data['moving_average_convergence_divergence'] = minutely_data['Close'].ewm(span = parameter_fast_exponential_moving_average_window, min_periods = parameter_fast_exponential_moving_average_window).mean().shift(1) - minutely_data['Close'].ewm(span = parameter_slow_exponential_moving_average_window, min_periods = parameter_slow_exponential_moving_average_window).mean().shift(1)
minutely_data['moving_average_convergence_divergence_signal'] = minutely_data['moving_average_convergence_divergence'].ewm(span = parameter_signal_moving_average_convergence_divergence_window, min_periods = parameter_signal_moving_average_convergence_divergence_window).mean()
minutely_data['moving_average_convergence_divergence_histogram'] = minutely_data['moving_average_convergence_divergence'] - minutely_data['moving_average_convergence_divergence_signal']

In [15]:
parameter_zscore_moving_average_convergence_divergence_window = 5*60*24
parameter_zscore_moving_average_convergence_divergence_signal_window = 5*60*24
parameter_zscore_moving_average_convergence_divergence_histogram_window = 5*60*24

minutely_data['zscore_moving_average_convergence_divergence'] = zscore(minutely_data['moving_average_convergence_divergence'], parameter_zscore_moving_average_convergence_divergence_window)
minutely_data['zscore_moving_average_convergence_divergence_signal'] = zscore(minutely_data['moving_average_convergence_divergence_signal'], parameter_zscore_moving_average_convergence_divergence_signal_window)
minutely_data['zscore_moving_average_convergence_divergence_histogram'] = zscore(minutely_data['moving_average_convergence_divergence_histogram'], parameter_zscore_moving_average_convergence_divergence_histogram_window)

In [16]:
# Stochastics

parameter_stochastics_k_window = 5*60*24
parameter_stochastics_d_window = 3*60*24

minutely_data['stochastics_k'] = (minutely_data.Close.shift(1) - minutely_data['Close'].rolling(parameter_stochastics_k_window).min().shift(1)) / (minutely_data['Close'].rolling(parameter_stochastics_k_window).max().shift(1) - minutely_data['Close'].rolling(parameter_stochastics_k_window).min().shift(1))
minutely_data['stochastics_d'] = minutely_data['stochastics_k'].rolling(parameter_stochastics_d_window).mean()

In [17]:
parameter_zscore_stochastics_k_window = 5*60*24
parameter_zscore_stochastics_d_window = 5*60*24

minutely_data['zscore_stochastics_k'] = zscore(minutely_data['stochastics_k'], parameter_zscore_stochastics_k_window)
minutely_data['zscore_stochastics_d'] = zscore(minutely_data['stochastics_d'], parameter_zscore_stochastics_d_window)

In [18]:
# Import daily data from qunadl

def get_json_data_blockchain(link):
    return pd.io.json.json_normalize(list(pd.read_json(link)['values']))

def get_quandl_data_from_blockchain(name, asset, token):
    data = quandl.get(asset, authtoken = token)
    data.index = data.index.astype(np.int64) // 10**9
    data.index.names = ['Timestamp']
    data.rename(columns={'Value': name}, inplace= True)
    return data

In [19]:
# Token and create different datasets

token = 'u6Z2ph3bWw3kVxpHRzVf'

data_sets = []

parameter_zscore_market_capitalization_USD_window = 15
parameter_zscore_transaction_volume_USD_window = 15
parameter_zscore_bitcoin_supply_window = 15

market_capitalization_USD = get_quandl_data_from_blockchain('market_capitalization_USD', 'BCHAIN/MKTCP', token)
data_sets.append(market_capitalization_USD)

transaction_volume_USD = get_quandl_data_from_blockchain('transaction_volume_USD', 'BCHAIN/ETRVU', token)
data_sets.append(transaction_volume_USD)

bitcoin_supply = get_quandl_data_from_blockchain('bitcoin_supply', 'BCHAIN/TOTBC', token)
data_sets.append(bitcoin_supply)

daily_data = pd.concat(data_sets, axis = 1)
                       
daily_data['zscore_market_capitalization_USD'] = zscore(daily_data['market_capitalization_USD'].shift(1), parameter_zscore_market_capitalization_USD_window)
daily_data['zscore_transaction_volume_USD'] = zscore(daily_data['transaction_volume_USD'].shift(1), parameter_zscore_transaction_volume_USD_window)
daily_data['zscore_bitcoin_supply'] = zscore(daily_data['bitcoin_supply'].shift(1), parameter_zscore_bitcoin_supply_window)

In [20]:
# Calculate network value to transactions ratio and signal

parameter_network_value_to_transactions_ratio_window = 15

parameter_zscore_network_value_to_transactions_ratio_window = 15
parameter_zscore_network_value_to_transactions_ratio_signal_window = 15

daily_data['network_value_to_transactions_ratio'] = daily_data.market_capitalization_USD.shift(1) / daily_data.transaction_volume_USD.shift(1)
daily_data['network_value_to_transactions_ratio_moving_average'] = daily_data.network_value_to_transactions_ratio.rolling(parameter_network_value_to_transactions_ratio_window).mean()

daily_data['zscore_network_value_to_transactions_ratio'] = zscore(daily_data['network_value_to_transactions_ratio'], parameter_zscore_network_value_to_transactions_ratio_window)
daily_data['zscore_network_value_to_transactions_ratio_signal'] = zscore(daily_data['network_value_to_transactions_ratio_moving_average'], parameter_zscore_network_value_to_transactions_ratio_signal_window)

In [21]:
# Have the same start and end dates for the entire dataset

minutely_start_stamp = minutely_data.index[0]
minutely_end_stamp = minutely_data.index[-1]

daily_start_stamp = daily_data.index[0]
daily_end_stamp = daily_data.index[-1]

#start_stamp = max(minutely_start_stamp, daily_start_stamp)
start_stamp = 1422835200
end_stamp = min(minutely_end_stamp, daily_end_stamp)

minutely_data = minutely_data.loc[start_stamp:end_stamp]
daily_data = daily_data.loc[start_stamp:end_stamp]

print ("Start: ", start_stamp," End: ", end_stamp)

Start:  1422835200  End:  1530057600


In [22]:
with pd.option_context('mode.use_inf_as_null', True):
    filtered_minutely_data = minutely_data[minutely_data.isnull().any(axis =1)]
    filtered_daily_data = daily_data[daily_data.isnull().any(axis =1)]

In [23]:
# Drop layers that were used to derive zscores

minutely_data = minutely_data.drop(['Open',
                                    'High',
                                    'Low',
                                    'Close',
                                    'Volume_(BTC)',
                                    'Volume_(Currency)',
                                    'Weighted_Price',
                                    'price_to_moving_average_price_1', 
                                    'price_to_moving_average_price_2', 
                                    'price_to_moving_average_price_3', 
                                    'moving_average_crossover_1' , 
                                    'moving_average_crossover_2', 
                                    'acceleration_1', 
                                    'acceleration_2', 
                                    'volume_moving_average_1', 
                                    'volume_moving_average_2', 
                                    'volume_moving_average_3', 
                                    'price_volatility_1',
                                    'price_volatility_2', 
                                    'price_volatility_3', 
                                    'change_in_volatility_1',
                                    'change_in_volatility_2',
                                    'moving_average_convergence_divergence', 
                                    'moving_average_convergence_divergence_signal', 
                                    'moving_average_convergence_divergence_histogram', 
                                    'stochastics_k', 
                                    'stochastics_d' ], axis = 1)

In [24]:
daily_data = daily_data.drop(['market_capitalization_USD', 
                              'transaction_volume_USD', 
                              'bitcoin_supply', 
                              'network_value_to_transactions_ratio', 
                              'network_value_to_transactions_ratio_moving_average'], axis =1)

In [25]:
minutely_data.columns

Index(['return_answer', 'return', 'lagged_return_1', 'lagged_return_2',
       'zscore_return', 'zscore_return_1', 'zscore_return_2',
       'zscore_lagged_return_1', 'zscore_lagged_return_2',
       'zscore_price_to_moving_average_price_1',
       'zscore_price_to_moving_average_price_2',
       'zscore_price_to_moving_average_price_3',
       'zscore_moving_average_crossover_1',
       'zscore_moving_average_crossover_2', 'zscore_acceleration_1',
       'zscore_acceleration_2', 'zscore_volume_1', 'zscore_volume_2',
       'zscore_volume_3', 'zscore_volume_moving_average_1',
       'zscore_volume_moving_average_2', 'zscore_volume_moving_average_3',
       'zscore_price_volatility_1', 'zscore_price_volatility_2',
       'zscore_price_volatility_3', 'zscore_change_in_volatility_1',
       'zscore_change_in_volatility_2',
       'zscore_moving_average_convergence_divergence',
       'zscore_moving_average_convergence_divergence_signal',
       'zscore_moving_average_convergence_divergenc

In [26]:
def clean_data(minutely_data ,daily_data, batch_size, x_window_size, y_window_size):

    num_rows = len(minutely_data)

    x_minutely_data = []
    y_minutely_data = []

    x_daily_data = []
    y_daily_data = []

    i = 0

    difference_in_frequency = (1 / (24*60))

    while((i + x_window_size + y_window_size) <= num_rows): #changed from num_rows to 20

        # Create minutely windows

        x_window_minutely_data = minutely_data[i: (i+x_window_size)]
        y_window_minutely_data = minutely_data[(i+x_window_size):(i+x_window_size+y_window_size)]

        # Determine start days for daily windows

        x_daily_start_value = math.ceil(i*difference_in_frequency)
        x_daily_end_value = math.floor((i+x_window_size)*difference_in_frequency)

        y_daily_start_value = math.ceil((difference_in_frequency * (i+x_window_size)))
        y_daily_end_value = math.floor((difference_in_frequency * (i+x_window_size+y_window_size)))

        x_window_daily_data = daily_data[x_daily_start_value: x_daily_end_value]
        y_window_daily_data = daily_data[y_daily_start_value: y_daily_end_value]

        x_window_daily_data = pd.concat([x_window_daily_data, x_window_minutely_data['return_answer']], axis = 1, join = "outer")
        abs_base_daily_data = x_window_daily_data.iloc[0]   
        
        x_window_daily_data['return_answer'] = x_window_minutely_data['return_answer']

        y_window_daily_data = pd.concat([y_window_daily_data, y_window_minutely_data['return_answer']], axis =1, join ="outer")

        y_window_daily_data['return_answer'] = y_window_minutely_data['return_answer']
        
        # Create the average testing values to solve for

        minutely_values = list(x_window_minutely_data.columns).index("return_answer")
        
        y_average_minutely_data = np.nanmean(y_window_minutely_data.values[:, minutely_values])
        y_average_daily_data = y_average_minutely_data

        x_minutely_data.append(x_window_minutely_data.values)
        y_minutely_data.append(y_average_minutely_data)

        x_daily_data.append(x_window_daily_data.values)
        y_daily_data.append(y_average_daily_data)    

        i += 1

        # Once there are no more batches

        if (i % batch_size == 0):

            x_np_arr_minutely_data = np.array(x_minutely_data)
            y_np_arr_minutely_data = np.array(y_minutely_data)

            x_np_arr_daily_data = np.array(x_daily_data)
            y_np_arr_daily_data = np.array(y_daily_data)

            x_minutely_data = []
            y_minutely_data = []

            x_daily_data = []
            y_daily_data = []
            
            yield (x_np_arr_minutely_data, y_np_arr_minutely_data, x_np_arr_daily_data, y_np_arr_daily_data)


In [27]:
minutely_data = minutely_data
daily_data = daily_data
batch_size = 512
x_window_size = 256
y_window_size = 32

dropout_rate = 0.20

In [28]:
def create_clean_data(filename_out, minutely_data ,daily_data, batch_size, x_window_size, y_window_size):
    
    print (">>>>>>> Creating x & y data files for minutely & daily data ........","\n")
    data_generation = clean_data(minutely_data=minutely_data, daily_data = daily_data, batch_size = batch_size, x_window_size = x_window_size, y_window_size = y_window_size)
    
    i = 0
    
    with h5py.File(filename_out, "w") as hf:
        x_minutely, y_minutely, x_daily, y_daily = next(data_generation)
        
        rcount_minutely_x = x_minutely.shape[0]
        dset_minutely_x = hf.create_dataset("x_minutely", shape = x_minutely.shape, maxshape = (None, x_minutely.shape[1], x_minutely.shape[2]), chunks = True)
        dset_minutely_x[:] = x_minutely
                
        rcount_minutely_y = y_minutely.shape[0]
        dset_minutely_y = hf.create_dataset("y_minutely", shape = y_minutely.shape, maxshape = (None,), chunks = True)
        dset_minutely_y[:] = y_minutely
                
        rcount_daily_x = x_daily.shape[0]
        dset_daily_x = hf.create_dataset("x_daily", shape = x_daily.shape, maxshape=(None, x_daily.shape[1], x_daily.shape[2]), chunks = True)
        dset_daily_x[:] = x_daily
                
        rcount_daily_y = y_daily.shape[0]
        dset_daily_y = hf.create_dataset("y_daily", shape = y_daily.shape, maxshape=(None,), chunks = True)
        dset_daily_y[:] = y_daily
                        
        for x_minutely_batch, y_minutely_batch, x_daily_batch, y_daily_batch in data_generation:

            print (">>>> Creating data files | Batch: ", i, end ="\r")
            print("\n")

            # Minutely data points
            
            dset_minutely_x.resize(rcount_minutely_x + x_minutely_batch.shape[0], axis = 0)
            dset_minutely_x[rcount_minutely_x:] = x_minutely_batch
            rcount_minutely_x += x_minutely_batch.shape[0]
            
            dset_minutely_y.resize(rcount_minutely_y + y_minutely_batch.shape[0], axis = 0)
            dset_minutely_y[rcount_minutely_y:] = y_minutely_batch
            rcount_minutely_y += y_minutely_batch.shape[0]
                        
            # Daily data points          

            dset_daily_x.resize(rcount_daily_x + x_daily_batch.shape[0], axis =0 )
            dset_daily_x[rcount_daily_x:] = x_daily_batch
            rcount_daily_x += x_daily_batch.shape[0]
            
            dset_daily_y.resize(rcount_daily_y + y_daily_batch.shape[0], axis = 0)
            dset_daily_y[rcount_daily_y:] = y_daily_batch
            rcount_daily_y += y_daily_batch.shape[0]            
            
            i += 1

    print (">>>>>>> Clean datasets have been created in file: `", filename_out + ".h5`")

In [29]:
#create_clean_data(filename_out = "clean_data.h5", minutely_data = minutely_data ,daily_data = daily_data, batch_size = batch_size, x_window_size = x_window_size, y_window_size = y_window_size)

print ("Done")

Done


### Load From Saved Data 

No need to run the code above (Start Here)

In [30]:
def generate_clean_data(filename, batch_size, start_index):
    with h5py.File(filename, 'r') as hf:
        i = start_index
        while True:
            x_minutely_data = hf["x_minutely"][i:i+batch_size]
            y_minutely_data = hf["y_minutely"][i:i+batch_size]
            x_daily_data = hf["x_daily"][i:i+batch_size]
            y_daily_data = hf["y_daily"][i:i+batch_size]         
            i += batch_size
            yield (x_minutely_data, y_minutely_data)
            
            """x_daily_data, y_daily_data"""

In [31]:
data_generate_training = generate_clean_data(filename = "clean_data.h5", batch_size = 512, start_index = 0)

In [32]:
i = 0

for x,y in data_generate_training:
    i +=1
    if i == 10:
        break
    print (i,x[511][200][31])
    

1 -0.846467
2 -0.558348
3 0.193369
4 1.38644
5 2.22658
6 2.46676
7 2.34207
8 2.01049
9 1.75559


### Simple LSTM Model

In [33]:
# Phased LSTM will be implemented later using : https://github.com/fferroni/PhasedLSTM-Keras/blob/master/phased_lstm_keras/PhasedLSTM.py

# Simple LSTM will only use minutely data

import time
from keras.layers import Dense, TimeDistributed, Activation, Dropout, LSTM
from keras.models import Sequential, load_model

Using TensorFlow backend.


In [34]:
true_values = []

In [35]:
def architecture(layers, dropout_rate):
    
    model = Sequential()
    
    model.add(LSTM(units = layers[1], input_shape = (None, layers[0]), return_sequences = True))
    model.add(Dropout(dropout_rate))
    
    model.add(LSTM(units = layers[2], return_sequences = False))
    model.add(Dropout(dropout_rate))
    
    model.add(Dense(units = layers[3]))
    model.add(Activation("tanh"))
    
    start_time = time.time()
    
    model.compile(loss = "mse", optimizer = "Adam")
    
    print (">>> Run Time: ", time.time()- start_time)
    
    return model    

In [36]:
def fit_model_threaded(model, data_generate_training, x_window_size, steps_per_epoch, epochs, dropout_rate, fileout):
    
    model = architecture([ncols, x_window_size, x_window_size, 1], dropout_rate = dropout_rate)
    
    model.fit_generator(generator = data_generate_training,
                       steps_per_epoch = steps_per_epoch,
                       epochs = epochs)
    
    model.save(fileout)
    
    print (">>>>> Model trained and saved in", fileout)
    
    return

In [37]:
with h5py.File("clean_data.h5", "r") as hf:
    nrows = hf["x_minutely"].shape[0]
    ncols = hf["x_minutely"].shape[2]
    
#print (nrows, ncols)

In [38]:
training_percentage = 0.796789911149326
epochs = 2

fileout = "saved_model.h5"

number_training = int(training_percentage * nrows)
steps_per_epoch = int((number_training / epochs) / batch_size)

print (">>> The clean data has", nrows, "data rows.", number_training, "rows with", steps_per_epoch, "steps per epoch")


>>> The clean data has 1786368 data rows. 1423359 rows with 1389 steps per epoch


In [39]:
model = architecture([ncols,x_window_size, x_window_size, 1], dropout_rate = dropout_rate)

>>> Run Time:  0.031200408935546875


In [40]:
#training = threading.Thread(target = fit_model_threaded, args = [model, data_generate_training,  x_window_size, steps_per_epoch, epochs, dropout_rate,fileout ] )

In [41]:
#training.start()

In [42]:
data_generate_testing = generate_clean_data(filename = "clean_data.h5", batch_size = 512, start_index = number_training)

In [43]:
number_testing = nrows - number_training
steps_per_epoch_test = int((number_testing) / batch_size)

print (">>> Testing the model on", number_testing, "data rows with", steps_per_epoch, "steps")

>>> Testing the model on 363009 data rows with 1389 steps


In [44]:
def generator_strip_xy(data_generate_testing, true_values):
    for x_minutely, y_minutely in data_generate_testing:
        true_values += list(y_minutely)
        yield x_minutely

In [None]:
predictions = model.predict_generator(generator_strip_xy(data_generate_testing, true_values), steps = steps_per_epoch_test)

In [56]:
with h5py.File("model_predictions.h5", "w") as hf:
    dset_p = hf.create_dataset("predictions", data = predictions)
    dset_y = hf.create_dataset("true_values", data = true_values)

NameError: name 'predictions' is not defined

In [47]:
import matplotlib.pyplot as plt

def plot_results(predicted_data, true_data):
    fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    plt.plot(predicted_data, label='Prediction')
    plt.legend()
    plt.show()

In [48]:
plot_results(predictions[:10000], true_values[:10000])

NameError: name 'predictions' is not defined

In [49]:
data_generate_testing = generate_clean_data(filename = "clean_data.h5", batch_size = 512, start_index = number_training)

In [50]:
data_x, true_values = next(data_generate_testing)

In [51]:
def predict_sequences_multiple(model, data, window_size, prediction_len):
    #Predict sequence of 50 steps before shifting prediction run forward by 50 steps
    prediction_seqs = []
    for i in range(int(len(data)/prediction_len)):
        curr_frame = data[i*prediction_len]
        predicted = []
        for j in range(prediction_len):
            predicted.append(model.predict(curr_frame[np.newaxis,:,:])[0,0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    return prediction_seqs

In [52]:
predictions_multiple = predict_sequences_multiple(
    model,
    data_x,
    data_x[0].shape[0],
    y_window_size)

In [53]:
def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    #Pad the list of predictions to shift it in the graph to it's correct start
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + data, label='Prediction')
        plt.legend()
    plt.show()

In [54]:
plot_results_multiple(predictions_multiple, true_values, y_window_size)

In [55]:
print ("Done")

Done
