In [1]:
import pandas as pd
import math
import datetime
import pandas as pd
from ta.momentum import RSIIndicator, WilliamsRIndicator, UltimateOscillator
from ta.trend import SMAIndicator, EMAIndicator, MACD
from ta.volatility import BollingerBands
from ta.volume import OnBalanceVolumeIndicator
from ta.volume import AccDistIndexIndicator


blockchain_data = pd.read_csv('data/blockchain_data.csv')
blockchain_data = blockchain_data.drop(columns=['Unnamed: 0', 'index', 'id'])
blockchain_data = blockchain_data.drop_duplicates(subset=['date', 'symbol'], keep='first')

ohlcv_data = pd.read_csv('data/ohlcv_data.csv')
ohlcv_data = ohlcv_data.drop(columns=['Unnamed: 0', 'level_0', 'time'])
ohlcv_data = ohlcv_data.drop_duplicates(subset=['date', 'symbol'], keep='first')




# Feature Engineering
features = pd.DataFrame(ohlcv_data[['symbol', 'date']])
fsyms = ['ETH', 'BTC', 'DOGE', 'MATIC']

for fsym in fsyms:
	sma_5 = SMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=5)
	sma_30 = SMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=30)
	sma_60 = SMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=60)

	ema_5 = EMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=5)
	ema_30 = EMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=30)
	ema_60 = EMAIndicator(close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'], window=60)	

	macd = MACD(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		window_slow=26, 
		window_fast=12, 
		window_sign=9
	)

	rsi = RSIIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		window=14)
	

	obv = OnBalanceVolumeIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['volumefrom']
	)

	adi = AccDistIndexIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['high'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['low'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['volumefrom']
	)

	willr = WilliamsRIndicator(
		high=ohlcv_data[ohlcv_data['symbol'] == fsym]['high'], 
		low=ohlcv_data[ohlcv_data['symbol'] == fsym]['low'],
		close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		lbp=14
	)

	ult_osc = UltimateOscillator(
		high=ohlcv_data[ohlcv_data['symbol'] == fsym]['high'],
		low=ohlcv_data[ohlcv_data['symbol'] == fsym]['low'],
		close=ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		window1=7,
		window2=14,
		window3=28
	)

	features.loc[features['symbol'] == fsym, 'sma_5'] = sma_5.sma_indicator()
	features.loc[features['symbol'] == fsym, 'sma_30'] = sma_30.sma_indicator()
	features.loc[features['symbol'] == fsym, 'sma_60'] = sma_60.sma_indicator()

	features.loc[features['symbol'] == fsym, 'ema_5'] = ema_5.ema_indicator()
	features.loc[features['symbol'] == fsym, 'ema_30'] = ema_30.ema_indicator()
	features.loc[features['symbol'] == fsym, 'ema_60'] = ema_60.ema_indicator()

	features.loc[features['symbol'] == fsym, 'rsi'] = rsi.rsi()
	features.loc[features['symbol'] == fsym, 'macd_diff'] = macd.macd_diff()

	features.loc[features['symbol'] == fsym, 'OBV'] = obv.on_balance_volume()
	features.loc[features['symbol'] == fsym, 'ADI'] = adi.acc_dist_index()
	features.loc[features['symbol'] == fsym, 'WILLR'] = willr.williams_r()

	features.loc[features['symbol'] == fsym, 'ULTOSC'] = ult_osc.ultimate_oscillator()

# Create outcomes DataFrame
outcomes = pd.DataFrame(ohlcv_data[['symbol', 'date']])

for fsym in fsyms:
	outcomes.loc[outcomes['symbol'] == fsym, 'close_1'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(1)
	outcomes.loc[outcomes['symbol'] == fsym, 'close_3'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(3)
	outcomes.loc[outcomes['symbol'] == fsym, 'close_5'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(5)
	outcomes.loc[outcomes['symbol'] == fsym, 'close_7'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(7)

	outcomes.loc[outcomes['symbol'] == fsym, 'direction_1'] = outcomes.loc[outcomes['symbol'] == fsym, 'close_1'].apply(lambda x: 1 if x > 0 else 0)
	outcomes.loc[outcomes['symbol'] == fsym, 'direction_3'] = outcomes.loc[outcomes['symbol'] == fsym, 'close_3'].apply(lambda x: 1 if x > 0 else 0)
	outcomes.loc[outcomes['symbol'] == fsym, 'direction_5'] = outcomes.loc[outcomes['symbol'] == fsym, 'close_5'].apply(lambda x: 1 if x > 0 else 0)
	outcomes.loc[outcomes['symbol'] == fsym, 'direction_7'] = outcomes.loc[outcomes['symbol'] == fsym, 'close_7'].apply(lambda x: 1 if x > 0 else 0)


features_eth = features[features['symbol'] == 'ETH']
features_eth = features_eth.drop(columns=['symbol'])
features_eth['date'] = features_eth.date.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
features_eth = features_eth.set_index('date')

outcomes_eth = outcomes[outcomes['symbol'] == 'ETH']
outcomes_eth['date'] = outcomes_eth.date.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
outcomes_eth = outcomes_eth.set_index('date')


nonnull_index = outcomes_eth.merge(features_eth, how='inner', on='date').dropna().index

features_eth = features_eth.loc[nonnull_index, :]
outcomes_eth = outcomes_eth.loc[nonnull_index, :]


features_eth_blockchain = blockchain_data[blockchain_data['symbol'] == 'ETH']

features_eth_blockchain = features_eth_blockchain.drop(columns=['symbol', 'time'])
features_eth_blockchain['date'] = features_eth_blockchain.date.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
features_eth_blockchain = features_eth_blockchain.set_index('date')

# nonnull_index = outcomes_eth.merge(features_eth_blockchain, how='inner', on='date').dropna().index

features_eth_blockchain = features_eth_blockchain.loc[nonnull_index, :]
# outcomes_eth = outcomes_eth.loc[nonnull_index, :]

print(features_eth.shape, features_eth_blockchain.shape, outcomes_eth.shape)

(2491, 12) (2491, 14) (2491, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outcomes_eth['date'] = outcomes_eth.date.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))


In [13]:
import numpy as np

# Set the sequence length - this is the timeframe used to make a single prediction
sequence_length = 50

# Prediction Index
index_Close = ohlcv_data.close

# Split the training data into train and train data sets
# As a first step, we get the number of rows to train the model on 80% of the data 
train_data_len = math.ceil(features_eth.shape[0] * 0.8)

# Create the training and test data
train_data = features_eth.iloc[:train_data_len, :]
test_data = features_eth.iloc[train_data_len - sequence_length:, :]

# The RNN needs data with the format of [samples, time steps, features]
# Here, we create N samples, sequence_length time steps per sample, and 6 features
def partition_dataset(sequence_length, data):
    x, y = [], []
    data_len = data.shape[0]
    for i in range(sequence_length, data_len):
        x.append(data.iloc[i-sequence_length:i,:]) #contains sequence_length values 0-sequence_length * columsn
        y.append(data.iloc[i, index_Close]) #contains the prediction values for validation,  for single-step prediction
    
    # Convert the x and y to numpy arrays
    x = np.array(x)
    y = np.array(y)
    return x, y

# Generate training data and test data
x_train, y_train = partition_dataset(sequence_length, train_data)
x_test, y_test = partition_dataset(sequence_length, test_data)

# Print the shapes: the result is: (rows, training_sequence, features) (prediction value, )
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Validate that the prediction value and the input match up
# The last close price of the second input sample should equal the first prediction value
print(x_train[1][sequence_length-1][index_Close])
print(y_train[0])

IndexError: positional indexers are out-of-bounds

In [12]:
partition_dataset(sequence_length, train_data)


InvalidIndexError: (slice(0, 50, None), slice(None, None, None))