# Data Extraction

In [6]:
from cryptocompare.blockchain_data import get_blockchain_data
from cryptocompare.ohlcv import get_ohlcv
import pandas as pd
import ta
from ta.volatility import AverageTrueRange
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import OnBalanceVolumeIndicator
from ta.volume import AccDistIndexIndicator

fsyms = ['ETH', 'BTC', 'DOGE', 'MATIC']

# Collect data
ohlcv_data = pd.DataFrame()
blockchain_data = pd.DataFrame()

for fsym in fsyms:
    ohlcv_data_temp = get_ohlcv(fsym)
    ohlcv_data_temp['symbol'] = fsym
    ohlcv_data = pd.concat([ohlcv_data, ohlcv_data_temp])

    blockchain_data_temp = get_blockchain_data(fsym)
    blockchain_data = pd.concat([blockchain_data, blockchain_data_temp])

ohlcv_data = ohlcv_data.reset_index().drop(columns=['index'])


---- Extracting price data for ETH ... ---- 

Run number: 1 

Response received
Date range: From 2017-01-25 00:00:00 To 2022-07-18 01:00:00
Run number: 2 

Response received
Date range: From 2011-08-05 01:00:00 To 2017-01-25 00:00:00
No more data received. Terminating the loop at run 2 

---- Extracting blockchain data for ETH ... ---- 

Run number: 1 

Response received
Date range: From 2017-01-25 00:00:00 To 2022-07-18 01:00:00
Run number: 2 

Response received
Date range: From 2011-08-05 01:00:00 To 2017-01-25 00:00:00
Run number: 3 

No more data received. Terminating the loop at run 3 

---- Extracting price data for BTC ... ---- 

Run number: 1 

Response received
Date range: From 2017-01-25 00:00:00 To 2022-07-18 01:00:00
Run number: 2 

Response received
Date range: From 2011-08-05 01:00:00 To 2017-01-25 00:00:00
Run number: 3 

Response received
Date range: From 2006-02-12 00:00:00 To 2011-08-05 01:00:00
No more data received. Terminating the loop at run 3 

---- Extracting bl

# Feature Engineering

## Load data from CSV file

In [34]:
import pandas as pd

blockhain_data = pd.read_csv('data/blockchain_data.csv')
blockhain_data = blockhain_data.drop(columns=['Unnamed: 0', 'index', 'id'])

ohlcv_data = pd.read_csv('data/ohlcv.csv')
ohlcv_data = ohlcv_data.drop(columns=['Unnamed: 0', 'level_0', 'time'])

In [35]:
# Merge ohlcv and blockhain data
df = pd.merge(ohlcv_data, blockhain_data, how='inner', on=['date', 'symbol'])

In [31]:
df

Unnamed: 0,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol,date,symbol,time,zero_balance_addresses_all_time,unique_addresses_all_time,new_addresses,active_addresses,transaction_count,transaction_count_all_time,large_transaction_count,average_transaction_value,block_height,hashrate,difficulty,block_time,block_size,current_supply
0,27.7900,0.7809,0.7812,5.358456e+04,1.486083e+05,2.7730,multiply,BTC,2015-08-07 01:00:00,ETH,1438905600,45,10306,10306,1389,2035,2035,3,987.052939,50613.0,0.088342,1.470839e+12,16.649383,632.0,7.227992e+07
1,2.5810,0.5958,2.7730,7.225580e+05,5.835435e+05,0.8076,multiply,BTC,2015-08-08 01:00:00,ETH,1438992000,110,10719,413,1207,2344,4379,6,5795.271048,55869.0,0.096483,1.586124e+12,16.439391,667.0,7.230786e+07
2,0.9581,0.6043,0.8076,7.371196e+05,5.475280e+05,0.7428,multiply,BTC,2015-08-09 01:00:00,ETH,1439078400,144,10993,274,1112,1312,5691,1,888.990757,60992.0,0.101360,1.709480e+12,16.865482,618.0,7.233504e+07
3,0.7628,0.5990,0.7428,5.859170e+05,4.011071e+05,0.6846,multiply,BTC,2015-08-10 01:00:00,ETH,1439164800,462,11668,675,1429,2027,7718,4,825.080113,66247.0,0.111855,1.837696e+12,16.429197,631.0,7.236286e+07
4,1.1650,0.6548,0.6846,1.479696e+06,1.567649e+06,1.0590,multiply,BTC,2015-08-11 01:00:00,ETH,1439251200,1774,13565,1897,2696,4955,12673,2,300.157758,71527.0,0.124450,2.036391e+12,16.363137,692.0,7.239089e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11196,0.7155,0.6132,0.6385,2.643461e+08,1.739811e+08,0.7090,direct,,2022-07-14 01:00:00,MATIC,1657756800,950867,1389609,1706,3795,5997,4007089,187,37940.216530,,,,,,8.006804e+09
11197,0.7344,0.6760,0.7090,1.595893e+08,1.121123e+08,0.7032,direct,,2022-07-15 01:00:00,MATIC,1657843200,952043,1391227,1618,3901,5486,4012575,134,24568.470012,,,,,,8.006804e+09
11198,0.7439,0.6823,0.7032,1.050752e+08,7.529363e+07,0.7292,direct,,2022-07-16 01:00:00,MATIC,1657929600,953141,1393168,1941,4082,5449,4018024,68,9627.543546,,,,,,8.006804e+09
11199,0.7891,0.7188,0.7292,1.483191e+08,1.125593e+08,0.7525,direct,,2022-07-17 01:00:00,MATIC,1658016000,954252,1395153,1985,4122,5920,4023944,66,11709.433129,,,,,,8.006804e+09


In [51]:
from cryptocompare.blockchain_data import get_blockchain_data
from cryptocompare.ohlcv import get_ohlcv
import pandas as pd
import ta
from ta.volatility import AverageTrueRange
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import OnBalanceVolumeIndicator
from ta.volume import AccDistIndexIndicator

# Feature Engineering
features = pd.DataFrame(ohlcv_data[['symbol', 'date']])
fsyms = ['ETH', 'BTC', 'DOGE', 'MATIC']

for fsym in fsyms:
	rsi = RSIIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		window=14
	)

	macd = MACD(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close']
	)

	bollinger_bands = BollingerBands(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close']
	)

	obv = OnBalanceVolumeIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['volumefrom']
	)

	adi = AccDistIndexIndicator(
		ohlcv_data[ohlcv_data['symbol'] == fsym]['high'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['low'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['close'],
		ohlcv_data[ohlcv_data['symbol'] == fsym]['volumefrom']
	)

	features.loc[features['symbol'] == fsym, 'rsi'] = rsi.rsi()
	features.loc[features['symbol'] == fsym, 'macd_line'] = macd.macd()
	features.loc[features['symbol'] == fsym, 'macd_signal'] = macd.macd_signal()
	features.loc[features['symbol'] == fsym, 'macd_histogram'] = macd.macd_diff()
	features.loc[features['symbol'] == fsym, 'mavg'] = bollinger_bands.bollinger_mavg()
	features.loc[features['symbol'] == fsym, 'bollinger_hband'] = bollinger_bands.bollinger_hband()
	features.loc[features['symbol'] == fsym, 'bollinger_lband'] = bollinger_bands.bollinger_lband()
	features.loc[features['symbol'] == fsym, 'OBV'] = obv.on_balance_volume()
	features.loc[features['symbol'] == fsym, 'ADI'] = adi.acc_dist_index()

# Create outcomes DataFrame
outcomes = pd.DataFrame(ohlcv_data[['symbol', 'date']])

for fsym in fsyms:
	outcomes.loc[outcomes['symbol'] == fsym, 'close_1'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(1)
	# outcomes['close_1'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(1)
	# outcomes['close_5'] = ohlcv_data[ohlcv_data['symbol'] == fsym]['close'].pct_change(5)
	# outcomes['close_10'] = ohlcv_data[ohlcv_data['symbol'] ==  fsym]['close'].pct_change(10)


In [53]:
features_eth = features[features['symbol'] == 'ETH']
outcomes_eth = outcomes[outcomes['symbol'] == 'ETH']


In [56]:
outcomes_eth

Unnamed: 0,symbol,date,close_1,close_5,close_10
0,ETH,2015-08-07 01:00:00,,,
1,ETH,2015-08-08 01:00:00,,,
2,ETH,2015-08-09 01:00:00,,,
3,ETH,2015-08-10 01:00:00,,,
4,ETH,2015-08-11 01:00:00,,,
...,...,...,...,...,...
2534,ETH,2022-07-14 01:00:00,,,
2535,ETH,2022-07-15 01:00:00,,,
2536,ETH,2022-07-16 01:00:00,,,
2537,ETH,2022-07-17 01:00:00,,,


In [57]:
outcomes


Unnamed: 0,symbol,date,close_1,close_5,close_10
0,ETH,2015-08-07 01:00:00,,,
1,ETH,2015-08-08 01:00:00,,,
2,ETH,2015-08-09 01:00:00,,,
3,ETH,2015-08-10 01:00:00,,,
4,ETH,2015-08-11 01:00:00,,,
...,...,...,...,...,...
11188,MATIC,2022-07-14 01:00:00,0.110415,0.202714,0.444580
11189,MATIC,2022-07-15 01:00:00,-0.008181,0.231524,0.371830
11190,MATIC,2022-07-16 01:00:00,0.036974,0.295665,0.388688
11191,MATIC,2022-07-17 01:00:00,0.031953,0.404966,0.337302


In [49]:
outcomes['direction'] = outcomes['close_1'].apply(lambda x: 1 if x > 0 else -1)

In [50]:
outcomes

Unnamed: 0,close_1,close_5,close_10,direction
0,,,,-1
1,-0.708763,,,-1
2,-0.080238,,,-1
3,-0.078352,,,-1
4,0.546889,,,1
...,...,...,...,...
2534,0.069973,-0.019704,0.037180,1
2535,0.032426,0.054758,0.087697,1
2536,0.101589,0.237882,0.144170,1
2537,-0.013522,0.289771,0.081940,-1


In [None]:
split_point = int(0.80*len(df))