The goal of this project is to analyze the four significant cryptocurrencies: Bitcoin, LightCoin, Ethereum, and Bitcoin Cash. Based on the analysis, predict the future tradings and recommend whether to buy or sell one of these bitcoins. The attributes we are going to use for prediction are the closing stocks of the cryptocurrencies, as mentioned earlier. 

In [7]:
import pandas as pd
cryptos = ['BCH-USD', 'BTC-USD', 'ETH-USD', 'LTC-USD']
datasets = {}
for crypto in cryptos:
#     print(f'crypto_data/{crypto}.csv')
    datasets[crypto] = pd.read_csv('crypto_data/'+ str(crypto) +'.csv', names=['time', 'low', 'high', 'open', 'close', 'volume'])
    
datasets['BCH-USD'].head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,871.650024,871.72998,871.650024,871.719971,5.675361
1,1528968720,870.859985,871.719971,871.719971,870.859985,26.856577
2,1528968780,870.099976,871.090027,871.090027,870.099976,1.1243
3,1528968840,868.830017,870.950012,868.830017,870.789978,1.749862
4,1528968900,870.0,870.0,870.0,870.0,1.6805


In [11]:
for cryp in datasets:
    print('Summary of the cryptocurrency:', cryp)
    print(datasets[cryp].describe().iloc[1:, 1:])

Summary of the cryptocurrency: LTC-USD
             low        high        open       close        volume
mean   77.074011   77.135464   77.106147   77.106538    117.740514
std    12.665598   12.666443   12.665300   12.665563    285.366124
min    49.330002   49.599998   49.580002   49.560001      0.000060
25%    72.500000   72.540001   72.519997   72.519997     12.073304
50%    79.870003   79.930000   79.900002   79.900002     34.492237
75%    84.379997   84.440002   84.405003   84.410004    104.343212
max   102.699997  103.059998  103.000000  103.040001  10263.191406
Summary of the cryptocurrency: BTC-USD
              low         high         open        close      volume
mean  6771.924414  6775.026861  6773.515356  6773.521546    6.026660
std    641.329630   641.386880   641.351570   641.354134   16.640351
min   5777.000000  5787.259766  5781.609863  5778.109863    0.001915
25%   6340.000000  6342.987671  6341.479980  6341.470215    0.824953
50%   6535.000000  6538.000000  6536.3950

### Combine the datasets so that these cryptos share time constants. Then we are going to use only the closing prices and volumes for each crypto and remove other columns. 

In [68]:
combined_df = pd.merge(datasets['BCH-USD'], datasets['BTC-USD'], how='inner', on='time').merge(datasets['ETH-USD'], how='inner', on='time').merge(datasets['LTC-USD'], how='inner', on='time')
combined_df = combined_df.iloc[:, [0, 4, 5, 9, 10, 13, 14, 17, 18]]
combined_df.columns = ['TIME', 'BCH-USD_CL', 'BCH-USD_VOL', 'BTC-USD_CL', 'BTC-USD_VOL', 'ETH-USD_CL', 'ETH-USD_VOL', 'LTC-USD_CL', 'LTC-USD_VOL']
print(datasets['LTC-USD'].shape)
print(datasets['ETH-USD'].shape)
print(combined_df.shape)
combined_df.head(10)

(101883, 6)
(102831, 6)
(86117, 9)


Unnamed: 0,TIME,BCH-USD_CL,BCH-USD_VOL,BTC-USD_CL,BTC-USD_VOL,ETH-USD_CL,ETH-USD_VOL,LTC-USD_CL,LTC-USD_VOL
0,1528968720,870.859985,26.856577,6487.379883,7.706374,486.019989,486.01001,96.669998,96.589996
1,1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,486.0,96.57,96.57
2,1528968840,870.789978,1.749862,6479.410156,1.4041,486.0,485.75,96.57,96.57
3,1528968900,870.0,1.6805,6479.97998,0.753,485.75,486.0,96.540001,96.5
4,1528968960,869.98999,1.669014,6480.0,1.4909,486.0,486.0,96.519997,96.459999
5,1528969020,869.450012,0.8652,6477.220215,2.73195,485.98999,485.98999,96.529999,96.510002
6,1528969080,869.98999,23.534929,6480.0,2.17424,485.98999,485.98999,96.529999,96.480003
7,1528969140,870.0,2.3,6479.990234,0.9031,485.98999,485.98999,96.550003,96.480003
8,1528969200,870.320007,9.255514,6478.660156,3.258786,486.0,485.98999,96.43,96.43
9,1528969260,870.650024,2.7956,6478.660156,1.970352,486.0,486.0,96.400002,96.400002


### Create a new column called target that shows whether five minutes later the stock price will increase or decrease 

In [69]:
SEQ_TRAIN_LEN = 120 # observe stock price of the past 120 minutes
SEQ_PRED_LEN = 5 # predict stock price 5 minutes into the future
CRYTO_PRED = "BCH-USD" # predict closing stock of the cryptocurrency

# if the future price is higher then we should buy that crypto
# therefore 1 represents recommend buy the crypto
# 0 represents recommend sell the crypto
def redefine_target(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
    
combined_df['TARGET'] = combined_df[CRYTO_PRED + '_CL'].shift(-SEQ_PRED_LEN)

combined_df['TARGET'] = list(map(redefine_target, combined_df[CRYTO_PRED + '_CL'], combined_df['TARGET']))
combined_df.head(10)

Unnamed: 0,TIME,BCH-USD_CL,BCH-USD_VOL,BTC-USD_CL,BTC-USD_VOL,ETH-USD_CL,ETH-USD_VOL,LTC-USD_CL,LTC-USD_VOL,TARGET
0,1528968720,870.859985,26.856577,6487.379883,7.706374,486.019989,486.01001,96.669998,96.589996,0
1,1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,486.0,96.57,96.57,0
2,1528968840,870.789978,1.749862,6479.410156,1.4041,486.0,485.75,96.57,96.57,0
3,1528968900,870.0,1.6805,6479.97998,0.753,485.75,486.0,96.540001,96.5,1
4,1528968960,869.98999,1.669014,6480.0,1.4909,486.0,486.0,96.519997,96.459999,1
5,1528969020,869.450012,0.8652,6477.220215,2.73195,485.98999,485.98999,96.529999,96.510002,1
6,1528969080,869.98999,23.534929,6480.0,2.17424,485.98999,485.98999,96.529999,96.480003,1
7,1528969140,870.0,2.3,6479.990234,0.9031,485.98999,485.98999,96.550003,96.480003,1
8,1528969200,870.320007,9.255514,6478.660156,3.258786,486.0,485.98999,96.43,96.43,1
9,1528969260,870.650024,2.7956,6478.660156,1.970352,486.0,486.0,96.400002,96.400002,1


In [70]:
combined_df = combined_df.sort_values('TIME')
combined_df.tail(20)
# print(combined_df.shape)
# combined_df.dropna(inplace=True)
# print(combined_df.shape)

Unnamed: 0,TIME,BCH-USD_CL,BCH-USD_VOL,BTC-USD_CL,BTC-USD_VOL,ETH-USD_CL,ETH-USD_VOL,LTC-USD_CL,LTC-USD_VOL,TARGET
86097,1535214060,531.719971,3.638192,6709.259766,0.763849,279.079987,279.079987,58.02,58.02,0
86098,1535214120,532.090027,0.760812,6707.77002,4.130928,279.089996,279.079987,58.02,58.009998,0
86099,1535214180,532.200012,0.30529,6707.77002,0.505357,279.089996,279.359985,58.02,58.02,0
86100,1535214240,532.190002,0.383416,6708.0,0.520057,279.359985,279.359985,58.02,58.02,0
86101,1535214300,531.400024,2.457184,6708.0,1.762736,279.350006,279.359985,58.02,58.009998,1
86102,1535214360,531.679993,0.30883,6707.810059,1.297226,279.350006,279.350006,57.990002,57.990002,0
86103,1535214420,531.630005,1.565413,6707.799805,2.604481,279.359985,279.359985,57.990002,57.990002,0
86104,1535214480,531.630005,0.016864,6707.759766,2.444304,279.359985,279.100006,57.990002,57.990002,0
86105,1535214540,531.469971,3.007327,6705.740234,3.068317,279.089996,279.089996,57.990002,57.990002,1
86106,1535214600,531.47998,0.18162,6705.740234,2.210403,279.089996,279.089996,58.009998,57.990002,0


### Separate the last 10% data for prediction.

In [71]:
time_values = combined_df['TIME'].values
last_ten_perc_start = time_values[int(len(time_values) * 0.9)]
train_set = combined_df.loc[combined_df['TIME'] <= last_ten_perc_start]
test_set = combined_df.loc[combined_df['TIME'] > last_ten_perc_start]
print(train_set.shape)
print(test_set.shape)
print(train_set.shape[0] / (train_set.shape[0] + test_set.shape[0]))
print(last_ten_perc_start)

(77506, 10)
(8611, 10)
0.9000081284763751
1534586340


In [82]:
from sklearn import preprocessing as prep
from collections import deque
import numpy as np
import random


def preprocess(df):
    new_df = df.copy(deep=True)
    for col in df.columns.values:
        if col != 'TIME' and col != 'TARGET':
            new_df[col] = new_df[col].pct_change()
            new_df.dropna(inplace=True)
            new_df[col] = prep.scale(new_df[col].values)
            new_df.dropna(inplace=True)
    new_df.dropna(inplace=True)
    
    seq_data = []
    prev_days = deque(maxlen=SEQ_TRAIN_LEN)
    
    for i in new_df.values:
        prev_days.append([x for x in i[1:-1]])
        
        if len(prev_days) == SEQ_TRAIN_LEN:
            seq_data.append([np.array(prev_days), i[-1]])
            
    random.shuffle(seq_data)
    
    # balancing the data (the number of zeros and ones should be same)
    buy = []
    sell = []
    
    for seq, tgt in seq_data:
        if tgt == 0:
            sell.append([seq, tgt])
        else:
            buy.append([seq, tgt])
            
    random.shuffle(buy)
    random.shuffle(sell)
    
    low = min(len(buy), len(sell))
    buy = buy[:low]
    sell =sell[:low]
    
    seq_data = buy + sell
    
    random.shuffle(seq_data)
    
    X = []
    y = []
    
    for seq, tgt in seq_data:
        X.append(seq)
        y.append(tgt)
        
    return np.array(X), y
    
train_x, train_y = preprocess(train_set)
test_x, test_y = preprocess(test_set)

import shelve
crypt_db = shelve.open('crypt.db')
crypt_db['train_x'] = train_x
crypt_db['train_y'] = train_y
crypt_db['test_x'] = test_x
crypt_db['test_y'] = test_y

In [84]:
import shelve
crypt_db = shelve.open('crypt.db')
train_x = crypt_db['train_x']
train_y = crypt_db['train_y']
test_x = crypt_db['test_x']
test_y = crypt_db['test_y']
print(len(crypt_db['train_x']))
print(len(crypt_db['test_x']))
print(len(crypt_db['train_y']))
print(len(crypt_db['test_y']))

71078
7606
71078
7606


In [88]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

EPOCHS = 5
BATCH_SIZE = 64

model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))


adam_optimizer = tf.keras.optimizers.Adam(lr=0.001, decay=1e-5)

model.compile(loss='sparse_categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_model', verbose=2, save_best_only=True, mode='max', monitor='val_acc')

history = model.fit(train_x, train_y, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(test_x, test_y), callbacks=[checkpoint])

Train on 71078 samples, validate on 7606 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.56114, saving model to best_model
Epoch 2/5
  640/71078 [..............................] - ETA: 1:07:06 - loss: 0.6851 - acc: 0.5594

KeyboardInterrupt: 