In [1]:
import json
import requests
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
# import torch
# import torch.nn as nn
# import torch

In [2]:
# Replace with your own path
technicals = True

df = pd.read_csv('/Users/timwu0/Documents/CS329P/afterhours_crypto/preprocessing/crypto_data.csv')
coins = df['coin'].unique()
# print(coins)

#convert coin to one-hot vectors
for coin in coins:
    df[coin] = df['coin'] == coin



In [3]:
def relative_strength_idx(df, n=14):
    close = df['close']
    delta = close.diff()
    # delta = delta[1:]
    pricesUp = delta.copy()
    pricesDown = delta.copy()
    pricesUp[pricesUp < 0] = 0
    pricesDown[pricesDown > 0] = 0
    rollUp = pricesUp.rolling(n).mean()
    rollDown = pricesDown.abs().rolling(n).mean()
    rs = rollUp / rollDown
    rs.fillna(1.0, inplace=True)
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi

def MACD(df, short=12, long=26):
    EMA_short = pd.Series(df['close'].ewm(span=short, min_periods=short).mean())
    EMA_long = pd.Series(df['close'].ewm(span=long, min_periods=long).mean())
    return EMA_short - EMA_long 
'''
if technicals:
    RSI = pd.Dataframe()
    MACD = pd.Dataframe()
    MACD_signal = pd.Dataframe()
    for coin in coins:
        RSI = pd.concat([RSI, relative_strength_idx(df.loc[df[coin]]).fillna(50)])
        df['MACD'] = pd.Series(MACD(df)).fillna(0)
        df['MACD_signal'] = pd.Series(df.MACD.ewm(span=9, min_periods=9).mean()).fillna(0)
'''
if technicals:
    df['RSI'] = relative_strength_idx(df)
    df['MACD'] = pd.Series(MACD(df)).fillna(0)
    df['MACD_signal'] = pd.Series(df.MACD.ewm(span=9, min_periods=9).mean()).fillna(0)
df

Unnamed: 0,date,high,low,open,close,volume,adjclose,coin,p,BTC-USD,...,AVAX-USD,LTC-USD,XMR-USD,ETC-USD,REP-USD,MAID-USD,STEEM-USD,RSI,MACD,MACD_signal
0,1483228800,1003.080017,958.698975,963.658020,998.325012,147775008.0,998.325012,BTC-USD,1.035974,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
1,1483315200,1031.390015,996.702026,998.617004,1021.750000,222184992.0,1021.750000,BTC-USD,1.023165,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
2,1483401600,1044.079956,1021.599976,1021.599976,1043.839966,185168000.0,1043.839966,BTC-USD,1.021770,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
3,1483488000,1159.420044,1044.400024,1044.400024,1154.729980,344945984.0,1154.729980,BTC-USD,1.105640,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
4,1483574400,1191.099976,910.416992,1156.729980,1013.380005,510199008.0,1013.380005,BTC-USD,0.876073,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22620,1637712000,0.673348,0.607530,0.621976,0.612812,27702150.0,0.612812,STEEM-USD,0.985266,False,...,False,False,False,False,False,False,True,50.727463,0.002363,0.002923
22621,1637798400,0.695432,0.623388,0.643072,0.631901,45568311.0,0.631901,STEEM-USD,0.982629,False,...,False,False,False,False,False,False,True,54.625830,0.003910,0.003120
22622,1637884800,0.819299,0.673438,0.683161,0.702681,204342166.0,0.702681,STEEM-USD,1.028573,False,...,False,False,False,False,False,False,True,66.873809,0.010723,0.004641
22623,1637971200,0.734130,0.674832,0.703990,0.706345,38813453.0,0.706345,STEEM-USD,1.003345,False,...,False,False,False,False,False,False,True,66.795869,0.016231,0.006959


In [30]:
def train_test_split(df, test_size, train_size):
    train_data = pd.DataFrame(columns=df.columns)
    test_data = pd.DataFrame(columns=df.columns)
    for coin in coins:
        df_coin = df.loc[df['coin'] == coin]
        
        split = int(train_size * len(df_coin))
        test_end = min(int((train_size + test_size) * len(df_coin)), len(df_coin))
        
        train_data = pd.concat([train_data, df_coin.iloc[:split]]) 
        test_data = pd.concat([test_data, df_coin.iloc[split:test_end]]) 
        # print(train_data.tail())
    
    # print(train_data.columns)
    mean_p = train_data['p'].mean()
    std_p = train_data['p'].std()
    train_data['p'] = ((train_data['p']-mean_p)/std_p)#.round(1)
    test_data['p'] = ((test_data['p']-mean_p)/std_p)#.round(1)
    

    return train_data.drop(['coin'], axis=1), test_data.drop(['coin'], axis=1)
train, test = train_test_split(df, 0.15, 0.5)
print(train.shape, test.shape)

(11307, 27) (3388, 27)


In [31]:
window_len = 22
test_size = 0.15

def extract_window_data(df, window_len):
    window_data = []
    
    for idx in range(len(df) - window_len):
        tmp = df[idx: (idx + window_len)].copy()
        
        window_data.append(tmp.values)
    return np.array(window_data)

def prepare_data(df, target_col, window_len, test_size, train_size):
    train_data, test_data = train_test_split(df, test_size=test_size, train_size=train_size)
    # print(train_data.shape)
    x_train = extract_window_data(train_data, window_len)
    # print(X_train.shape)
    x_test = extract_window_data(test_data, window_len)
    
    y_train = train_data[target_col][window_len:].values
    y_test = test_data[target_col][window_len:].values

    #print(len(train_data[target_col][:-window_len].values-1))
    #print(len(y_train))
    
    # print(X_train, y_train)

    return train_data, test_data, x_train, x_test, y_train, y_test


In [32]:
epochs=5
loss='mse'
optimizer = 'adam'
dropout=0.5
num_layers=4
cell_size=8
dense_units=95
technicals=False
model_type='rnn'
neurons=100

In [37]:
folds=5
min_train_size=0.4
test_size=0.15

train_sizes = []
for k in range(folds):
    train_sizes.append(min_train_size + k / (folds - 1) * (1 - (test_size + min_train_size)))
for train_size in train_sizes:
    _, _, _, x_test, _, y_test = prepare_data(df, 'p', window_len=window_len, test_size=test_size, train_size=train_size)
    print(x_test[0], y_test[0])

[[1545004800 3597.91796875 3253.123046875 3253.123046875 3545.86474609375
  5409247918.0 3545.86474609375 0.9921496141496653 True False False False
  False False False False False False False False False False False False
  39.15272677942676 -408.3693097567161 -465.0253310046285]
 [1545091200 3701.349365234375 3487.169189453125 3544.761474609375
  3696.05908203125 5911325473.0 3696.05908203125 0.43460433339490817 True
  False False False False False False False False False False False False
  False False False 42.29552339765154 -361.698872620369
  -444.36003932777663]
 [1545177600 3949.322998046875 3687.22998046875 3706.824951171875
  3745.95068359375 6810689119.0 3745.95068359375 0.055956950255685665
  True False False False False False False False False False False False
  False False False False 49.73878099701321 -317.0318340257054
  -418.8943982673624]
 [1545264000 4191.228515625 3728.974609375 3742.195068359375
  4134.44140625 8927129279.0 4134.44140625 1.1669270359577029 True Fal

[[1579824000 8514.6669921875 8266.8408203125 8405.5673828125
  8445.4345703125 24397913026.0 8445.4345703125 0.0025809878916814265
  True False False False False False False False False False False False
  False False False False 56.669991850924 276.36505065916936
  312.1936611765318]
 [1579910400 8458.453125 8296.21875 8440.119140625 8367.84765625
  19647331549.0 8367.84765625 -0.17531770852973314 True False False False
  False False False False False False False False False False False False
  58.09928740398727 243.4809193756737 298.4511128163602]
 [1579996800 8602.4013671875 8325.498046875 8364.41015625 8596.830078125
  22177678796.0 8596.830078125 0.3106770556473585 True False False False
  False False False False False False False False False False False False
  59.567109329941886 233.2086915467171 285.4026285624316]
 [1580083200 8977.7265625 8597.30859375 8597.30859375 8909.8193359375
  28647338393.0 8909.8193359375 0.4251653399513515 True False False False
  False False False Fa

[[1614902400 49396.4296875 46542.515625 48527.03125 48927.3046875
  48625928883.0 48927.3046875 0.042138828690909846 True False False False
  False False False False False False False False False False False False
  36.6466578202507 1506.0809695799326 2226.1298573892827]
 [1614988800 49147.21875 47257.52734375 48899.23046875 48912.3828125
  34363564661.0 48912.3828125 -0.06706889592959633 True False False False
  False False False False False False False False False False False False
  36.107801060300176 1398.0561749491753 2060.515120901261]
 [1615075200 51384.3671875 48918.6796875 48918.6796875 51206.69140625
  43137459378.0 51206.69140625 0.5693700981345616 True False False False
  False False False False False False False False False False False False
  38.14947663836393 1480.5108684014413 1944.5142704012972]
 [1615161600 52314.0703125 49506.0546875 51174.1171875 52246.5234375
  48597428048.0 52246.5234375 0.21605513035659993 True False False False
  False False False False False Fa

In [34]:
from rnn_model import rnn
from lstm_model import lstm

# min_MSE = 999# np.float(inf)
# min_MSE_k = 0
MSE_sum = 0
for i, train_size in enumerate(train_sizes):
    train, test, x_train, x_test, y_train, y_test = prepare_data(df, 'p', window_len=window_len, test_size=test_size, train_size=train_size)
    features = tf.convert_to_tensor(x_train, dtype=tf.float32)
    labels = tf.expand_dims(tf.convert_to_tensor(y_train, dtype=tf.float32), axis=1)
    print('Size of fold train data:', features.shape, labels.shape)
    if model_type == 'rnn':
        model = rnn(features=features, 
            labels=labels, 
            dropout=dropout, 
            num_layers=num_layers, 
            cell_size=cell_size, 
            dense_units=dense_units,
            technicals=technicals)
    elif model_type == 'lstm':
        model = lstm(features=features, 
            labels=labels, 
            dropout=dropout, 
            neurons=neurons, 
            dense_units=dense_units,
            technicals=technicals)
    model.fit(features, labels, epochs=epochs, shuffle=True)
    preds = model.predict(tf.convert_to_tensor(x_test, dtype=tf.float32))
    MSE = ((preds - y_test) ** 2).mean()
    print('Preds: ', y_test.shape, preds)
    #if MSE < min_MSE:
    #    min_MSE = min(min_MSE, MSE)
    #    min_MSE_k = i + 1
    MSE_sum += MSE 
        
    print('Fold number', i + 1, 'MSE: ', MSE)
print('MSE: ', MSE_sum/len(train_sizes))
    

Size of fold train data: (9020, 22, 27) (9020, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3367,) [[0.00374845]
 [0.00374845]
 [0.00374845]
 ...
 [0.00372928]
 [0.00372928]
 [0.00372928]]
Fold number 1 MSE:  0.3721344972963347
Size of fold train data: (11562, 22, 27) (11562, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3378,) [[0.00170314]
 [0.00170314]
 [0.00170314]
 ...
 [0.00296891]
 [0.00296891]
 [0.00296891]]
Fold number 2 MSE:  0.5775268888390612
Size of fold train data: (14115, 22, 27) (14115, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3367,) [[0.00169149]
 [0.00169149]
 [0.00169149]
 ...
 [0.00533524]
 [0.00533524]
 [0.00533524]]
Fold number 3 MSE:  0.6249110957895857
Size of fold train data: (16658, 22, 27) (16658, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3367,) [[0.00082284]
 [0.00082284]
 [0.00082284]
 ...
 [0.00136542]
 [0.00136542]
 [0.00136542]]
Fold number 4 MSE:  1.1623611365137196
Size of fold t

In [None]:
preds = model.predict(tf.convert_to_tensor(x_test, dtype=tf.float32))

print(preds[:20])
print(y_test[:20])

print(y_test.mean(), preds.mean(), (y_test - preds).mean())

print('MAE: ', (np.absolute(preds - y_test)).mean())
print('MSE: ', ((preds - y_test) ** 2).mean())
