In [18]:
import json
import requests
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
# import torch
# import torch.nn as nn
# import torch

In [33]:
# Switches read by later cells: `technicals` gates the RSI/MACD columns and
# `normalize_by_coin` selects per-coin vs. global normalisation of 'p'.
technicals = False
normalize_by_coin = True

# Replace with your own path
df = pd.read_csv('/Users/timwu0/Documents/CS329P/afterhours_crypto/preprocessing/crypto_data.csv')
coins = df['coin'].unique()

# One-hot encode the coin label: one boolean column per distinct coin.
for coin in coins:
    df[coin] = df['coin'].eq(coin)



In [20]:
def relative_strength_idx(df, n=14):
    """Return the relative strength index of ``df['close']``.

    Gains and losses are averaged with a plain ``n``-row rolling mean.

    Args:
        df: frame with a 'close' column.
        n: rolling window length in rows.

    Returns:
        Series aligned with ``df``. Rows where the gain/loss ratio is
        undefined (the warm-up rows, or windows with no movement at all)
        come out as 50 because the ratio is filled with 1.0.
    """
    change = df['close'].diff()

    # Split the row-to-row change into its positive and negative parts.
    gains = change.mask(change < 0, 0)
    losses = change.mask(change > 0, 0).abs()

    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()

    # An undefined ratio (NaN) is treated as gains == losses.
    strength = (avg_gain / avg_loss).fillna(1.0)
    return 100.0 - (100.0 / (1.0 + strength))

def MACD(df, short=12, long=26):
    """Return the MACD line: short-span EMA minus long-span EMA of 'close'.

    Args:
        df: frame with a 'close' column.
        short: span of the fast exponential moving average.
        long: span of the slow exponential moving average.

    Returns:
        Series aligned with ``df``; NaN until ``long`` observations exist,
        because each EMA uses ``min_periods`` equal to its span.
    """
    close = df['close']

    def _ema(span):
        return close.ewm(span=span, min_periods=span).mean()

    return _ema(short) - _ema(long)
# RSI and MACD are rolling/exponential statistics over consecutive rows.
# `df` stacks every coin's history in one frame, so the indicators have to be
# computed one coin at a time; running them over the whole frame would let the
# diff/rolling/EWM windows straddle the boundary between two different coins.
if technicals:
    for _, coin_rows in df.groupby('coin', sort=False):
        coin_macd = MACD(coin_rows).fillna(0)
        df.loc[coin_rows.index, 'RSI'] = relative_strength_idx(coin_rows)
        df.loc[coin_rows.index, 'MACD'] = coin_macd
        # The signal line is a 9-span EMA of the (zero-filled) MACD line.
        df.loc[coin_rows.index, 'MACD_signal'] = (
            coin_macd.ewm(span=9, min_periods=9).mean().fillna(0)
        )
df

Unnamed: 0,date,high,low,open,close,volume,adjclose,coin,p,BTC-USD,...,AVAX-USD,LTC-USD,XMR-USD,ETC-USD,REP-USD,MAID-USD,STEEM-USD,RSI,MACD,MACD_signal
0,1483228800,1003.080017,958.698975,963.658020,998.325012,147775008.0,998.325012,BTC-USD,1.035974,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
1,1483315200,1031.390015,996.702026,998.617004,1021.750000,222184992.0,1021.750000,BTC-USD,1.023165,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
2,1483401600,1044.079956,1021.599976,1021.599976,1043.839966,185168000.0,1043.839966,BTC-USD,1.021770,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
3,1483488000,1159.420044,1044.400024,1044.400024,1154.729980,344945984.0,1154.729980,BTC-USD,1.105640,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
4,1483574400,1191.099976,910.416992,1156.729980,1013.380005,510199008.0,1013.380005,BTC-USD,0.876073,True,...,False,False,False,False,False,False,False,50.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22620,1637712000,0.673348,0.607530,0.621976,0.612812,27702150.0,0.612812,STEEM-USD,0.985266,False,...,False,False,False,False,False,False,True,50.727463,0.002363,0.002923
22621,1637798400,0.695432,0.623388,0.643072,0.631901,45568311.0,0.631901,STEEM-USD,0.982629,False,...,False,False,False,False,False,False,True,54.625830,0.003910,0.003120
22622,1637884800,0.819299,0.673438,0.683161,0.702681,204342166.0,0.702681,STEEM-USD,1.028573,False,...,False,False,False,False,False,False,True,66.873809,0.010723,0.004641
22623,1637971200,0.734130,0.674832,0.703990,0.706345,38813453.0,0.706345,STEEM-USD,1.003345,False,...,False,False,False,False,False,False,True,66.795869,0.016231,0.006959


In [30]:
def train_test_split(df, test_size, train_size, normalize_by_coin=None):
    """Split each coin's rows positionally into train/test and z-score 'p'.

    For every distinct value of ``df['coin']`` the first ``train_size``
    fraction of that coin's rows goes to the training set and the following
    ``test_size`` fraction goes to the test set. Rows are taken in the order
    they appear in ``df``.

    Args:
        df: frame with at least the columns 'coin' and 'p'.
        test_size: fraction of each coin's rows used for testing.
        train_size: fraction of each coin's rows used for training.
        normalize_by_coin: if true, 'p' is standardised per coin with that
            coin's training mean/std; if false, one mean/std over the whole
            training set is used. ``None`` (default) falls back to the
            notebook-level ``normalize_by_coin`` flag, or True if unset.

    Returns:
        ``(train_data, test_data)`` with the 'coin' column dropped. The test
        set is always scaled with statistics taken from training rows only.
    """
    if normalize_by_coin is None:
        # Preserve the original behaviour of following the notebook flag.
        normalize_by_coin = globals().get('normalize_by_coin', True)

    train_parts = []
    test_parts = []
    # Group on the frame that was passed in rather than a global coin list,
    # and keep first-appearance order (sort=False) like Series.unique().
    for _, df_coin in df.groupby('coin', sort=False):
        n_rows = len(df_coin)
        split = int(train_size * n_rows)
        test_end = min(int((train_size + test_size) * n_rows), n_rows)

        coin_train = df_coin.iloc[:split].copy()
        coin_test = df_coin.iloc[split:test_end].copy()
        if normalize_by_coin:
            mean_p = coin_train['p'].mean()
            std_p = coin_train['p'].std()
            coin_train['p'] = (coin_train['p'] - mean_p) / std_p
            coin_test['p'] = (coin_test['p'] - mean_p) / std_p

        train_parts.append(coin_train)
        test_parts.append(coin_test)

    # Concatenate once: avoids quadratic copying and keeps column dtypes
    # (seeding the concat with an empty object-dtype frame does not).
    train_data = pd.concat(train_parts) if train_parts else df.iloc[0:0].copy()
    test_data = pd.concat(test_parts) if test_parts else df.iloc[0:0].copy()

    if not normalize_by_coin:
        mean_p = train_data['p'].mean()
        std_p = train_data['p'].std()
        train_data['p'] = (train_data['p'] - mean_p) / std_p
        test_data['p'] = (test_data['p'] - mean_p) / std_p

    return train_data.drop(['coin'], axis=1), test_data.drop(['coin'], axis=1)
# Smoke-check the split: the first 50% of each coin's rows for training and
# the following 15% for testing; show the resulting shapes.
train, test = train_test_split(df, test_size=0.15, train_size=0.5)
print(train.shape, test.shape)

(11307, 27) (3388, 27)


(             date         high          low         open        close  \
 0      1483228800  1003.080017   958.698975   963.658020   998.325012   
 1      1483315200  1031.390015   996.702026   998.617004  1021.750000   
 2      1483401600  1044.079956  1021.599976  1021.599976  1043.839966   
 3      1483488000  1159.420044  1044.400024  1044.400024  1154.729980   
 4      1483574400  1191.099976   910.416992  1156.729980  1013.380005   
 ...           ...          ...          ...          ...          ...   
 21725  1560038400     0.438784     0.381092     0.401229     0.387386   
 21726  1560124800     0.417566     0.383700     0.387073     0.416082   
 21727  1560211200     0.445997     0.403569     0.416230     0.435659   
 21728  1560297600     0.436418     0.421335     0.435659     0.427926   
 21729  1560384000     0.429957     0.411852     0.427878     0.414671   
 
             volume     adjclose         p BTC-USD ETH-USD  ... AVAX-USD  \
 0      147775008.0   998.325012  

In [23]:
# Number of consecutive rows packed into one model input window
# (consumed by extract_window_data / prepare_data).
window_len = 22
# Fraction of each coin's rows held out for testing (passed to prepare_data).
test_size = 0.15

def extract_window_data(df, window_len):
    """Stack every run of ``window_len`` consecutive rows of ``df``.

    Window ``i`` holds rows ``i .. i + window_len - 1``. The final
    ``window_len`` rows never start a window, so ``len(df) - window_len``
    windows are produced.

    Returns:
        Array of shape ``(len(df) - window_len, window_len, n_columns)``;
        an empty 1-D array when ``df`` has no more than ``window_len`` rows.
    """
    n_windows = len(df) - window_len
    return np.array([
        df[start: start + window_len].values
        for start in range(n_windows)
    ])

def prepare_data(df, target_col, window_len, test_size, train_size):
    """Split ``df`` and turn both parts into windowed inputs and targets.

    Each input window covers ``window_len`` consecutive rows; its target is
    the ``target_col`` value of the row immediately after the window.

    NOTE(review): train_test_split concatenates all coins into one frame, so
    windows near a coin boundary contain rows from two coins - confirm this
    is intended.

    Returns:
        ``(train_data, test_data, x_train, x_test, y_train, y_test)``.
    """
    train_data, test_data = train_test_split(df, test_size=test_size, train_size=train_size)

    def _targets(frame):
        # Skip the first window_len rows: they only ever appear as inputs.
        return frame[target_col][window_len:].values

    x_train, x_test = (
        extract_window_data(part, window_len) for part in (train_data, test_data)
    )
    y_train, y_test = _targets(train_data), _targets(test_data)

    return train_data, test_data, x_train, x_test, y_train, y_test


In [27]:
# Training hyperparameters for the cross-validation cell below.
# Number of passes over the training data (passed to model.fit).
epochs=2
# NOTE(review): `loss` and `optimizer` are not referenced by any visible cell;
# presumably the model builders choose their own - confirm.
loss='mse'
optimizer = 'adam'
# Passed to both model builders.
dropout=0.5
# Passed only to rnn().
num_layers=4
cell_size=8
# Passed to both model builders.
dense_units=100
# Forwarded to the model builders; re-declares the flag from the data-load cell.
technicals=False
# Which builder to use: 'rnn' or 'lstm'.
model_type='rnn'
# Passed only to lstm().
neurons=100

In [28]:
# Expanding-window cross-validation schedule: `folds` training fractions
# spaced evenly from `min_train_size` up to `1 - test_size`, so that each
# fold still leaves `test_size` of the rows after the training block.
folds=5
min_train_size=0.4
test_size=0.15

train_sizes = [
    min_train_size + k / (folds - 1) * (1 - (test_size + min_train_size))
    for k in range(folds)
]

In [34]:
from rnn_model import rnn
from lstm_model import lstm

# Expanding-window cross-validation: for each training fraction, rebuild the
# windows, train a fresh model, and score it on the rows that follow the
# training block. The mean of the per-fold MSEs is reported at the end.
MSE_sum = 0
for i, train_size in enumerate(train_sizes):
    train, test, x_train, x_test, y_train, y_test = prepare_data(
        df, 'p', window_len=window_len, test_size=test_size, train_size=train_size)
    features = tf.convert_to_tensor(x_train, dtype=tf.float32)
    # Labels get a trailing axis so they are a column: (n,) -> (n, 1).
    labels = tf.expand_dims(tf.convert_to_tensor(y_train, dtype=tf.float32), axis=1)
    print('Size of fold train data:', features.shape, labels.shape)
    if model_type == 'rnn':
        model = rnn(features=features,
            labels=labels,
            dropout=dropout,
            num_layers=num_layers,
            cell_size=cell_size,
            dense_units=dense_units,
            technicals=technicals)
    elif model_type == 'lstm':
        model = lstm(features=features,
            labels=labels,
            dropout=dropout,
            neurons=neurons,
            dense_units=dense_units,
            technicals=technicals)
    else:
        # Fail loudly instead of reusing a stale `model` from an earlier run.
        raise ValueError("model_type must be 'rnn' or 'lstm', got %r" % (model_type,))
    model.fit(features, labels, epochs=epochs, shuffle=True)
    preds = model.predict(tf.convert_to_tensor(x_test, dtype=tf.float32))
    # predict() returns a column of shape (n, 1) while y_test is (n,).
    # Subtracting them directly broadcasts to an (n, n) matrix and averages
    # every prediction against every target, so align the shapes first.
    # reshape raises if the element counts disagree.
    MSE = ((preds.reshape(y_test.shape) - y_test) ** 2).mean()
    print('Preds: ', y_test.shape, preds)
    MSE_sum += MSE

    print('Fold number', i + 1, 'MSE: ', MSE)
print('MSE: ', MSE_sum/len(train_sizes))
    

Size of fold train data: (9020, 22, 24) (9020, 1)
Epoch 1/2
Epoch 2/2
Preds:  (3367,) [[0.0141035 ]
 [0.01418018]
 [0.01422995]
 ...
 [0.0123165 ]
 [0.0123165 ]
 [0.0123165 ]]
Fold number 1 MSE:  0.43353397839123087
Size of fold train data: (11562, 22, 24) (11562, 1)
Epoch 1/2
Epoch 2/2
Preds:  (3378,) [[0.00748846]
 [0.00748846]
 [0.00748846]
 ...
 [0.00743952]
 [0.00743952]
 [0.00743952]]
Fold number 2 MSE:  0.6254164769425299
Size of fold train data: (14115, 22, 24) (14115, 1)
Epoch 1/2
Epoch 2/2
Preds:  (3367,) [[0.00924516]
 [0.00924516]
 [0.00924516]
 ...
 [0.01051754]
 [0.01051754]
 [0.0105176 ]]
Fold number 3 MSE:  0.6561418486225419
Size of fold train data: (16658, 22, 24) (16658, 1)
Epoch 1/2
Epoch 2/2
Preds:  (3367,) [[0.00634471]
 [0.00634471]
 [0.00634471]
 ...
 [0.00686172]
 [0.00686172]
 [0.00686172]]
Fold number 4 MSE:  1.1520135851690705
Size of fold train data: (19199, 22, 24) (19199, 1)
Epoch 1/2
Epoch 2/2
Preds:  (3382,) [[0.00304353]
 [0.00304353]
 [0.00304353]
 ..

In [31]:
# Score the most recently trained model on the last fold's test windows.
preds = model.predict(tf.convert_to_tensor(x_test, dtype=tf.float32))
# predict() returns shape (n, 1) but y_test is (n,). Align them before any
# element-wise arithmetic; otherwise NumPy broadcasts to (n, n) and the error
# metrics compare every prediction with every target.
preds_flat = preds.reshape(y_test.shape)

print(preds[:20])
print(y_test[:20])

print(y_test.mean(), preds.mean(), (y_test - preds_flat).mean())

errors = preds_flat - y_test
print('MAE: ', np.absolute(errors).mean())
print('MSE: ', (errors ** 2).mean())


[[0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]
 [0.00505304]]
[ 0.27765248 -0.09005793  0.67977954  0.39688716 -0.08444477 -0.01213591
  0.03402699 -0.79190661  0.39242253  0.03932539 -0.47024566 -0.9461582
  0.85499618 -0.11283876  0.54331057  0.06148748 -0.19056557  1.34289938
 -0.23357147  0.00941888]
-0.015815196334905555 0.0046039256 -0.02041912211226135
MAE:  0.6781027033826226
MSE:  1.0582273779706
