In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
from keras.layers import Activation, Dense, Dropout, LSTM
from keras.models import Sequential
from sklearn.metrics import mean_absolute_error
# import torch
# import torch.nn as nn
# import torch

In [19]:
# Location of the preprocessed price CSV. The default is the original author's
# local path; set the CRYPTO_DATA_CSV environment variable to point at your own
# copy instead of editing this cell.
DATA_PATH = os.environ.get(
    'CRYPTO_DATA_CSV',
    '/Users/timwu0/Documents/CS329P/afterhours_crypto/preprocessing/crypto_data.csv',
)
df = pd.read_csv(DATA_PATH)

# Distinct values of the 'coin' column, in order of first appearance.
coins = df['coin'].unique()

# Convert coin to one-hot vectors: add one boolean column per coin, named after
# the coin, that is True on the rows belonging to that coin.
for coin in coins:
    df[coin] = df['coin'] == coin

def train_val_split(df, test_val_size, train_size):
    """Split every coin's rows into train/val/test pieces and standardise `p`.

    For each distinct value of ``df['coin']`` the rows are taken in their
    existing order (assumed to be chronological -- TODO confirm against the
    preprocessing step):

    * the first ``train_size`` fraction goes to the training split,
    * the following ``test_val_size`` fraction goes to the validation split,
    * the last ``test_val_size`` fraction goes to the test split.

    The per-coin pieces are stacked coin after coin, keeping the original index
    labels. Column ``p`` is then standardised in all three splits using the
    mean and standard deviation of the *training* split only.

    Args:
        df: DataFrame with at least a ``'coin'`` column and a ``'p'`` column.
        test_val_size: Fraction of each coin's rows used for validation and,
            separately, for test.
        train_size: Fraction of each coin's rows used for training.

    Returns:
        ``(train, val, test)`` DataFrames with the ``'coin'`` column dropped.
    """
    train_parts, val_parts, test_parts = [], [], []

    # Derive the coins from the frame that was passed in rather than from a
    # notebook-level global, so the function works on any frame.
    for coin_name in df['coin'].unique():
        coin_rows = df.loc[df['coin'] == coin_name]
        n_rows = len(coin_rows)

        train_end = int(train_size * n_rows)
        n_holdout = int(test_val_size * n_rows)
        val_end = train_end + n_holdout
        # The test split is the tail of the series. Clamp its start so it can
        # never reach back into the validation rows when the fractions sum to
        # more than 1; for valid fractions this is just n_rows - n_holdout.
        test_start = max(n_rows - n_holdout, val_end)

        train_parts.append(coin_rows.iloc[:train_end])
        val_parts.append(coin_rows.iloc[train_end:val_end])
        test_parts.append(coin_rows.iloc[test_start:])

    def _stack(parts):
        # One concat per split keeps column dtypes intact and avoids the
        # quadratic cost of growing a frame inside the loop.
        return pd.concat(parts) if parts else df.iloc[0:0].copy()

    train_data = _stack(train_parts)
    val_data = _stack(val_parts)
    test_data = _stack(test_parts)

    mean_p = train_data['p'].mean()
    std_p = train_data['p'].std()
    # Degenerate training data (empty, single row, or constant `p`) would give
    # a NaN/zero scale and turn every value into NaN/inf; fall back to a
    # neutral shift/scale instead.
    if pd.isna(mean_p):
        mean_p = 0.0
    if pd.isna(std_p) or std_p == 0:
        std_p = 1.0

    for split in (train_data, val_data, test_data):
        split['p'] = (split['p'] - mean_p) / std_p

    return (
        train_data.drop(columns=['coin']),
        val_data.drop(columns=['coin']),
        test_data.drop(columns=['coin']),
    )
# Example split: half of each coin's rows for training and 15% each for
# validation and test, printed as a quick sanity check.
train, val, test = train_val_split(df, test_val_size=0.15, train_size=0.5)
print(train, val, test)

             date         high          low         open        close  \
0      1483228800  1003.080017   958.698975   963.658020   998.325012   
1      1483315200  1031.390015   996.702026   998.617004  1021.750000   
2      1483401600  1044.079956  1021.599976  1021.599976  1043.839966   
3      1483488000  1159.420044  1044.400024  1044.400024  1154.729980   
4      1483574400  1191.099976   910.416992  1156.729980  1013.380005   
...           ...          ...          ...          ...          ...   
21725  1560038400     0.438784     0.381092     0.401229     0.387386   
21726  1560124800     0.417566     0.383700     0.387073     0.416082   
21727  1560211200     0.445997     0.403569     0.416230     0.435659   
21728  1560297600     0.436418     0.421335     0.435659     0.427926   
21729  1560384000     0.429957     0.411852     0.427878     0.414671   

            volume     adjclose         p BTC-USD ETH-USD  ... XRP-USD  \
0      147775008.0   998.325012  0.381478    True

In [20]:
# Number of consecutive rows in every input window built by extract_window_data;
# also the offset of the first target row used by prepare_data.
window_len = 22
# Fraction of each coin's rows held out for validation and, separately, for test
# (passed through prepare_data to train_val_split).
test_val_size = 0.15

def extract_window_data(df, window_len):
    """Stack every run of `window_len` consecutive rows of `df` into one array.

    Window ``i`` holds rows ``i`` to ``i + window_len - 1``. The final possible
    window (the one ending on the last row) is deliberately not produced, so
    ``len(df) - window_len`` windows come back and each one has a following row
    available as its target.

    Returns:
        ``np.ndarray`` of the stacked ``.values`` of each window; an empty
        1-D array when `df` has no more than `window_len` rows.
    """
    n_windows = len(df) - window_len
    return np.array(
        [df[start: start + window_len].values for start in range(n_windows)]
    )

def prepare_data(df, target_col, window_len, test_val_size, train_size):
    """Split `df` and build windowed inputs plus aligned targets per split.

    The frame is split with ``train_val_split``; each split is turned into
    sliding windows with ``extract_window_data``; the target for window ``i``
    is the value of `target_col` on row ``i + window_len`` of the same split,
    i.e. the row immediately after the window.

    NOTE(review): each split is the concatenation of several coins' rows, so
    windows near a coin boundary contain rows of two different coins -- confirm
    whether that is intended.

    Returns:
        A 9-tuple ``(train_data, val_data, test_data, x_train, x_val, x_test,
        y_train, y_val, y_test)``.
    """
    splits = train_val_split(df, test_val_size=test_val_size, train_size=train_size)

    # Inputs first, then targets, each in train / val / test order.
    windows = [extract_window_data(part, window_len) for part in splits]
    targets = [part[target_col][window_len:].values for part in splits]

    return (*splits, *windows, *targets)


In [29]:
# Hyperparameters for the cross-validation / training cell below.
epochs=5  # passed to model.fit as the number of training epochs
loss='mse'  # not referenced by any cell visible in this notebook
optimizer = 'adam'  # not referenced by any cell visible in this notebook
# The remaining values are forwarded verbatim to rnn_model.rnn; their exact
# meaning is defined there, not in this notebook.
dropout=0.9  # NOTE(review): if this is a drop rate (not a keep probability), 0.9 is very aggressive -- confirm
num_layers=4
cell_size=32
dense_units=95
technicals=False

In [30]:
folds = 5
min_train_size = 0.4
test_val_size = 0.15


def expanding_train_sizes(folds, min_train_size, test_val_size):
    """Return `folds` training fractions growing linearly from `min_train_size`.

    The largest fraction is chosen so that training plus one validation and one
    test slice (each ``test_val_size``) still fit in the data, i.e. it equals
    ``1 - 2 * test_val_size`` up to floating-point rounding.

    Args:
        folds: Number of fractions to produce.
        min_train_size: Training fraction of the first (smallest) fold.
        test_val_size: Fraction reserved for validation and again for test.

    Returns:
        List of floats of length ``folds`` (empty when ``folds <= 0``).
    """
    if folds == 1:
        # A single fold has no step to interpolate over; the general formula
        # would divide by zero.
        return [min_train_size]
    span = 1 - (2 * test_val_size + min_train_size)
    return [min_train_size + k / (folds - 1) * span for k in range(folds)]


train_sizes = expanding_train_sizes(folds, min_train_size, test_val_size)
print(train_sizes)

[0.4, 0.47500000000000003, 0.55, 0.625, 0.7000000000000001]


In [33]:
from rnn_model import rnn


def _build_rnn(features, labels):
    """Create a new `rnn` model configured with the notebook hyperparameters."""
    return rnn(features=features,
               labels=labels,
               dropout=dropout,
               num_layers=num_layers,
               cell_size=cell_size,
               dense_units=dense_units,
               technicals=technicals)


def _to_training_tensors(x, y):
    """Convert windowed inputs and 1-D targets to float32 tensors.

    The targets get a trailing axis so they have one column per sample.
    """
    features = tf.convert_to_tensor(x, dtype=tf.float32)
    labels = tf.expand_dims(tf.convert_to_tensor(y, dtype=tf.float32), axis=1)
    return features, labels


# Train one model per training fraction and keep the fraction whose model has
# the lowest validation MSE.
min_MSE = float('inf')
min_MSE_k = 0  # 1-based number of the best fold; 0 means "none selected yet"
for i, train_size in enumerate(train_sizes):
    train, val, test, x_train, x_val, x_test, y_train, y_val, y_test = prepare_data(df, 'p', window_len=window_len, test_val_size=test_val_size, train_size=train_size)

    features, labels = _to_training_tensors(x_train, y_train)
    print('Size of fold train data:', features.shape, labels.shape)
    model = _build_rnn(features, labels)
    model.fit(features, labels, epochs=epochs, shuffle=True)
    preds = model.predict(tf.convert_to_tensor(x_val, dtype=tf.float32))

    # The predictions come back as a column (N, 1) while y_val is flat (N,).
    # Subtracting them directly broadcasts to an (N, N) matrix and averages the
    # error of every prediction against every target, so flatten both first.
    preds_flat = np.asarray(preds, dtype=np.float64).reshape(-1)
    y_val_flat = np.asarray(y_val, dtype=np.float64).reshape(-1)
    MSE = ((preds_flat - y_val_flat) ** 2).mean()
    print('Preds: ', y_val.shape, preds)
    if MSE < min_MSE:
        min_MSE = MSE
        min_MSE_k = i + 1

    print('Fold number', i + 1, 'MSE: ', MSE)

if min_MSE_k == 0:
    # Happens when there are no folds or every fold produced a NaN MSE; without
    # this check train_sizes[-1] would be picked silently below.
    raise RuntimeError('No fold produced a finite validation MSE; cannot select a best fold.')

# Use train data from best fold
print('Best fold is fold number', min_MSE_k, '\n Training on best fold...')
train, val, test, x_train, x_val, x_test, y_train, y_val, y_test = prepare_data(df, 'p', window_len=window_len, test_val_size=test_val_size, train_size=train_sizes[min_MSE_k-1])
features, labels = _to_training_tensors(x_train, y_train)

model = _build_rnn(features, labels)
model.fit(features, labels, epochs=epochs, shuffle=True)
    
    

Size of fold train data: (9020, 22, 24) (9020, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3366,) [[7.150167e-07]
 [7.150167e-07]
 [7.150167e-07]
 ...
 [5.517112e-07]
 [5.517112e-07]
 [5.517112e-07]]
Fold number 1 MSE:  0.3719660012690479
Size of fold train data: (10715, 22, 24) (10715, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3366,) [[2.7805484e-07]
 [2.7805484e-07]
 [2.7805484e-07]
 ...
 [2.0783817e-07]
 [2.0783817e-07]
 [2.0783817e-07]]
Fold number 2 MSE:  0.4252373434548543
Size of fold train data: (12409, 22, 24) (12409, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3366,) [[3.4859630e-07]
 [3.4878252e-07]
 [3.4888564e-07]
 ...
 [2.8986898e-07]
 [2.8989137e-07]
 [2.8992122e-07]]
Fold number 3 MSE:  0.6127803856208579
Size of fold train data: (14115, 22, 24) (14115, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Preds:  (3366,) [[1.3776308e-07]
 [1.4216977e-07]
 [1.4775185e-07]
 ...
 [1.2138109e-07]
 [1.2138109e-07]
 [1.2

<keras.callbacks.History at 0x7fd4a5313e80>

In [34]:
# Evaluate the model trained on the best fold against the held-out test split.
preds = model.predict(tf.convert_to_tensor(x_test, dtype=tf.float32))

print(preds[:20])
print(y_test[:20])

# The predictions are a column (N, 1) and y_test is flat (N,). Subtracting them
# directly broadcasts to (N, N) and compares every prediction with every
# target, so flatten both before computing element-wise errors.
preds_flat = np.asarray(preds, dtype=np.float64).reshape(-1)
y_test_flat = np.asarray(y_test, dtype=np.float64).reshape(-1)
errors = preds_flat - y_test_flat

print(y_test.mean(), preds.mean(), (y_test_flat - preds_flat).mean())

print('MAE: ', np.absolute(errors).mean())
print('MSE: ', (errors ** 2).mean())


[[8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]
 [8.021305e-07]]
[-0.07353904  0.31122752  0.16983725 -0.07073357 -0.03459343 -0.01152111
 -0.42432464  0.16760582 -0.00887296 -0.26355775 -0.50141994  0.3988012
 -0.08492495  0.24302001  0.00220371 -0.12377299  0.64265636 -0.14526744
 -0.0238203  -0.38251732]
-0.02146501941191319 5.637405e-07 -0.021465583152434017
MAE:  0.5648923171824302
MSE:  0.7465080041575733
