In [5]:
import pandas as pd 
import numpy as np
import os
import random
from sklearn import preprocessing
from collections import deque

SEQLEN = 60 # Every 60 sec
FUTUREPREDICT = 3 # Predict 3 periods
RATIO2PREDICT = "BTC-USD" # What to predict

def classify(current, future):
    if float(future) > float(current):
        return 1
    else: return 0

def preprocess_df(df):
    """
    Normalize and prepossesing data to be modeled
    Returns array of sequential data
    """
    df = df.drop('future', 1)

    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)

    df.dropna(inplace=True)

    sequential_data = []
    prev_days = deque(maxlen=SEQLEN)

    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQLEN:
            sequential_data.append([np.array(prev_days), i[-1]])

    random.shuffle(sequential_data)

    return sequential_data

df = pd.read_csv("crypto_data/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])
main_df = pd.DataFrame()

In [6]:
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
for ratio in ratios:
    dataset = f'crypto_data/{ratio}.csv'
    
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)
    df.set_index("time", inplace=True)
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]

    if len(main_df) == 0:
        main_df = df
    else: 
        main_df = main_df.join(df)

main_df['future'] = main_df[f"{RATIO2PREDICT}_close"].shift(-FUTUREPREDICT)
main_df['target'] = list(map(classify, main_df[f"{RATIO2PREDICT}_close"], main_df['future']))

print(main_df[[f"{RATIO2PREDICT}_close", "future", 'target']].head)


<bound method NDFrame.head of             BTC-USD_close       future  target
time                                          
1528968660    6489.549805  6479.410156       0
1528968720    6487.379883  6479.979980       0
1528968780    6479.410156  6480.000000       1
1528968840    6479.410156  6477.220215       0
1528968900    6479.979980  6480.000000       1
...                   ...          ...     ...
1535214960    6713.140137  6715.000000       1
1535215020    6714.520020  6715.000000       1
1535215080    6714.520020          NaN       0
1535215140    6715.000000          NaN       0
1535215200    6715.000000          NaN       0

[97724 rows x 3 columns]>


In [7]:
times = sorted(main_df.index.values)
last_5p = times[-int(0.05*len(times))]

# Split data
validation_df = main_df[(main_df.index >= last_5p)]
training_df = main_df[(main_df.index <= last_5p)]
preprocess_df(main_df)


 ]
[-6.43832690e-04 -8.43140601e-02  3.79159918e-03 -1.03718049e-01
  7.93617870e-01 -5.17583844e-02  3.26788498e-03 -6.17153994e-03
  0.00000000e+00]
[-0.00227897 -0.08163787  0.0037916  -0.06064285  0.0045756  -0.0496007
  0.00326788 -0.00518584  0.        ]
[-1.10901059 -0.03771902  0.0037916  -0.07783445  0.0045756  -0.02177141
 -0.00991939 -0.00602808  0.        ]
[-0.421368   -0.06698762  0.0037916  -0.07023731 -0.02464396 -0.02975798
  0.01645541 -0.00598454  1.        ]
[-0.15488258 -0.01757678 -0.78257033 -0.04564831 -1.19240413  0.0481873
 -0.00991939 -0.00610305  1.        ]
[-6.43832690e-04 -8.75395711e-02 -1.30814044e+00 -5.74265499e-02
  4.57559892e-03 -5.19538032e-02  1.64554071e-02 -5.94008322e-03
  1.00000000e+00]
[ 0.69696564  0.0473937   1.1866067  -0.10718687  0.70624768 -0.03766472
  0.00326788 -0.0061643   1.        ]
[ 0.56069679 -0.07518252  0.0037916  -0.07285553  0.03381706 -0.04929761
  0.00326788 -0.0061531   1.        ]
[-6.43832690e-04 -8.39391987e-02 -1.2