<a href="https://colab.research.google.com/github/davidjeans1/Davids_Portfolio/blob/main/trainDavid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading and Preprocessing

## Load Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from datetime import date, timedelta

def requestData(coin, isTrainingSet=True, delta=150):
    """Download daily candles for one coin from the Nomics candles endpoint.

    Args:
        coin: Currency symbol appended to the request URL.
        isTrainingSet: When True, request everything up to ``delta`` days ago.
            When False, request the window from ``delta + 60`` days ago until
            yesterday (so it overlaps the training range by 60 days) and
            print the request URL.
        delta: Number of days before today at which the training range ends.

    Returns:
        The DataFrame parsed by ``pd.read_json`` from the response.

    Raises:
        Exception: If the parsed frame is empty.
    """
    # NOTE(review): the API key is hard-coded here and is echoed by print(url)
    # below; consider moving it out of the notebook.
    key = "6d0dc559240e6cb9142b14d63b0591cb"
    midnight = "T00%3A00%3A00Z"  # URL-encoded "T00:00:00Z"
    base = "https://api.nomics.com/v1/candles?key=" + key + "&interval=1d&currency=" + coin

    if isTrainingSet:
        endDate = (date.today() - timedelta(days=delta)).isoformat()
        url = f"{base}&end={endDate}{midnight}"
    else:
        # 60 day overlap between train and test set
        startDate = (date.today() - timedelta(days=delta + 60)).isoformat()
        endDate = (date.today() - timedelta(days=1)).isoformat()
        url = f"{base}&start={startDate}{midnight}&end={endDate}{midnight}"
        print(url)

    rawData = pd.read_json(url)
    if rawData.empty:
        raise Exception(coin + " does not exist")
    return rawData

In [None]:
import requests

# NOTE(review): API key is committed in the notebook; consider loading it from
# the environment or a secrets store instead.
ALPHA_API = "A92RWPBGHVN1IM2M"


def request_stocks(ticker):
    """Fetch the full daily adjusted time series for ``ticker`` from Alpha Vantage.

    Args:
        ticker: Symbol passed as the ``symbol`` query parameter.

    Returns:
        DataFrame indexed by date (DatetimeIndex) with the eight columns
        assigned below. Values are left as returned by the API; callers in
        this notebook cast the column they need with ``astype(float)``.

    Raises:
        requests.HTTPError: If the HTTP response status is an error.
        KeyError: If the response has no 'Time Series (Daily)' entry (the
            message includes the payload so the API's explanation is visible).
    """
    response = requests.get(
        'https://www.alphavantage.co/query',
        # Passing params lets requests URL-encode the ticker.
        params={
            'function': 'TIME_SERIES_DAILY_ADJUSTED',
            'symbol': ticker,
            'outputsize': 'full',
            'apikey': ALPHA_API,
        },
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    response.raise_for_status()
    payload = response.json()

    series = payload.get('Time Series (Daily)')
    if not series:
        raise KeyError(
            f"Alpha Vantage returned no daily series for {ticker!r}: {payload}")

    # The payload maps date -> fields, so transpose to get one row per date.
    data = pd.DataFrame.from_dict(series).transpose()
    # NOTE(review): 'divided amount' is presumably meant to be 'dividend amount';
    # left unchanged because it is a runtime column label.
    data.columns = ['open', 'high', 'low', 'close', 'adjusted close', 'volume', 'divided amount', 'split coefficient']
    data.index = pd.to_datetime(data.index)
    return data

# Pull the 'adjusted close' column for the UUP and SPY tickers and turn each
# into a float Series named after its ticker, so it can be joined as a column.
UUP = request_stocks('UUP')['adjusted close']
UUP = UUP.rename('UUP').astype(float)
SPY = request_stocks("SPY")['adjusted close']
SPY = SPY.rename('SPY').astype(float)


In [None]:
# NOTE(review): API key is committed in the notebook; consider loading it from
# the environment or a secrets store instead.
GLASSNODE_API = "1yGjKl8gmits6q0X3gTK7pu7SKM"


def glassnode_data(metric):
    """Download one Glassnode metric for BTC at 24h resolution.

    Args:
        metric: Path below ``/v1/metrics/``, e.g. ``"supply/current"``. It is
            also used as the name of the value column.

    Returns:
        DataFrame indexed by the response's ``t`` timestamps (converted to
        timezone-naive), with the ``v`` column renamed to ``metric``. Any
        other response columns are kept as-is.

    Raises:
        requests.HTTPError: If the HTTP response status is an error.
    """
    # Local import keeps this cell self-contained.
    from io import StringIO

    res = requests.get(
        f'https://api.glassnode.com/v1/metrics/{metric}',
        params={"a": "BTC", 'api_key': GLASSNODE_API, "s": 1231027200, "i": "24h", "f": "JSON", "timestamp_format": "humanized"},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail with the HTTP error rather than a confusing JSON parse error.
    res.raise_for_status()

    # Wrap the text in StringIO: newer pandas deprecates passing literal JSON
    # strings to read_json.
    data = pd.read_json(StringIO(res.text), convert_dates=['t'])
    # Use the parsed timestamps as index labels, then drop the now-redundant column.
    data = data.rename(index=data["t"], columns={"t": "timestamp", "v": metric})
    data = data.drop(columns="timestamp")
    # Strip the timezone from the index (requires the parsed timestamps to be tz-aware).
    data.index = data.index.tz_convert(None)
    return data

# Download each Glassnode metric as its own timestamp-indexed frame.
supply = glassnode_data("supply/current")
s2f = glassnode_data('indicators/stock_to_flow_ratio')
mvrv = glassnode_data('indicators/mvrv_account_based')
sopr = glassnode_data('indicators/sopr_account_based')
hodl = glassnode_data('indicators/rhodl_ratio')

# The stock-to-flow frame is rebuilt from its "o" column: json_normalize expands
# that column into a new frame (presumably one column per nested field — confirm
# against the API response). The result does not carry the timestamp index, so
# the index is saved first and re-attached afterwards.
index = s2f.index
s2f = pd.json_normalize(s2f["o"])
s2f.index = index
# Join every metric and both ticker series onto the supply frame's index to
# form the shared feature table used by formatData.
fund_df = supply.join(s2f).join(mvrv).join(sopr).join(hodl).join(UUP).join(SPY)

## Format Data

In [None]:
# format input data (original variant: only drops unused columns)
def formatDataOg(rawData):
    """Return a copy of ``rawData`` without the timestamp and transparency columns.

    The input frame is left untouched; a KeyError is raised by pandas if any of
    the listed columns is missing.
    """
    unwanted = [
        'timestamp',
        'transparent_open',
        'transparent_high',
        'transparent_low',
        'transparent_close',
        'transparent_volume',
        'volume_transparency',
    ]
    return rawData.drop(columns=unwanted)

# format input data: attach the fundamentals table by date, drop unused columns
def formatData(rawData):
    """Join the module-level ``fund_df`` features onto raw candle data.

    Args:
        rawData: DataFrame with a datetime-like ``timestamp`` column plus the
            transparency columns dropped below.

    Returns:
        A new DataFrame with a fresh integer index, the ``fund_df`` columns
        joined on the calendar date of ``timestamp``, and the timestamp and
        transparency columns removed.

    The caller's frame is not modified (the original version overwrote its
    ``timestamp`` column and re-indexed it in place).
    """
    data = rawData.copy()
    # Reduce timestamps to calendar dates so they can line up with fund_df's index.
    # NOTE(review): this yields Python date objects while fund_df has a
    # DatetimeIndex — confirm the join still matches on your pandas version.
    data['timestamp'] = data['timestamp'].dt.date
    data = data.set_index('timestamp')
    data_new = data.join(fund_df).reset_index()
    return data_new.drop(
        columns=['timestamp', 'transparent_open', 'transparent_high', 'transparent_low',
                 'transparent_close', 'transparent_volume', 'volume_transparency'])

## Fill Missing Values

In [None]:
# fills in blank cells using mean of k=2 nearest neighbors
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

# takes a DataFrame and returns a DataFrame
def fillMissingValues(data):
    """Impute missing cells in every column with a 2-nearest-neighbour imputer.

    Args:
        data: DataFrame whose columns are all passed to the imputer.

    Returns:
        DataFrame of the imputed values carrying the original column labels
        and index. The result is also printed.
    """
    # A single "num" transformer is applied to all columns of the frame.
    imputer = ColumnTransformer([
        ("num", KNNImputer(n_neighbors=2), data.columns),
    ])
    filledValues = imputer.fit_transform(data)

    # Re-wrap the returned array with the original headers and index.
    dataComplete = pd.DataFrame(filledValues, columns=data.columns, index=data.index)
    print(dataComplete)
    return dataComplete

## MinMax Scaling

In [None]:
# Normalize data with a freshly fitted MinMaxScaler
def minMax(dataArray):
    """Fit a new ``MinMaxScaler`` on ``dataArray`` and scale it.

    Returns:
        Tuple ``(scaled array, fitted scaler)``; keep the scaler to transform
        or un-scale other arrays consistently.
    """
    fitted = MinMaxScaler()
    return fitted.fit_transform(dataArray), fitted

# Scale new data with an already-fitted scaler (same scale as the training data)
def minMaxMatch(dataArray, scaler):
    """Return ``scaler.transform(dataArray)`` without re-fitting the scaler."""
    return scaler.transform(dataArray)

# Map scaled values back to the original scale with an already-fitted scaler.
# The whole array is inverse-transformed; callers select the column they need.
def minMaxUnscale(dataArray, scaler):
    """Return ``scaler.inverse_transform(dataArray)``."""
    return scaler.inverse_transform(dataArray)

## Build Input Data

In [None]:
def buildInputData(trainingData, days=1, version=1):
    '''Slice a 2-D array into model inputs and targets.

    The target is always column 0 of the row being predicted.

    version 1 -- each input is 60 consecutive rows:
        days == 1:  rows [i-60, i)    predict row i
        days == 7:  rows [i-67, i-7)  predict row i
        days == 30: rows [i-90, i-30) predict row i
    version 2 -- days must be 7: each input is 8 rows taken 7 rows apart
        (one per week), predicting the row one week after the last of them.
        The first ``len(trainingData) % 7`` rows are skipped so the weekly
        grid ends on the final full week.

    Returns:
        (xTrain, yTrain) as numpy arrays.

    Raises:
        Exception: for an unsupported ``days`` / ``version`` combination.
    '''
    window = 60

    if version == 1:
        # Number of rows between the end of the input window and the target row.
        gapByDays = {1: 0, 7: 7, 30: 30}
        if days not in gapByDays:
            raise Exception("only 1, 7, or 30 days allowed")
        gap = gapByDays[days]

        targets = range(window + gap, trainingData.shape[0])
        xTrain = [trainingData[t - gap - window:t - gap] for t in targets]
        yTrain = [trainingData[t, 0] for t in targets]

    elif version == 2:
        if days != 7:
            raise Exception("not a valid day")

        weeks, skip = divmod(len(trainingData), 7)
        lookback = 8  # weekly samples per input

        xTrain, yTrain = [], []
        for week in range(lookback, weeks):
            xTrain.append([trainingData[(week - back) * 7 + skip]
                           for back in range(lookback, 0, -1)])
            yTrain.append(trainingData[week * 7 + skip, 0])

    else:
        raise Exception("not a valid version")

    return np.array(xTrain), np.array(yTrain)

# Build and Train LSTM

## Build Model

In [None]:
# Stacked LSTM network (four LSTM layers with dropout, then a dense output)
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

def lstm(shape1, shape2):
    """Build (but do not compile) the four-layer LSTM regression model.

    Args:
        shape1: First dimension of one input sample (passed as input_shape[0]).
        shape2: Second dimension of one input sample (passed as input_shape[1]).

    Returns:
        The Sequential model; its summary is printed as a side effect.
    """
    layerStack = [
        # LSTM layer 1 declares the input shape; dropout follows every LSTM layer
        # to reduce overfitting, with the rate rising from 0.2 to 0.5.
        LSTM(units=50, activation='relu', return_sequences=True, input_shape=(shape1, shape2)),
        Dropout(0.2),
        # Hidden LSTM layers 2 and 3 keep returning full sequences for the next LSTM.
        LSTM(units=60, activation='relu', return_sequences=True),
        Dropout(0.3),
        LSTM(units=80, activation='relu', return_sequences=True),
        Dropout(0.4),
        # LSTM layer 4 returns only its final output.
        LSTM(units=120, activation='relu'),
        Dropout(0.5),
        # Fully connected output layer producing a single value.
        Dense(units=1),
    ]

    model = Sequential()
    for layer in layerStack:
        model.add(layer)

    model.summary()
    return model

## Compile Model

In [None]:
def compileModel(model, xTrain, yTrain, epochs=1, batch_size=50, validation_split=0.1):
    """Compile ``model`` with Adam / mean squared error and fit it.

    Args:
        model: Model exposing ``compile`` and ``fit``.
        xTrain: Training inputs.
        yTrain: Training targets.
        epochs: Number of passes over the training data (default 1, the
            previously hard-coded value).
        batch_size: Number of samples per gradient update (default 50).
        validation_split: Fraction of the data held out for validation
            (default 0.1).

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    # NOTE(review): the model file names used elsewhere in this notebook carry a
    # "_100" suffix, which may refer to 100 epochs; the default stays at 1 so
    # existing calls behave exactly as before.
    model.compile(optimizer='adam', loss='mean_squared_error')
    history = model.fit(
        xTrain,
        yTrain,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )
    return history

## Training Evaluation

In [None]:
def trainingEval(history):
    """Plot every series recorded in ``history.history`` on one 8x5 figure with a grid."""
    curves = pd.DataFrame(history.history)
    curves.plot(figsize=(8, 5))
    plt.grid(True)
    # Optionally clamp the y-axis: plt.gca().set_ylim(0, 0.02)
    plt.show()

## Save Model

In [None]:
import pickle
import os
from datetime import datetime

def save(history, scaler, model, modelName, days, version):
    '''Archive one training run under ``model_archive/<days>day_v<version>_<timestamp>/``.

    Args:
        history (History): its ``.history`` attribute is pickled to ``history``
        scaler (MinMaxScaler): pickled to ``scaler``
        model (Sequential): written with ``model.save`` to ``modelName``
        modelName (str): file name for the model inside the run folder
        days (num), version (num): used in the folder name
    '''
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    folder = os.path.join("model_archive", f"{days}day_v{version}_{stamp}")
    os.makedirs(folder, exist_ok=True)

    def _inFolder(name):
        return os.path.join(folder, name)

    with open(_inFolder("history"), 'wb') as historyFile:
        pickle.dump(history.history, historyFile)

    with open(_inFolder("scaler"), 'wb') as scalerFile:
        pickle.dump(scaler, scalerFile)

    model.save(_inFolder(modelName))

## Train model - **run this function**

In [None]:
def train(coinList, days, version=1, scale=True):
    """Train an LSTM on the pooled samples of several coins and save the results.

    Args:
        coinList (str list): Coin symbols to download and train on.
        days (num): How far ahead to predict. Version 1 accepts 1, 7, or 30;
            version 2 accepts 7.
        version (num): Input layout passed to buildInputData (1 or 2).
        scale (bool): Min-max scale each coin's data before windowing
            (default is True).
    Returns:
        model (Sequential): Trained LSTM model
        history (History): Training history
        scaler (MinMaxScaler): Scaler fitted on the LAST coin in coinList, or
            None when scale is False
    """
    # NOTE(review): a new scaler is fitted per coin and only the last one is
    # returned/saved — confirm that is intended.
    xTrain, yTrain = [], []

    for coin in coinList:
        rawData = requestData(coin)  # training set
        dataComplete = fillMissingValues(formatData(rawData))

        dataArray = np.array(dataComplete)
        if scale:
            dataScaled, scaler = minMax(dataArray)
        else:
            dataScaled, scaler = dataArray, None

        # Pool this coin's samples with those of the coins before it.
        xTrainSingle, yTrainSingle = buildInputData(dataScaled, days, version)
        xTrain.extend(xTrainSingle)
        yTrain.extend(yTrainSingle)

    xTrain = np.array(xTrain)
    yTrain = np.array(yTrain)

    model = lstm(xTrain.shape[1], xTrain.shape[2])
    history = compileModel(model, xTrain, yTrain)

    # File name of the saved model, keyed by version and then by days.
    modelFile = {
        1: {
            1: "fund_model1day_100.h5",
            7: "fund_model7day_100.h5",
            30: "fund_model30day_100.h5",
        },
        2: {
            7: "fund_model7day_v2_100.h5",
        },
    }
    fileName = modelFile.get(version, None).get(days, None)

    # The model is written twice: once to the working directory, and once
    # (with history and scaler) into the timestamped archive folder.
    model.save(fileName)
    save(history, scaler, model, fileName, days, version)
    trainingEval(history)

    return model, history, scaler

In [None]:
# Coin symbols to train/test on: taken from coinmarketcap, excluding stable coins, MIOTA, and VGX
coinList = ["BTC","ETH", "ADA", "BNB", "DOGE", "XRP", "DOT", "SOL", "UNI", "LUNA", "BCH", "LTC", "LINK", "ICP", "WBTC", "MATIC", "ETC", "XLM", "AVAX", "VET", "FIL", "THETA", "TRON", "CAKE", "XMR", "AAVE", "EOS", "FTT", "AXS", "ATOM", "XTZ", "GRT", "KLAY", "CRO", "NEO", "BTCB", "MKR", "ALGO", "BSV", "SHIB", "LEO", "BTT", "EGLD", "WAVES", "KSM", "AMP", "DASH", "HBAR", "HT", "QNT", "DCR", "COMP", "RUNE", "NEAR", "HNT", "CHZ", "ZEC", "XDC", "HOT", "XEM", "TFUEL", "STX", "MANA", "ENJ", "SUSHI", "CEL", "SNX", "CELO", "YFI", "TEL", "ZIL", "RVN", "FLOW", "QTUM", "BTG", "REV", "AR", "OKB", "BAT", "FTM", "NEXO", "KCS", "SC", "PERP", "AUDIO", "BNT", "ZEN", "DGB", "MDX", "PAX", "ONT", "ICX", "ZRX", "CRV", "OMG", "NANO", "ANKR", "UMA", "IOTX"]
# Uncomment to run a quick single-coin experiment instead:
#coinList = ["BTC"]
# Display how many coins are in the list (cell output).
len(coinList)

99

In [None]:
# Train the version-1, 1-day-ahead model on every coin in coinList; this
# downloads data for each coin and writes the model files to disk.
model, history, scaler = train(coinList, days=1, version=1)

# Test Model

## Test Function

In [None]:
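# Inputs: list of coin symbols, prediction horizon in days, model version, scaling flag, and which date range to load
# Returns two np.ndarrays: the unscaled predictions and the unscaled actual values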
from tensorflow.keras.models import load_model

def _unscaleFirstColumn(values, scaler, nFeatures):
    """Undo min-max scaling for values that belong to column 0 of the scaled data."""
    values = np.asarray(values)
    if scaler is None:  # scaling was disabled, nothing to undo
        return values
    # The scaler expects all feature columns, so pad with zeros and keep column 0.
    padded = np.zeros((values.shape[0], nFeatures))
    padded[:, 0] = values
    return minMaxUnscale(padded, scaler)[:, 0]


def test(coinList, days, version=1, scale=True, isTrainingSet=False):
    """Run the saved model on freshly downloaded data and return unscaled results.

    Args:
        coinList (str list): Coin symbols to evaluate.
        days (num): Prediction horizon; selects the windowing and model file.
        version (num): Input layout / model family (1 or 2).
        scale (bool): Min-max scale each coin's data before windowing.
        isTrainingSet (bool): Passed to requestData to choose the date range.

    Returns:
        (predUnscale, yTestUnscale): 1-D arrays of predicted and actual values
        for column 0, concatenated over the coins in order.

    Fixes relative to the previous version:
      * ``scale=False`` no longer fails with a NameError on ``scaler``.
      * The padding width follows the actual number of feature columns
        instead of a hard-coded 12.
      * Each coin's values are un-scaled with that coin's own scaler rather
        than with the scaler of the last coin in the list.
    """
    # NOTE(review): the scaler is re-fitted on the test data here instead of
    # reusing the scaler saved at training time — confirm this is intended.
    xTest = []
    yTest = []
    segments = []  # per coin: (scaler or None, sample count, feature count)

    for coin in coinList:
        rawData = requestData(coin, isTrainingSet)
        data = formatData(rawData)
        dataComplete = fillMissingValues(data)

        dataArray = np.array(dataComplete)
        if scale:
            dataScaled, scaler = minMax(dataArray)
        else:
            dataScaled, scaler = dataArray, None

        xTestSingle, yTestSingle = buildInputData(dataScaled, days, version)
        xTest.extend(xTestSingle)
        yTest.extend(yTestSingle)
        segments.append((scaler, len(xTestSingle), dataScaled.shape[1]))

    xTest = np.array(xTest)
    yTest = np.array(yTest)

    print(coinList[0], xTest.shape)

    # load relevant model
    # NOTE(review): these names ("..._100_4L.h5") differ from the ones written
    # by train() ("..._100.h5"); make sure the expected files exist.
    modelFile = {
        1: {1: "fund_model1day_100_4L.h5",
            7: "fund_model7day_100_4L.h5",
            30: "fund_model30day_100_4L.h5",},
        2: {7: "fund_model7day_v2_100_4L.h5",}}

    fileName = modelFile.get(version, None).get(days, None)
    model = load_model(fileName)

    prediction = model.predict(xTest)  # expected shape (samples, 1)

    # Un-scale predictions and labels one coin at a time.
    predParts = []
    labelParts = []
    start = 0
    for coinScaler, count, nFeatures in segments:
        stop = start + count
        predParts.append(_unscaleFirstColumn(prediction[start:stop, 0], coinScaler, nFeatures))
        labelParts.append(_unscaleFirstColumn(yTest[start:stop], coinScaler, nFeatures))
        start = stop

    predUnscale = np.concatenate(predParts)
    yTestUnscale = np.concatenate(labelParts)

    return predUnscale, yTestUnscale

In [None]:
# Evaluate the version-1, 1-day model on the non-training date range for every
# coin in coinList; results for all coins are returned as two flat arrays.
prediction, yTest = test(coinList, 1, version=1, isTrainingSet=False)

In [None]:
# Overlay the actual and predicted series returned by test() on one wide figure.
plt.figure(figsize=(14,5))
# plt.plot(data, color = 'blue')
plt.plot(yTest, color = 'red', label = 'Actual Price')
plt.plot(prediction, color = 'green', label = 'Predicted Price')
plt.title('Price Prediction using RNN-LSTM')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

NameError: ignored

<Figure size 1008x360 with 0 Axes>

## Profit Prediction for coins

In [None]:
# Profit/loss if you bought or shorted one coin each day according to the
# predicted direction of the next price move.
# Inputs are array-likes of actual and predicted prices.
def profitPrediction(label, prediction, pr=True):
    """Score how well the predicted day-to-day direction matches the actual one.

    For every consecutive pair of samples, a "bet" is won when the predicted
    change and the actual change are both positive, or both non-positive. The
    size of the win or loss is the absolute actual change.

    The first sample has no previous value, so it produces no bet. (The
    previous implementation compared it against 0, which counted the first
    price itself as a winning move and inflated both rates.)

    Args:
        label: Actual values, 1-D or shape (n, 1).
        prediction: Predicted values, 1-D or shape (n, 1), same length.
        pr: When True, print the tallies.

    Returns:
        (winRate, profitRate): fraction of bets won, and
        (wins - losses) / (wins + losses) by amount. Either is NaN when it is
        undefined (no bets, or no price movement at all).

    Raises:
        ValueError: If label and prediction have different lengths.
    """
    labelTrend = np.diff(np.asarray(label, dtype=float).ravel())
    predictionTrend = np.diff(np.asarray(prediction, dtype=float).ravel())
    if labelTrend.shape != predictionTrend.shape:
        raise ValueError("label and prediction must have the same length")

    # A bet is correct when both series agree on "up" versus "not up".
    correct = (labelTrend > 0) == (predictionTrend > 0)
    moveSize = np.abs(labelTrend)

    win = int(correct.sum())
    lose = int(correct.size) - win
    winAmount = float(moveSize[correct].sum())
    loseAmount = float(moveSize[~correct].sum())

    undefined = float("nan")
    bets = win + lose
    turnover = winAmount + loseAmount
    winRate = win / bets if bets else undefined
    profitRate = (winAmount - loseAmount) / turnover if turnover else undefined

    if pr:
        print("# correct bets:", win)
        print("# wrong bets:", lose)
        print("win percentage:", winRate)
        print("profit:", winAmount)
        print("loss:", loseAmount)
        print("gross profit:", winAmount - loseAmount, "\n")
        print("profit rate:", profitRate)

    return winRate, profitRate

In [None]:
def predictManyCoins(days, coinList, version=1):
    """Evaluate the saved model on each coin separately.

    Args:
        days: Prediction horizon passed to test().
        coinList: Coin symbols to evaluate one at a time.
        version: Model version passed to test().

    Returns:
        List of ``[coin, winRate, profitRate]`` entries, one per coin that was
        evaluated successfully. Coins that fail are skipped, and the failure
        is printed together with the coin and the error so it can be diagnosed.
    """
    coinRate = []

    for coin in coinList:
        try:
            prediction, yTest = test([coin], days, version)
            winRate, profitRate = profitPrediction(yTest, prediction, False)
        except Exception as err:
            # Best effort: one bad coin (unknown symbol, too little data,
            # network error) must not abort the whole run, but say which and why.
            print(f"test error for {coin}: {err!r}")
            continue

        coinRate.append([coin, winRate, profitRate])

    return coinRate

In [None]:
#coinList = ["BTC"] #"ETH", "ADA", "BNB", "DOGE", "XRP", "DOT", "SOL", "UNI", "LUNA", "BCH", "LTC", "LINK", "ICP", "WBTC", "MATIC", "ETC", "XLM", "AVAX", "VET", "FIL", "THETA", "TRON", "CAKE", "XMR", "AAVE", "EOS", "FTT", "AXS", "ATOM", "XTZ", "GRT", "KLAY", "CRO", "NEO", "BTCB", "MKR", "ALGO", "BSV", "SHIB", "LEO", "BTT", "EGLD", "WAVES", "KSM", "AMP", "DASH", "HBAR", "HT", "QNT", "DCR", "COMP", "RUNE", "NEAR", "HNT", "CHZ", "ZEC", "XDC", "HOT", "XEM", "TFUEL", "STX", "MANA", "ENJ", "SUSHI", "CEL", "SNX", "CELO", "YFI", "TEL", "ZIL", "RVN", "FLOW", "QTUM", "BTG", "REV", "AR", "OKB", "BAT", "FTM", "NEXO", "KCS", "SC", "PERP", "AUDIO", "BNT", "ZEN", "DGB", "MDX", "PAX", "ONT", "ICX", "ZRX", "CRV", "OMG", "NANO", "ANKR", "UMA", "IOTX"]
# Score the 1-day model coin by coin; each entry is [coin, winRate, profitRate].
coinRate = predictManyCoins(1, coinList)
# Display the collected rates (cell output).
coinRate

https://api.nomics.com/v1/candles?key=6d0dc559240e6cb9142b14d63b0591cb&interval=1d&currency=BTC&start=2021-02-26T00%3A00%3A00Z&end=2021-09-23T00%3A00%3A00Z
             open          high  ...     UUP         SPY
0    47246.741213  48275.871883  ...  24.520  376.644613
1    46397.514815  48045.684682  ...  25.060  416.430371
2    46236.657783  46518.374816  ...  24.650  409.194800
3    45254.702360  49605.545566  ...  24.540  385.774551
4    49624.978026  49848.901824  ...  24.450  382.764246
..            ...           ...  ...     ...         ...
205  48409.433581  48409.433581  ...  24.865  434.364645
206  47343.337266  47343.337266  ...  25.030  434.040000
207  43099.199973  43342.552946  ...  25.010  433.630000
208  40819.091077  44056.877847  ...  25.080  437.860000
209  43638.207839  44960.254091  ...  24.980  443.180000

[210 rows x 13 columns]
BTC (150, 60, 13)
https://api.nomics.com/v1/candles?key=6d0dc559240e6cb9142b14d63b0591cb&interval=1d&currency=ETH&start=2021-02-26T00%3

[['BTC', 0.7866666666666666, 0.8598625862991344],
 ['ETH', 0.8666666666666667, 0.9333512346341376],
 ['ADA', 0.84, 0.9236385194558392],
 ['BNB', 0.86, 0.9158136192430106],
 ['DOGE', 0.8533333333333334, 0.9281125365577173],
 ['XRP', 0.8466666666666667, 0.8633129437527604],
 ['DOT', 0.8133333333333334, 0.8732428537378071],
 ['SOL', 0.8133333333333334, 0.8560518935704045],
 ['UNI', 0.8266666666666667, 0.8571062184635209],
 ['LUNA', 0.7733333333333333, 0.8614170118472243],
 ['BCH', 0.8066666666666666, 0.821534975960467],
 ['LTC', 0.7933333333333333, 0.8205160287785338],
 ['LINK', 0.8066666666666666, 0.8541533422895067],
 ['ICP', 0.72, 0.37460030056426974],
 ['WBTC', 0.8333333333333334, 0.9218504919990066],
 ['MATIC', 0.8866666666666667, 0.9244262722327042],
 ['ETC', 0.8733333333333333, 0.9419826996723101],
 ['XLM', 0.78, 0.8737725275558619],
 ['AVAX', 0.7666666666666667, 0.8415232813448554],
 ['VET', 0.8533333333333334, 0.8783965772860303],
 ['FIL', 0.8066666666666666, 0.8298166680402087],

## Save results

In [None]:
def saveResults(coinData, days, version):
    """Write per-coin results to ``<model name>_results.csv`` in the working directory.

    Args:
        coinData: Rows of ``[coin, winRate, profitRate]``.
        days: Prediction horizon, used to pick the model name.
        version: Model version, used to pick the model name.

    An unknown ``days`` for a known version yields the name ``None``; an
    unknown ``version`` raises AttributeError.
    """
    table = pd.DataFrame(coinData, columns=["Coin", "WinRate", "ProfitRate"])

    # CSV file stem, keyed by version and then by days.
    stems = {
        1: {
            1: "fund_model1day_4L_100",
            7: "fund_model7day_4L_100",
            30: "fund_model30day_4L_100",
        },
        2: {
            7: "fund_model7day_v2_4L_100",
        },
    }
    stem = stems.get(version, None).get(days, None)
    table.to_csv(f"{stem}_results.csv")


In [None]:
# Write the 1-day, version-1 results to CSV, then display them (cell output).
saveResults(coinRate,1,1)
coinRate

[['BTC', 0.7866666666666666, 0.8598625862991344],
 ['ETH', 0.8666666666666667, 0.9333512346341376],
 ['ADA', 0.84, 0.9236385194558392],
 ['BNB', 0.86, 0.9158136192430106],
 ['DOGE', 0.8533333333333334, 0.9281125365577173],
 ['XRP', 0.8466666666666667, 0.8633129437527604],
 ['DOT', 0.8133333333333334, 0.8732428537378071],
 ['SOL', 0.8133333333333334, 0.8560518935704045],
 ['UNI', 0.8266666666666667, 0.8571062184635209],
 ['LUNA', 0.7733333333333333, 0.8614170118472243],
 ['BCH', 0.8066666666666666, 0.821534975960467],
 ['LTC', 0.7933333333333333, 0.8205160287785338],
 ['LINK', 0.8066666666666666, 0.8541533422895067],
 ['ICP', 0.72, 0.37460030056426974],
 ['WBTC', 0.8333333333333334, 0.9218504919990066],
 ['MATIC', 0.8866666666666667, 0.9244262722327042],
 ['ETC', 0.8733333333333333, 0.9419826996723101],
 ['XLM', 0.78, 0.8737725275558619],
 ['AVAX', 0.7666666666666667, 0.8415232813448554],
 ['VET', 0.8533333333333334, 0.8783965772860303],
 ['FIL', 0.8066666666666666, 0.8298166680402087],

In [None]:
# NOTE(review): `git init` takes a directory, not a remote URL, so this creates
# an empty repository in a local path named after the URL instead of fetching
# anything. Presumably `git clone` (or `git init` followed by
# `git remote add origin <url>`) was intended — confirm.
!git init https://github.com/davidjeans1/lstm.git