# Random Forest Training

---

### Import Libraries and Dependencies

In [47]:
import pandas as pd
from pandas import DataFrame
import requests
import os
import os.path
import datetime as dt
import json
import numpy as np
from datetime import timedelta, datetime
from dateutil import parser
import math
%matplotlib inline
from matplotlib import pyplot as plt

from dotenv import load_dotenv
load_dotenv()

import warnings
warnings.filterwarnings('ignore')

In [48]:
# Load the Binance API credentials from environment variables
# (load_dotenv() in the imports cell has already been called to populate them).
# os.getenv returns None when a variable is not set.
binance_api_key = os.getenv("BINANCE_API_KEY")
binance_secret_key= os.getenv("BINANCE_SECRET_KEY")

In [49]:
## Set up the Binance client shared by the download helpers defined below
from binance.client import Client
binance_client = Client(api_key=binance_api_key, api_secret=binance_secret_key)

# Constants
# Candle interval label -> length of one candle in minutes
binsizes = {"1m": 1, "5m": 5, "1h": 60, "1d": 1440}
# NOTE(review): batch_size is not referenced anywhere in the visible cells
batch_size = 750

# Work out the time range of candles that still has to be downloaded
def minutes_of_new_data(symbol, kline_size, data, source):
    """Return the ``(old, new)`` timestamps bounding the data still to fetch.

    Parameters
    ----------
    symbol : str
        Trading pair passed to the Binance client, e.g. ``'LINKBTC'``.
    kline_size : str
        Candle interval passed to the Binance client, e.g. ``'1h'``.
    data : pandas.DataFrame
        Previously stored candles; may be empty. When non-empty, the last
        value of its ``"timestamp"`` column must be a parseable date string.
    source : str
        Data source. Only ``"binance"`` is supported.

    Returns
    -------
    tuple
        ``old`` is the last stored timestamp, or 1 Jan 2015 when ``data`` is
        empty; ``new`` is the open time of the most recent candle returned by
        ``binance_client.get_klines``.

    Raises
    ------
    ValueError
        If ``source`` is not ``"binance"``. Previously this path fell through
        with ``old``/``new`` unassigned and ended in an UnboundLocalError.
    """
    if source != "binance":
        raise ValueError("Unsupported data source: %r (only 'binance' is supported)" % (source,))

    if len(data) > 0:
        # Resume from the last candle that is already stored
        old = parser.parse(data["timestamp"].iloc[-1])
    else:
        # Nothing stored yet: start from a date used as "beginning of history"
        old = datetime.strptime('1 Jan 2015', '%d %b %Y')

    # The first field of a kline is its open time in milliseconds
    latest_kline = binance_client.get_klines(symbol=symbol, interval=kline_size)[-1]
    new = pd.to_datetime(latest_kline[0], unit='ms')
    return old, new

# Create function to retrieve candles (prices and timestamps) from Binance
def get_all_binance(symbol, kline_size, save = False):
    """Download candles for ``symbol`` and merge them with any cached CSV.

    A CSV named ``'<symbol>-<kline_size>-data.csv'`` in the working directory
    is used as a cache: when it exists, only candles from its last timestamp
    onwards are requested and merged with the cached rows.

    Parameters
    ----------
    symbol : str
        Trading pair, e.g. ``'LINKBTC'``.
    kline_size : str
        Candle interval; must be a key of ``binsizes`` (e.g. ``'1h'``).
    save : bool, default False
        When True, write the merged data back to the cache CSV.

    Returns
    -------
    pandas.DataFrame
        Candles indexed by ``timestamp`` with the columns open, high, low,
        close, volume, close_time, quote_av, trades, tb_base_av, tb_quote_av
        and ignore. Freshly downloaded values are kept as returned by the API.
    """
    filename = '%s-%s-data.csv' % (symbol, kline_size)
    if os.path.isfile(filename):
        data_df = pd.read_csv(filename)
    else:
        data_df = pd.DataFrame()

    oldest_point, newest_point = minutes_of_new_data(symbol, kline_size, data_df, source = "binance")
    delta_min = (newest_point - oldest_point).total_seconds()/60
    available_data = math.ceil(delta_min/binsizes[kline_size])
    if oldest_point == datetime.strptime('1 Jan 2015', '%d %b %Y'):
        print('Downloading all available %s data for %s. Be patient..!' % (kline_size, symbol))
    else:
        print('Downloading %d minutes of new data available for %s, i.e. %d instances of %s data.' % (delta_min, symbol, available_data, kline_size))

    klines = binance_client.get_historical_klines(symbol, kline_size, oldest_point.strftime("%d %b %Y %H:%M:%S"), newest_point.strftime("%d %b %Y %H:%M:%S"))
    data = pd.DataFrame(klines, columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_av', 'trades', 'tb_base_av', 'tb_quote_av', 'ignore' ])
    data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')

    if len(data_df) > 0:
        # The cached CSV holds timestamps as text; parse them so cached and
        # new rows share one dtype and can be compared.
        data_df['timestamp'] = pd.to_datetime(data_df['timestamp'])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        data_df = pd.concat([data_df, data], ignore_index=True)
        # The request starts at the last cached timestamp, so that candle is
        # presumably returned again; keep the freshly downloaded copy.
        data_df = data_df.drop_duplicates(subset='timestamp', keep='last')
    else:
        data_df = data

    data_df = data_df.set_index('timestamp')
    if save:
        data_df.to_csv(filename)
    print('All caught up..!')
    return data_df

In [50]:
# Trading pairs of interest.
# NOTE(review): only 'LINKBTC' is actually used in the visible cells.
symbols = ['LINKBTC', 'VETBTC', 'WTCBTC']

In [51]:
# Get Chainlink data: 1-hour LINKBTC candles.
# `save` keeps its default of False, so nothing is written to the CSV cache.
link_data=get_all_binance('LINKBTC', '1h')
link_data.head()

Downloading all available 1h data for LINKBTC. Be patient..!
All caught up..!


Unnamed: 0_level_0,open,high,low,close,volume,close_time,quote_av,trades,tb_base_av,tb_quote_av,ignore
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-09-28 08:00:00,3e-05,3e-05,3e-05,3e-05,100.0,1506589199999,0.003,1,100.0,0.003,30001.696
2017-09-28 09:00:00,8.6e-05,8.9e-05,4.62e-05,4.62e-05,853945.0,1506592799999,48.52663619,645,257197.0,13.72629174,6500707.40881102
2017-09-28 10:00:00,4.7e-05,6.878e-05,4.7e-05,6.5e-05,2161631.0,1506596399999,121.7943162,843,929032.0,52.10048144,8465292.69686671
2017-09-28 11:00:00,6.5e-05,8.27e-05,6.219e-05,6.662e-05,2060651.0,1506599999999,144.5124556,1268,649557.0,46.64358554,13052415.49262562
2017-09-28 12:00:00,6.662e-05,6.664e-05,5.209e-05,6.002e-05,1124998.0,1506603599999,68.62397421,689,321803.0,19.15174859,18460641.76811298


In [52]:
# Keep only the price columns needed for the indicators below
link_data=link_data.loc[:,['high', 'low', 'close']]

In [53]:
# Convert every remaining column to a numeric dtype
link_data=link_data.apply(pd.to_numeric)

In [54]:
# Restrict the data to a fixed one-month study window (hard-coded bounds, both inclusive)
link_data = link_data.loc['2020-06-11 01:00:00':'2020-07-11 01:00:00']

In [55]:
link_data.head()

Unnamed: 0_level_0,high,low,close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-11 01:00:00,0.000448,0.000446,0.000448
2020-06-11 02:00:00,0.000449,0.000447,0.000447
2020-06-11 03:00:00,0.000448,0.000445,0.000446
2020-06-11 04:00:00,0.000447,0.000445,0.000445
2020-06-11 05:00:00,0.000447,0.000445,0.000445


In [56]:
link_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 718 entries, 2020-06-11 01:00:00 to 2020-07-11 01:00:00
Data columns (total 3 columns):
high     718 non-null float64
low      718 non-null float64
close    718 non-null float64
dtypes: float64(3)
memory usage: 22.4 KB


In [57]:
# Percentage change between consecutive closes. The candles are 1h, so despite
# the column name this is an hourly return, not a daily one.
link_data['daily_return']=link_data['close'].pct_change()

In [59]:
# Remove rows containing NaN in any column (pct_change leaves the first row's return undefined)
link_data = link_data.dropna()

In [60]:
link_data.head()

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-06-11 16:00:00,0.000442,0.000428,0.00043,-0.02526,0.000428,0.000449,7.306381,39.866613
2020-06-11 17:00:00,0.000441,0.000423,0.000437,0.016865,0.000423,0.000449,54.234654,36.464328
2020-06-11 18:00:00,0.000438,0.000428,0.00043,-0.017112,0.000423,0.000449,25.174825,28.905287
2020-06-11 19:00:00,0.000432,0.000426,0.00043,0.000908,0.000423,0.000449,26.689977,35.366485
2020-06-11 20:00:00,0.000431,0.000424,0.00043,0.000721,0.000423,0.000449,27.894328,26.586377


In [61]:
# Stochastic oscillator inputs over a 14-candle look-back.
# L14: lowest low of the last 14 candles
link_data['L14'] = link_data['low'].rolling(14).min()
# H14: highest high of the last 14 candles
link_data['H14'] = link_data['high'].rolling(14).max()
# %K: where the close sits inside the [L14, H14] range, scaled to 0-100
link_data['%K'] = (link_data['close'] - link_data['L14']).div(link_data['H14'] - link_data['L14']).mul(100)
# %D: 3-candle simple moving average of %K
link_data['%D'] = link_data['%K'].rolling(3).mean()

In [62]:
# --- Stochastic oscillator (SOSC) trading signals ---------------------------

# Sell entry: %K crosses down through %D (above it on the previous candle,
# below it on this one) while %D is above 80.
link_data['SOSC Sell Entry'] = ((link_data['%K'] < link_data['%D']) & (link_data['%K'].shift(1) > link_data['%D'].shift(1))) & (link_data['%D'] > 80)
# Sell exit: %K crosses back up through %D.
link_data['SOSC Sell Exit'] = ((link_data['%K'] > link_data['%D']) & (link_data['%K'].shift(1) < link_data['%D'].shift(1)))
# Short position column: -1 on an entry, 0 on an exit, NaN in between until it
# is forward-filled. An exit on the same row overrides an entry.
link_data['SOSC Short'] = np.nan
link_data.loc[link_data['SOSC Sell Entry'], 'SOSC Short'] = -1
link_data.loc[link_data['SOSC Sell Exit'], 'SOSC Short'] = 0
# Start flat on the first candle. A single positional .iloc assignment is used
# instead of the chained `link_data[col][0] = 0`, which can write to a
# temporary copy and silently leave the frame unchanged.
link_data.iloc[0, link_data.columns.get_loc('SOSC Short')] = 0
# Forward-fill so a position is held until the next entry or exit signal
link_data['SOSC Short'] = link_data['SOSC Short'].ffill()

# Buy entry: %K crosses up through %D while %D is below 20.
link_data['SOSC Buy Entry'] = ((link_data['%K'] > link_data['%D']) & (link_data['%K'].shift(1) < link_data['%D'].shift(1))) & (link_data['%D'] < 20)
# Buy exit: %K crosses back down through %D.
link_data['SOSC Buy Exit'] = ((link_data['%K'] < link_data['%D']) & (link_data['%K'].shift(1) > link_data['%D'].shift(1)))
# Long position column: 1 on an entry, 0 on an exit, NaN in between until it
# is forward-filled.
link_data['SOSC Long'] = np.nan
link_data.loc[link_data['SOSC Buy Entry'], 'SOSC Long'] = 1
link_data.loc[link_data['SOSC Buy Exit'], 'SOSC Long'] = 0
# Start flat on the first candle
link_data.iloc[0, link_data.columns.get_loc('SOSC Long')] = 0
# Forward-fill so a position is held until the next entry or exit signal
link_data['SOSC Long'] = link_data['SOSC Long'].ffill()

# Final strategy position: 1 for long, -1 for short and 0 for flat
link_data['SOSC Position'] = link_data['SOSC Long'] + link_data['SOSC Short']



In [63]:
link_data.head()

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D,SOSC Sell Entry,SOSC Sell Exit,SOSC Short,SOSC Buy Entry,SOSC Buy Exit,SOSC Long,SOSC Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-06-11 16:00:00,0.000442,0.000428,0.00043,-0.02526,,,,,False,False,0.0,False,False,0.0,0.0
2020-06-11 17:00:00,0.000441,0.000423,0.000437,0.016865,,,,,False,False,0.0,False,False,0.0,0.0
2020-06-11 18:00:00,0.000438,0.000428,0.00043,-0.017112,,,,,False,False,0.0,False,False,0.0,0.0
2020-06-11 19:00:00,0.000432,0.000426,0.00043,0.000908,,,,,False,False,0.0,False,False,0.0,0.0
2020-06-11 20:00:00,0.000431,0.000424,0.00043,0.000721,,,,,False,False,0.0,False,False,0.0,0.0


In [64]:
# Change in closing price from one candle to the next; rows holding a NaN in
# any column are then discarded
link_data['delta'] = link_data['close'].diff()
link_data = link_data.dropna()

# Split the changes into gains (up) and losses (down), zeroing the other side
up = link_data['delta'].mask(link_data['delta'] < 0, 0)
down = link_data['delta'].mask(link_data['delta'] > 0, 0)

window_length = 14

# Exponentially weighted means of the gains and of the absolute losses
roll_up1 = up.ewm(span=window_length).mean()
roll_down1 = down.abs().ewm(span=window_length).mean()

# Relative strength and the RSI derived from it
RS1 = roll_up1 / roll_down1
link_data['RSI1'] = 100.0 - (100.0 / (1.0 + RS1))

# RSI signal: short (-1) above 70, long (+1) below 30, flat (0) otherwise
link_data['RSI short'] = np.where(link_data['RSI1'].gt(70), -1.0, 0.0)
link_data['RSI long'] = np.where(link_data['RSI1'].lt(30), 1.0, 0.0)
link_data['RSI Position'] = link_data['RSI short'] + link_data['RSI long']


short_window = 7
long_window = 14

# `Fast` and `Slow` exponential moving averages of the close; the short and
# long windows are passed to ewm as half-lives
link_data['fast_close'] = link_data['close'].ewm(halflife=short_window).mean()
link_data['slow_close'] = link_data['close'].ewm(halflife=long_window).mean()

link_data.tail()

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D,SOSC Sell Entry,SOSC Sell Exit,...,SOSC Buy Exit,SOSC Long,SOSC Position,delta,RSI1,RSI short,RSI long,RSI Position,fast_close,slow_close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-10 21:00:00,0.000667,0.000653,0.000658,-0.011275,0.000623,0.000672,70.901639,79.487705,False,False,...,False,0.0,-1.0,-7.5e-06,58.345594,0.0,0.0,0.0,0.00065,0.000649
2020-07-10 22:00:00,0.000666,0.000656,0.000662,0.007268,0.000623,0.000672,80.696721,79.289617,False,True,...,False,0.0,0.0,4.78e-06,62.795763,0.0,0.0,0.0,0.000651,0.000649
2020-07-10 23:00:00,0.000665,0.000659,0.000662,-0.000845,0.000623,0.000672,79.54918,77.04918,False,False,...,False,0.0,0.0,-5.6e-07,61.901785,0.0,0.0,0.0,0.000652,0.00065
2020-07-11 00:00:00,0.000674,0.000662,0.000671,0.013356,0.000623,0.000674,93.781976,84.675959,False,False,...,False,0.0,0.0,8.84e-06,69.746601,0.0,0.0,0.0,0.000654,0.000651
2020-07-11 01:00:00,0.000675,0.000666,0.000667,-0.00486,0.000623,0.000675,85.318985,86.216714,True,False,...,True,0.0,-1.0,-3.26e-06,64.127871,0.0,0.0,0.0,0.000655,0.000652


In [65]:
# --- Combined RSI / moving-average ("RSI/MACD") trading signals -------------

# Sell entry: the fast EMA is below the slow EMA while RSI is above 70.
link_data['RSI/MACD Sell Entry'] = ((link_data['fast_close'] < link_data['slow_close']) & (link_data['RSI1'] > 70))
# Sell exit: the fast EMA is above the slow EMA.
link_data['RSI/MACD Sell Exit'] = ((link_data['fast_close'] > link_data['slow_close']))
# Short position column: -1 on an entry, 0 on an exit, NaN in between until it
# is forward-filled.
link_data['RSI/MACD Short'] = np.nan
link_data.loc[link_data['RSI/MACD Sell Entry'], 'RSI/MACD Short'] = -1
link_data.loc[link_data['RSI/MACD Sell Exit'], 'RSI/MACD Short'] = 0
# Start flat on the first candle. A single positional .iloc assignment is used
# instead of the chained `link_data[col][0] = 0`, which can write to a
# temporary copy and silently leave the frame unchanged.
link_data.iloc[0, link_data.columns.get_loc('RSI/MACD Short')] = 0
# Forward-fill so a position is held until the next entry or exit signal
link_data['RSI/MACD Short'] = link_data['RSI/MACD Short'].ffill()

# Buy entry: the fast EMA is above the slow EMA while RSI is below 30.
link_data['RSI/MACD Buy Entry'] = ((link_data['fast_close'] > link_data['slow_close']) & (link_data['RSI1'] < 30))
# Buy exit: the fast EMA is below the slow EMA.
link_data['RSI/MACD Buy Exit'] = (link_data['fast_close'] < link_data['slow_close'])
# Long position column: 1 on an entry, 0 on an exit, NaN in between until it
# is forward-filled.
link_data['RSI/MACD Long'] = np.nan
link_data.loc[link_data['RSI/MACD Buy Entry'], 'RSI/MACD Long'] = 1
link_data.loc[link_data['RSI/MACD Buy Exit'], 'RSI/MACD Long'] = 0
# Start flat on the first candle
link_data.iloc[0, link_data.columns.get_loc('RSI/MACD Long')] = 0
# Forward-fill so a position is held until the next entry or exit signal
link_data['RSI/MACD Long'] = link_data['RSI/MACD Long'].ffill()

# Final strategy position: 1 for long, -1 for short and 0 for flat
link_data['RSI/MACD Position'] = link_data['RSI/MACD Long'] + link_data['RSI/MACD Short']

link_data.tail()

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D,SOSC Sell Entry,SOSC Sell Exit,...,RSI Position,fast_close,slow_close,RSI/MACD Sell Entry,RSI/MACD Sell Exit,RSI/MACD Short,RSI/MACD Buy Entry,RSI/MACD Buy Exit,RSI/MACD Long,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-10 21:00:00,0.000667,0.000653,0.000658,-0.011275,0.000623,0.000672,70.901639,79.487705,False,False,...,0.0,0.00065,0.000649,False,True,0.0,False,False,0.0,0.0
2020-07-10 22:00:00,0.000666,0.000656,0.000662,0.007268,0.000623,0.000672,80.696721,79.289617,False,True,...,0.0,0.000651,0.000649,False,True,0.0,False,False,0.0,0.0
2020-07-10 23:00:00,0.000665,0.000659,0.000662,-0.000845,0.000623,0.000672,79.54918,77.04918,False,False,...,0.0,0.000652,0.00065,False,True,0.0,False,False,0.0,0.0
2020-07-11 00:00:00,0.000674,0.000662,0.000671,0.013356,0.000623,0.000674,93.781976,84.675959,False,False,...,0.0,0.000654,0.000651,False,True,0.0,False,False,0.0,0.0
2020-07-11 01:00:00,0.000675,0.000666,0.000667,-0.00486,0.000623,0.000675,85.318985,86.216714,True,False,...,0.0,0.000655,0.000652,False,True,0.0,False,False,0.0,0.0


In [66]:
# Work on a copy so the feature shifting and row drops below leave link_data untouched
trading_signals_df = link_data.copy()

In [67]:
# Model features: the three strategy position columns (-1 short, 0 flat, 1 long)
x_var_list = ['RSI Position', 'SOSC Position', 'RSI/MACD Position']

# Preview the most recent feature rows
trading_signals_df.loc[:, x_var_list].tail()

Unnamed: 0_level_0,RSI Position,SOSC Position,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-07-10 21:00:00,0.0,-1.0,0.0
2020-07-10 22:00:00,0.0,0.0,0.0
2020-07-10 23:00:00,0.0,0.0,0.0
2020-07-11 00:00:00,0.0,0.0,0.0
2020-07-11 01:00:00,0.0,-1.0,0.0


### Shift the Feature Values by 1

In [68]:
# Shift the feature values down by one row, so each row carries the signals
# of the previous candle while its return column is unchanged.
# Note: re-running this cell shifts the features again.
trading_signals_df[x_var_list] = trading_signals_df[x_var_list].shift(1)
trading_signals_df[x_var_list].tail()

Unnamed: 0_level_0,RSI Position,SOSC Position,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-07-10 21:00:00,0.0,-1.0,0.0
2020-07-10 22:00:00,0.0,-1.0,0.0
2020-07-10 23:00:00,0.0,0.0,0.0
2020-07-11 00:00:00,0.0,0.0,0.0
2020-07-11 01:00:00,0.0,0.0,0.0


In [79]:
# Export the shifted features for a hard-coded 2020-07-01 to 2020-07-11 window to CSV
trading_signals_df[x_var_list].loc['2020-07-01':'2020-07-11'].to_csv('link_x_test.csv')

### Drop NAs and Replace Infs (Positive/Negative Infinity) 

In [70]:
# Replace positive/negative infinity with NaN first, then drop every row that
# is missing a feature or the return. Doing the replacement before the drop
# means an infinite feature or return removes its row instead of surviving as NaN.
trading_signals_df = trading_signals_df.replace([np.inf, -np.inf], np.nan)
trading_signals_df = trading_signals_df.dropna(subset=x_var_list + ['daily_return'])
trading_signals_df.head()

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D,SOSC Sell Entry,SOSC Sell Exit,...,RSI Position,fast_close,slow_close,RSI/MACD Sell Entry,RSI/MACD Sell Exit,RSI/MACD Short,RSI/MACD Buy Entry,RSI/MACD Buy Exit,RSI/MACD Long,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-12 08:00:00,0.000439,0.000435,0.000437,-0.0016,0.000417,0.000441,82.044888,82.071121,True,False,...,-1.0,0.000437,0.000437,False,False,0.0,False,True,0.0,0.0
2020-06-12 09:00:00,0.000437,0.000435,0.000436,-0.001488,0.000417,0.000441,79.343308,82.114159,False,False,...,0.0,0.000437,0.000437,False,False,0.0,False,True,0.0,0.0
2020-06-12 10:00:00,0.000437,0.000432,0.000434,-0.005502,0.000417,0.000441,69.368246,76.918814,False,False,...,0.0,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0
2020-06-12 11:00:00,0.000437,0.000434,0.000436,0.005509,0.000417,0.000441,79.301746,76.004433,False,True,...,1.0,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0
2020-06-12 12:00:00,0.000439,0.000436,0.000439,0.005731,0.000417,0.000441,89.692436,79.454142,False,False,...,0.0,0.000437,0.000437,False,True,0.0,False,False,0.0,0.0


### Construct the Dependent Variable

In [71]:
# Dependent variable: 1.0 when the candle's return is strictly positive, 0.0 otherwise
trading_signals_df['Positive Return'] = trading_signals_df['daily_return'].gt(0).astype(float)
trading_signals_df

Unnamed: 0_level_0,high,low,close,daily_return,L14,H14,%K,%D,SOSC Sell Entry,SOSC Sell Exit,...,fast_close,slow_close,RSI/MACD Sell Entry,RSI/MACD Sell Exit,RSI/MACD Short,RSI/MACD Buy Entry,RSI/MACD Buy Exit,RSI/MACD Long,RSI/MACD Position,Positive Return
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-12 08:00:00,0.000439,0.000435,0.000437,-0.001600,0.000417,0.000441,82.044888,82.071121,True,False,...,0.000437,0.000437,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 09:00:00,0.000437,0.000435,0.000436,-0.001488,0.000417,0.000441,79.343308,82.114159,False,False,...,0.000437,0.000437,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 10:00:00,0.000437,0.000432,0.000434,-0.005502,0.000417,0.000441,69.368246,76.918814,False,False,...,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 11:00:00,0.000437,0.000434,0.000436,0.005509,0.000417,0.000441,79.301746,76.004433,False,True,...,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0,1.0
2020-06-12 12:00:00,0.000439,0.000436,0.000439,0.005731,0.000417,0.000441,89.692436,79.454142,False,False,...,0.000437,0.000437,False,True,0.0,False,False,0.0,0.0,1.0
2020-06-12 13:00:00,0.000440,0.000438,0.000438,-0.001983,0.000417,0.000441,86.076475,85.023552,False,False,...,0.000437,0.000437,False,True,0.0,False,False,0.0,0.0,0.0
2020-06-12 14:00:00,0.000438,0.000435,0.000435,-0.005870,0.000427,0.000441,58.162544,77.977152,False,False,...,0.000437,0.000437,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 15:00:00,0.000439,0.000433,0.000435,-0.000459,0.000430,0.000441,44.615385,62.951468,False,False,...,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 16:00:00,0.000436,0.000433,0.000435,-0.001195,0.000430,0.000441,39.909502,47.562477,False,False,...,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0,0.0
2020-06-12 17:00:00,0.000435,0.000432,0.000433,-0.003360,0.000430,0.000441,26.696833,37.073906,False,False,...,0.000436,0.000436,False,False,0.0,False,True,0.0,0.0,0.0


### Assign Training and Testing Windows

In [72]:
# Training window: from the first available date up to a fixed cut-off
training_start = trading_signals_df.index.min().strftime('%Y-%m-%d')
training_end = '2020-06-30'

# Testing window: from the day after the cut-off to the last available date
testing_start =  '2020-07-01'
testing_end = trading_signals_df.index.max().strftime('%Y-%m-%d')

# Report the four boundaries, one per line
print(
    f"Training Start: {training_start}",
    f"Training End: {training_end}",
    f"Testing Start: {testing_start}",
    f"Testing End: {testing_end}",
    sep="\n",
)

Training Start: 2020-06-12
Training End: 2020-06-30
Testing Start: 2020-07-01
Testing End: 2020-07-11


### Separate X and y Training Datasets

In [73]:
# Training features and target: rows inside the training window (both ends inclusive)
X_train = trading_signals_df.loc[training_start:training_end, x_var_list]
y_train = trading_signals_df.loc[training_start:training_end, 'Positive Return']

X_train.tail()

Unnamed: 0_level_0,RSI Position,SOSC Position,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-30 19:00:00,0.0,0.0,0.0
2020-06-30 20:00:00,0.0,0.0,0.0
2020-06-30 21:00:00,0.0,0.0,0.0
2020-06-30 22:00:00,0.0,0.0,0.0
2020-06-30 23:00:00,0.0,0.0,0.0


In [74]:
y_train.tail()

timestamp
2020-06-30 19:00:00    1.0
2020-06-30 20:00:00    0.0
2020-06-30 21:00:00    0.0
2020-06-30 22:00:00    1.0
2020-06-30 23:00:00    0.0
Name: Positive Return, dtype: float64

### Separate X and y Testing Datasets

In [75]:
# Testing features and target: rows inside the testing window (both ends inclusive)
X_test = trading_signals_df.loc[testing_start:testing_end, x_var_list]
y_test = trading_signals_df.loc[testing_start:testing_end, 'Positive Return']

X_test.tail()

Unnamed: 0_level_0,RSI Position,SOSC Position,RSI/MACD Position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-07-10 21:00:00,0.0,-1.0,0.0
2020-07-10 22:00:00,0.0,-1.0,0.0
2020-07-10 23:00:00,0.0,0.0,0.0
2020-07-11 00:00:00,0.0,0.0,0.0
2020-07-11 01:00:00,0.0,0.0,0.0


In [76]:
y_test.tail()

timestamp
2020-07-10 21:00:00    0.0
2020-07-10 22:00:00    1.0
2020-07-10 23:00:00    0.0
2020-07-11 00:00:00    1.0
2020-07-11 01:00:00    0.0
Name: Positive Return, dtype: float64

In [38]:
# Export the test-set target to CSV, with a header row
y_test.to_csv('link_y_test.csv', header=True)

### Import SKLearn Library and Classes

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Train Random Forest Model 

In [42]:
# Fit a scikit-learn random forest classifier using only the training set (X_train, y_train).
# random_state is fixed so the fitted forest is reproducible.
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
model.fit(X_train, y_train)

# Predict the target class for every row of X_test
# NOTE(review): `predictions` is not added to `Results` in this cell; confirm
# where (or whether) it is consumed.
predictions = model.predict(X_test)

# Assemble the actual y data (y_test) next to the realised return over the same
# testing window. The window reuses testing_start/testing_end rather than
# repeating the dates as literals, so it cannot drift from the train/test split.
Results = y_test.to_frame()
Results["Return"] = link_data['daily_return'].loc[testing_start:testing_end]
Results

Unnamed: 0_level_0,Positive Return,Return
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-01 00:00:00,0.0,-0.002383
2020-07-01 01:00:00,1.0,0.001224
2020-07-01 02:00:00,0.0,-0.001223
2020-07-01 03:00:00,0.0,-0.000943
2020-07-01 04:00:00,1.0,0.001607
2020-07-01 05:00:00,1.0,0.002828
2020-07-01 06:00:00,0.0,-0.005861
2020-07-01 07:00:00,0.0,-0.000060
2020-07-01 08:00:00,1.0,0.012557
2020-07-01 09:00:00,1.0,0.000119


In [43]:
# Rename the two columns positionally: 'Positive Return' -> 'Actual Value'; 'Return' keeps its name
Results.columns = ['Actual Value', 'Return']

In [44]:
Results.head()

Unnamed: 0_level_0,Actual Value,Return
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-01 00:00:00,0.0,-0.002383
2020-07-01 01:00:00,1.0,0.001224
2020-07-01 02:00:00,0.0,-0.001223
2020-07-01 03:00:00,0.0,-0.000943
2020-07-01 04:00:00,1.0,0.001607


In [45]:
# Export the actual class and realised return for the test window to CSV
Results.to_csv('link_results.csv')

### Save Pre-Trained Model Using Joblib

In [46]:
# Save the pre-trained model
# Persist the fitted model to disk; dump returns the list of file names written
# NOTE(review): `load` is imported but not used in the visible cells
from joblib import dump, load
dump(model, 'link_random_forest_model.joblib')

['link_random_forest_model.joblib']