# BTC positive return prediction - Random Forest Classifier 

In [62]:
# Import libraries
import numpy as np
import pandas as pd
import pandas_ta as pta
from pathlib import Path
import os
import requests
import datetime

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

In [45]:
# Read in BTC-USD data 
btc_price = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/BTC-USD.csv', parse_dates=True, index_col="Date")
btc_price.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-22,36471.589844,36688.8125,34349.25,35030.25,35030.25,39714385405
2022-01-23,35047.359375,36433.3125,34784.96875,36276.804688,36276.804688,26017975951
2022-01-24,36275.734375,37247.519531,33184.058594,36654.328125,36654.328125,41856658597
2022-01-25,36654.804688,37444.570313,35779.429688,36954.003906,36954.003906,26428189594
2022-01-26,36950.515625,38825.410156,36374.90625,36852.121094,36852.121094,31324598034


In [46]:
# Construct the dependent variable where if daily return is greater than 0, then 1, else, 0.
btc_price['Daily_Return'] = btc_price['Adj Close'].pct_change()
btc_price['Positive_Return'] = np.where(btc_price['Daily_Return'] > 0, 1.0, 0.0)
btc_price = btc_price.dropna()

In [47]:
# Create lagged y prediction variable
btc_price['Lagged_Pos_Ret'] = btc_price['Positive_Return'].shift(1)
btc_price.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Daily_Return,Positive_Return,Lagged_Pos_Ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-22,36471.589844,36688.8125,34349.25,35030.25,35030.25,39714385405,-0.039143,0.0,0.0
2022-01-23,35047.359375,36433.3125,34784.96875,36276.804688,36276.804688,26017975951,0.035585,1.0,0.0
2022-01-24,36275.734375,37247.519531,33184.058594,36654.328125,36654.328125,41856658597,0.010407,1.0,1.0
2022-01-25,36654.804688,37444.570313,35779.429688,36954.003906,36954.003906,26428189594,0.008176,1.0,1.0
2022-01-26,36950.515625,38825.410156,36374.90625,36852.121094,36852.121094,31324598034,-0.002757,0.0,1.0


In [48]:
btc_price= btc_price.dropna()

In [49]:
# Calculate most dominant features using pandas-ta, RSI, TSI, ROC (most dominant features identified in btc_price_direction.ipynb)

#Calculate RSI (Relative Strength Index)
rsi_btc = pta.rsi(btc_price['Adj Close'],length=14).dropna()
rsi_btc.to_csv('resources/dominant_features/rsi_btc.csv')

In [50]:
# Calculate TSI (True Strength Index)
tsi_btc = pta.tsi(btc_price['Adj Close'],length=14).dropna()
tsi_btc = tsi_btc.drop(columns='TSIs_13_25_13')
tsi_btc.to_csv('resources/dominant_features/tsi_btc.csv')

In [51]:
# Calculate ROC (Rate of Change)
roc_btc = pta.roc(btc_price['Adj Close'],length=14).dropna()
roc_btc.to_csv('resources/dominant_features/roc_btc.csv')

In [52]:
# Read in RSI, TSI, ROC. Concat data into dominant features dataframe.
rsi_btc = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/rsi_btc.csv', parse_dates=True, index_col='Date')
tsi_btc = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/tsi_btc.csv', parse_dates=True, index_col='Date')
roc_btc = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/roc_btc.csv', parse_dates=True, index_col='Date')

dominant_features = pd.concat([rsi_btc, tsi_btc, roc_btc, btc_price], axis=1).dropna()
dominant_features = dominant_features.drop(columns=['Open','High','Low','Close','Volume'])
dominant_features.to_csv('resources/dominant_features/dominant_btc.csv')
dominant_features.tail()

Unnamed: 0_level_0,RSI_14,TSI_13_25_13,ROC_14,Adj Close,Daily_Return,Positive_Return,Lagged_Pos_Ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-22,20.338766,-37.285541,-16.062924,35030.25,-0.039143,0.0,0.0
2022-01-23,26.967368,-38.914563,-13.44448,36276.804688,0.035585,1.0,0.0
2022-01-24,28.897024,-39.547892,-12.354801,36654.328125,0.010407,1.0,1.0
2022-01-25,30.467542,-39.558198,-13.529275,36954.003906,0.008176,1.0,1.0
2022-01-26,30.223127,-39.638065,-16.148181,36852.121094,-0.002757,0.0,1.0


In [53]:
# Select model features and timeframe for train data
X = dominant_features[['RSI_14','ROC_14','TSI_13_25_13']]
X_train = X[:'2021']

In [54]:
# Select prediction variable and timeframe for train data
y = dominant_features['Lagged_Pos_Ret']
y_train = y[:'2021']

In [55]:
# Define timeframe for testing data
X_test = X['2022':]
y_test = y['2022':]

In [56]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [63]:
# Perform the grid search for the optimal set of tree parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

n_estimators = [20,500]
max_depth= [5,10]
num_folds = 10
scoring = 'accuracy'

criterion = ["gini","entropy"]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, \
  criterion = criterion )

model = RandomForestClassifier(n_jobs=-1)

kfold = KFold(n_splits=num_folds, random_state=None)
grid = GridSearchCV(estimator=model, param_grid=param_grid, \
  scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_,\
  grid_result.best_params_))

Best: 0.630178 using {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 500}


In [69]:
# Define the model using optimized settings (also use if you want to run without optimizer above)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier(n_jobs=-1)
model = RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=500, n_jobs=-1)

In [70]:
# Fit the model
model = model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [71]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,2
Actual 1,8,3


Accuracy Score : 0.6153846153846154
Classification Report
              precision    recall  f1-score   support

         0.0       0.62      0.87      0.72        15
         1.0       0.60      0.27      0.37        11

    accuracy                           0.62        26
   macro avg       0.61      0.57      0.55        26
weighted avg       0.61      0.62      0.58        26



In [72]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe
Results = y_test.to_frame()
Results["Predicted Value"] = predictions
Results[-20:]

Unnamed: 0_level_0,Lagged_Pos_Ret,Predicted Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-07,0.0,0.0
2022-01-08,0.0,0.0
2022-01-09,1.0,0.0
2022-01-10,1.0,0.0
2022-01-11,0.0,0.0
2022-01-12,1.0,1.0
2022-01-13,1.0,0.0
2022-01-14,0.0,1.0
2022-01-15,1.0,1.0
2022-01-16,1.0,1.0
