# ETH positive return prediction - Random Forest Classifier 

In [97]:
# Import libraries
import numpy as np
import pandas as pd
import pandas_ta as pta
from pathlib import Path
import os
import requests
import datetime

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

In [98]:
# Read in ETH-USD data 
eth_price = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/ETH-USD.csv', parse_dates=True, index_col="Date")
eth_price.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-22,2561.145264,2615.247314,2330.247314,2405.181152,2405.181152,27369692036
2022-01-23,2406.924316,2542.144775,2381.515137,2535.039063,2535.039063,16481489511
2022-01-24,2535.891113,2537.208496,2172.30127,2440.352295,2440.352295,28220804648
2022-01-25,2440.393555,2498.50708,2359.384766,2455.935059,2455.935059,16179776932
2022-01-26,2455.579102,2705.78418,2417.683105,2468.030273,2468.030273,21229909340


In [99]:
# Construct the dependent variable where if daily return is greater than 0, then 1, else, 0.
eth_price['Daily_Return'] = eth_price['Adj Close'].pct_change()
eth_price['Positive_Return'] = np.where(eth_price['Daily_Return'] > 0, 1.0, 0.0)
eth_price = eth_price.dropna()

In [100]:
# Create lagged y prediction variable
eth_price['Lagged_Pos_Ret'] = eth_price['Positive_Return'].shift(1)
eth_price.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Daily_Return,Positive_Return,Lagged_Pos_Ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-11-10,320.67099,324.717987,294.541992,299.252991,299.252991,885985984,-0.067411,0.0,
2017-11-11,298.585999,319.453003,298.191986,314.681,314.681,842300992,0.051555,1.0,0.0
2017-11-12,314.690002,319.153015,298.513,307.90799,307.90799,1613479936,-0.021523,0.0,1.0
2017-11-13,307.024994,328.415009,307.024994,316.716003,316.716003,1041889984,0.028606,1.0,0.0
2017-11-14,316.763,340.177002,316.763,337.631012,337.631012,1069680000,0.066037,1.0,1.0


In [101]:
eth_price= eth_price.dropna()

In [102]:
# Calculate most dominant features using pandas-ta, RSI, TSI, ROC (most dominant features identified in eth_dominant_features.ipynb)

#Calculate RSI (Relative Strength Index)
rsi_eth = pta.rsi(eth_price['Adj Close'],length=14).dropna()
rsi_eth.to_csv('resources/dominant_features/rsi_eth.csv')

In [103]:
# Calculate TSI (True Strength Index)
tsi_eth = pta.tsi(eth_price['Adj Close'],length=14).dropna()
tsi_eth = tsi_eth.drop(columns='TSIs_13_25_13')
tsi_eth.to_csv('resources/dominant_features/tsi_eth.csv')

In [104]:
# Calculate ROC (Rate of Change)
roc_eth = pta.roc(eth_price['Adj Close'],length=14).dropna()
roc_eth.to_csv('resources/dominant_features/roc_eth.csv')

In [105]:
# Read in RSI, TSI, ROC. Concat data into dominant features dataframe.
rsi_eth = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/rsi_eth.csv', parse_dates=True, index_col='Date')
tsi_eth = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/tsi_eth.csv', parse_dates=True, index_col='Date')
roc_eth = pd.read_csv('/Users/draganbogatic/Documents/Fin_Tech/crypto-analysis/resources/dominant_features/roc_eth.csv', parse_dates=True, index_col='Date')

dominant_features = pd.concat([rsi_eth, tsi_eth, roc_eth, eth_price], axis=1).dropna()
dominant_features = dominant_features.drop(columns=['Open','High','Low','Close','Volume'])
dominant_features.to_csv('resources/dominant_features/dominant_eth.csv')
dominant_features.tail()

Unnamed: 0_level_0,RSI_14,TSI_13_25_13,ROC_14,Adj Close,Daily_Return,Positive_Return,Lagged_Pos_Ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-22,21.615801,-35.92364,-22.212082,2405.181152,-0.059716,0.0,0.0
2022-01-23,27.552694,-37.557619,-19.720121,2535.039063,0.053991,1.0,0.0
2022-01-24,26.005982,-39.388232,-20.847395,2440.352295,-0.037351,0.0,1.0
2022-01-25,26.734906,-40.595574,-24.155329,2455.935059,0.006385,1.0,0.0
2022-01-26,27.333284,-41.386144,-26.813724,2468.030273,0.004925,1.0,1.0


In [106]:
# Select model features and timeframe for train data
X = dominant_features[['RSI_14','ROC_14','TSI_13_25_13']]
X_train = X[:'2021']

In [107]:
# Select prediction variable and timeframe for train data
y = dominant_features['Lagged_Pos_Ret']
y_train = y[:'2021']

In [108]:
# Define timeframe for testing data
X_test = X['2022':]
y_test = y['2022':]

In [109]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [110]:
# Perform the grid search for the optimal set of tree parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

n_estimators = [20,500]
max_depth= [5,10]
num_folds = 10
scoring = 'accuracy'

criterion = ["gini","entropy"]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, \
  criterion = criterion )

model = RandomForestClassifier(n_jobs=-1)

kfold = KFold(n_splits=num_folds, random_state=None)
grid = GridSearchCV(estimator=model, param_grid=param_grid, \
  scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_,\
  grid_result.best_params_))

Best: 0.613149 using {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 500}


In [111]:
# Define the model using optimized settings (also use if you want to run without optimizer above)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier(n_jobs=-1)
model = RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=500, n_jobs=-1)

In [112]:
# Fit the model
model = model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [113]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,2
Actual 1,7,4


Accuracy Score : 0.6538461538461539
Classification Report
              precision    recall  f1-score   support

         0.0       0.65      0.87      0.74        15
         1.0       0.67      0.36      0.47        11

    accuracy                           0.65        26
   macro avg       0.66      0.62      0.61        26
weighted avg       0.66      0.65      0.63        26



In [114]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe
Results = y_test.to_frame()
Results["Predicted Value"] = predictions
Results[-20:]

Unnamed: 0_level_0,Lagged_Pos_Ret,Predicted Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-07,0.0,0.0
2022-01-08,0.0,0.0
2022-01-09,0.0,0.0
2022-01-10,1.0,0.0
2022-01-11,0.0,0.0
2022-01-12,1.0,1.0
2022-01-13,1.0,0.0
2022-01-14,0.0,1.0
2022-01-15,1.0,1.0
2022-01-16,1.0,1.0
