Using features extracted from data on historical bitcoin prices, gold prices, S&P 500 data and tweets, we will fit a Random Forest Classifier to predict whether bitcoin prices will go up or down. 

We experiment with different features here and choose the features that give the highest accuracy.

In [64]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

path = '/Users/aghasaifkhan/Desktop/UChicago/Spring_22/ML_CAPP/Project/crypto-currency/data'

In [65]:
features_df = pd.read_csv(os.path.join(path, 'merged_datasets.csv'), index_col = 'Unnamed: 0')
features_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Daily_Change,Daily_Change_Ind,MACD,...,compound,neg,neu,pos,retweets_count,likes_count,replies_count,compound_weighted_replies,compound_weighted_likes,compound_weighted_retweets
0,2014-10-13,377.92099,397.226013,368.897003,390.414001,390.414001,35221400,11.86499,1.0,-15.70184,...,0.1858,0.0,0.925667,0.074333,269,83,14,0.079629,0.127598,0.093245
1,2014-10-14,391.691986,411.697998,391.324005,400.869995,400.869995,38491500,10.455994,1.0,-11.895282,...,0.06758,0.0484,0.8498,0.1018,140,117,7,0.004257,-0.023816,0.009022
2,2014-10-15,400.954987,402.22699,388.765991,394.77301,394.77301,25267100,-6.096985,0.0,-9.263747,...,0.243837,0.025125,0.876375,0.098375,167,139,18,0.112306,0.243701,0.21332
3,2014-10-17,382.756012,385.477997,375.389008,383.757996,383.757996,13600700,1.201996,1.0,-6.948665,...,0.5434,0.0,0.787667,0.212333,92,81,4,0.4971,0.519504,0.528261
4,2014-10-20,389.230988,390.084015,378.252014,382.845001,382.845001,16419000,-6.700989,0.0,-3.828066,...,0.269725,0.0,0.8755,0.1245,550,135,2,0.12825,0.20944,0.042557


In [66]:
# Check for NaNs

features_df.isnull().sum()

Date                           0
Open                           0
High                           0
Low                            0
Close                          0
Adj Close                      0
Volume                         0
Daily_Change                   0
Daily_Change_Ind               0
MACD                           0
PROC_3                         0
PROC_5                         0
PROC_10                        0
wpr                            0
sto_os                         0
gold_close                     0
gold_label                     0
sp500_close                    0
sp500_label                    0
goog_trend_score               0
isPartial                      0
count                          0
compound                       0
neg                            0
neu                            0
pos                            0
retweets_count                 0
likes_count                    0
replies_count                  0
compound_weighted_replies     30
compound_w

In [67]:
features_df = features_df.dropna()
features_df.isnull().sum()

Date                          0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Daily_Change                  0
Daily_Change_Ind              0
MACD                          0
PROC_3                        0
PROC_5                        0
PROC_10                       0
wpr                           0
sto_os                        0
gold_close                    0
gold_label                    0
sp500_close                   0
sp500_label                   0
goog_trend_score              0
isPartial                     0
count                         0
compound                      0
neg                           0
neu                           0
pos                           0
retweets_count                0
likes_count                   0
replies_count                 0
compound_weighted_replies     0
compound_weighted_likes       0
compound

In [68]:
features_df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5',
       'PROC_10', 'wpr', 'sto_os', 'gold_close', 'gold_label', 'sp500_close',
       'sp500_label', 'goog_trend_score', 'isPartial', 'count', 'compound',
       'neg', 'neu', 'pos', 'retweets_count', 'likes_count', 'replies_count',
       'compound_weighted_replies', 'compound_weighted_likes',
       'compound_weighted_retweets'],
      dtype='object')

For the first model, we are using the same features that we used in our Logistic Regression model so that we can compare the results of both models.

In [69]:
# Creating X and y 

features_lst = ['Close', 'gold_close', 'sp500_close', 'goog_trend_score', 'compound', 'compound_weighted_replies', 
'compound_weighted_likes', 'compound_weighted_retweets']

y = features_df['Daily_Change_Ind']


In [75]:
def predict_btc(rf_model, features):
    '''
    Fits a Random Forest model to predict whether the price of bitcoin will go up or down
    Inputs:
        rf_model: the random forest model
        features: list of features to use
    Returns: 
         yhat: predictions  
    '''
    
    X = features_df[features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    
    rf.fit(X_train, y_train)

    yhat = rf.predict(X_test)
    acc_score  = accuracy_score(y_test, yhat)
    mse = mean_squared_error(y_test, yhat)
    
    print('Fraction of correctly classified samples: ', acc_score)
    
    return yhat


In [71]:
rf = RandomForestClassifier()
yhat1 = predict_btc(rf, features_lst)

Fraction of correctly classified samples:  0.4806201550387597


This accuracy is lower than the accuracy we got from fitting a logistic regresion model (0.5417721518987342), so we will experiment with some different hyperparamters, but still keep the same fetaures.



n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None

In [72]:
param_grid = [{'n_estimators': [5, 10, 15, 20, 25, 30]}, {'max_features': ['sqrt', 'log2', None]},
              {'oob_score': [True, False]}]


X = features_df[features_lst]
grid_search_rf = GridSearchCV(rf, param_grid, cv = 5, scoring = 'accuracy')
grid_search_rf.fit(X, y)


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [74]:
best_hp = grid_search_rf.best_params_
best_hp

{'n_estimators': 15}

The best hyperparamter from our hyperparamter grid is n_estimators =15 so we'll fit a random forest using that

In [82]:
rf_15 = RandomForestClassifier(n_estimators = 15)
yhat15 = predict_btc(rf_15, features_lst)

Fraction of correctly classified samples:  0.4935400516795866


The accuracy has increased slightly, but it is still less than our accuracy from the logistic regression