In [14]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,...,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500,label
0,793,2017-12-28,15864.099609,15888.400391,13937.299805,14606.5,12336499712,-1232.0,0.0,636.405515,...,1292.0,5.2,0.004025,1,2687.54,2686.1,1.44,0.000536,1,0.0
1,908,2018-06-14,6342.75,6707.140137,6334.459961,6675.350098,5138710016,325.450196,1.0,-383.20157,...,1303.1,5.2,0.00399,1,2782.49,2783.21,-0.72,-0.000259,0,0.0
2,224,2015-09-25,234.362,237.427002,233.684006,235.143997,22363600,0.61499,1.0,-1.731076,...,1151.0,-5.4,-0.004692,0,1931.34,1935.93,-4.59,-0.002371,0,1.0
3,1042,2018-12-26,3819.666748,3893.359619,3769.86377,3857.297607,5326547918,41.806884,1.0,-83.043066,...,1273.5,-0.5,-0.000393,0,2467.7,2363.12,104.58,0.044255,1,1.0
4,563,2017-01-31,920.958984,972.018982,920.958984,970.403015,164582000,50.020996,1.0,12.846431,...,1197.7,13.7,0.011439,1,2278.87,2274.02,4.85,0.002133,1,1.0


In [3]:
features_df.isnull().sum()

Unnamed: 0                    0
Date                          0
Open                          0
High                          0
Low                           0
Close                         0
Volume                        0
Daily_Change                  0
Daily_Change_Ind              0
MACD                          0
PROC_3                        0
PROC_5                        0
PROC_10                       0
wpr                           0
sto_os                        0
goog_trend_score              0
count                         0
compound                      0
retweets_count                0
likes_count                   0
replies_count                 0
compound_weighted_replies     0
compound_weighted_likes       0
compound_weighted_retweets    0
Daily_Change_Perc             0
Close/Last_Gold               0
Open_Gold                     0
Daily_Change_Gold             0
Daily_Change_Perc_Gold        0
Increased_Gold                0
Close/Last_SP500              0
Open_SP5

In [4]:
features_df.columns

Index(['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
       'Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5',
       'PROC_10', 'wpr', 'sto_os', 'goog_trend_score', 'count', 'compound',
       'retweets_count', 'likes_count', 'replies_count',
       'compound_weighted_replies', 'compound_weighted_likes',
       'compound_weighted_retweets', 'Daily_Change_Perc', 'Close/Last_Gold',
       'Open_Gold', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold',
       'Increased_Gold', 'Close/Last_SP500', 'Open_SP500',
       'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500',
       'label'],
      dtype='object')

In [15]:
# Creating X and y 

features_lst = ['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5','PROC_10', 'wpr',\
                'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count',\
                'compound_weighted_replies', 'compound_weighted_likes','compound_weighted_retweets',\
                'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', \
                'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500']

y = features_df['label']


In [6]:
def predict_btc(rf_model, features):
    '''
    Fits a Random Forest model to predict whether the price of bitcoin will go up or down
    Inputs:
        rf_model: the random forest model
        features: list of features to use
    Returns: 
         yhat: predictions  
    '''
    
    X = features_df[features_lst]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 42)
    
    rf.fit(X_train, y_train)

    yhat = rf.predict(X_valid)
    acc_score  = accuracy_score(y_valid, yhat)
    mse = mean_squared_error(y_valid, yhat)
    
    print('Fraction of correctly classified samples: ', acc_score)
    
    return (yhat, rf.feature_importances_)


In [7]:
rf = RandomForestClassifier()
yhat1, feature_import = predict_btc(rf, features_lst)

Fraction of correctly classified samples:  0.6528662420382165


In [8]:
for i in range(len(features_lst)):
    print(f"{features_lst[i]}: {feature_import[i]}")

Daily_Change: 0.042769767743863624
Daily_Change_Ind: 0.004831284059931657
MACD: 0.05088726328017087
PROC_3: 0.08912090800068667
PROC_5: 0.08358884421858975
PROC_10: 0.049805996143422765
wpr: 0.07397925309805797
sto_os: 0.07258941208249682
goog_trend_score: 0.035244468744150395
count: 0.030676741736961256
compound: 0.038097194443367455
retweets_count: 0.0359946171944019
likes_count: 0.033132506983833195
replies_count: 0.03283892515888229
compound_weighted_replies: 0.03679999421996541
compound_weighted_likes: 0.039880791375812547
compound_weighted_retweets: 0.04095499769618014
Daily_Change_Perc: 0.04625440363491208
Daily_Change_Gold: 0.03627544098130936
Daily_Change_Perc_Gold: 0.03580380618533167
Increased_Gold: 0.0045746746073670385
Daily_Change_SP500: 0.038740341660153096
Daily_Change_Perc_SP500: 0.042348618615382264
Increased_SP500: 0.004809748134769845


In [9]:
param_grid = [{'n_estimators': [5, 10, 15, 20, 25, 30]}, {'max_features': ['sqrt', 'log2', None]},
              {'oob_score': [True, False]}]


X = features_df[features_lst]
grid_search_rf = GridSearchCV(rf, param_grid, cv = 5, scoring = 'accuracy')
grid_search_rf.fit(X, y)


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'n_estimators': [5, 10, 15, 20, 25, 30]},
                         {'max_features': ['sqrt', 'log2', None]},
                         {'oob_score': [True, False]}],
             scoring='accuracy')

In [10]:
best_hp = grid_search_rf.best_params_
best_hp

{'oob_score': False}

In [11]:
rf_oob = RandomForestClassifier(oob_score=True)
yhat_oob = predict_btc(rf_oob, features_lst)

Fraction of correctly classified samples:  0.6560509554140127


In [25]:
dogecoin = pd.read_csv("dogecoin_train.csv")
dogecoin = dogecoin.rename(columns={'goog_trend': 'goog_trend_score'})

In [26]:
y2 = dogecoin['label']

In [27]:
def predict_doge(rf_model, features):
    '''
    Fits a Random Forest model to predict whether the price of bitcoin will go up or down
    Inputs:
        rf_model: the random forest model
        features: list of features to use
    Returns: 
         yhat: predictions  
    '''
    
    X = dogecoin[features_lst]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y2, random_state = 42)
    
    rf.fit(X_train, y_train)

    yhat = rf.predict(X_valid)
    acc_score  = accuracy_score(y_valid, yhat)
    mse = mean_squared_error(y_valid, yhat)
    
    print('Fraction of correctly classified samples: ', acc_score)
    
    return (yhat, rf.feature_importances_)

In [28]:
rf2 = RandomForestClassifier()
yhat1, feature_import = predict_doge(rf2, features_lst)

Fraction of correctly classified samples:  0.8536585365853658


In [29]:
for i in range(len(features_lst)):
    print(f"{features_lst[i]}: {feature_import[i]}")

Daily_Change: 0.046939303918911594
Daily_Change_Ind: 0.007871151133848146
MACD: 0.07961770450331775
PROC_3: 0.08726040073704334
PROC_5: 0.08730142063631043
PROC_10: 0.07250571766584481
wpr: 0.06259181929078961
sto_os: 0.09644013821431655
goog_trend_score: 0.0523637915224342
count: 0.016018323161874935
compound: 0.018943295804393204
retweets_count: 0.018241593761498462
likes_count: 0.019919112401886638
replies_count: 0.01825932661314547
compound_weighted_replies: 0.012579020691635287
compound_weighted_likes: 0.01652161637381665
compound_weighted_retweets: 0.019230673271766526
Daily_Change_Perc: 0.06825555260241903
Daily_Change_Gold: 0.042091134789235755
Daily_Change_Perc_Gold: 0.04535316249231582
Increased_Gold: 0.007215170573422292
Daily_Change_SP500: 0.05207822375673977
Daily_Change_Perc_SP500: 0.045276711144137095
Increased_SP500: 0.007125634938896608
