In [10]:
import pandas as pd
from pandas_datareader import data
from pandas_datareader._utils import RemoteDataError
from datetime import timedelta
import datetime

# Please read, my idea was to classify the data into three cateogries: "UP", "DOWN", "NEUTRAL"
# UP - the price of the stock is up MINIMUM_GAIN percent after LOOK_AHEAD_DAYS days.
# DOWN - the price of the stock is down MINIMUM_GAIN percent after LOOK_AHEAD_DAYS days.
# NEUTRAL - the price of stock is did not rise or fall past the MINIMUM_GAIN threshhold
#
# EXAMPLE: LOOK_AHEAD_DAYS = 30, MINIMUM_GAIN = .05 (5%). If stock is up over 5% ore more after 30 days, 
# we mark it as 'UP'. If the stock is down 5% or more after 30 days, we mark it as 'DOWN'. If the stock
# is neither up nor down 5%, then we mark it as 'NEUTRAL'

MINIMUM_GAIN = .05 #minimal gain to be considered up or down for classification
LOOK_AHEAD_DAYS = 30 #number of days into the future we are trying to predict

class Utilities:

    @staticmethod
    def getData(ticker, start_date, end_date):
        try:
            stock_data = data.DataReader(ticker,
                                         'yahoo',
                                         start_date,
                                         end_date)
            return stock_data
        except RemoteDataError:
            print('No data found for {t}'.format(t=ticker))
#just get the btc price points from the past year
start_date = datetime.datetime.now() - timedelta(365)
end_date = datetime.datetime.now() - timedelta(1)
btc_df = Utilities.getData('BTC-USD', str(start_date.strftime('%Y-%m-%d')), str(end_date.strftime('%Y-%m-%d')))
btc_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-08-05,12273.821289,11290.731445,11811.544922,11478.168945,2.363511e+10,11478.168945
2019-08-06,12036.990234,11433.701172,11476.193359,11941.968750,2.219499e+10,11941.968750
2019-08-07,11979.419922,11556.167969,11954.040039,11966.407227,1.948159e+10,11966.407227
2019-08-08,11970.458008,11709.745117,11953.469727,11862.936523,1.833999e+10,11862.936523
2019-08-09,11915.655273,11323.898438,11861.556641,11354.024414,1.812536e+10,11354.024414
...,...,...,...,...,...,...
2020-07-30,11415.864258,10987.053711,11110.210938,11323.466797,2.316047e+10,11323.466797
2020-07-31,11794.775391,11239.682617,11322.570312,11759.592773,2.607567e+10,11759.592773
2020-08-01,12034.144531,11018.129883,11758.764648,11053.614258,2.741007e+10,11053.614258
2020-08-02,11453.079102,11012.415039,11043.768555,11246.348633,2.027171e+10,11246.348633


In [11]:
btc_df = btc_df.assign(MonthTrend=lambda x: "N/A") #introduce new column that will be the trend we are predicting
btc_df = btc_df.loc[~btc_df.index.duplicated(keep='first')] #delete all duplicate indices

#Populate the classification column MonthTrend
for index, row in btc_df.iterrows():
    current_date = index.to_pydatetime()
    if (end_date - current_date).days > LOOK_AHEAD_DAYS:
        look_ahead_date = current_date + timedelta(days=LOOK_AHEAD_DAYS)
        
        while True: #we need to keep adding days to the look_ahead_date because there is no index for that day
            if look_ahead_date in btc_df.index:
                break
            print(look_ahead_date)
            look_ahead_date = look_ahead_date + timedelta(days=1)
        
        #Here we check if the open price in the future is higher than today's open price.
        if btc_df.loc[pd.Timestamp(look_ahead_date)]['Open'] >= btc_df.loc[index]['Open'] * (1 + MINIMUM_GAIN):
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'UP'
        elif btc_df.loc[pd.Timestamp(look_ahead_date)]['Open'] <= btc_df.loc[index]['Open'] * (1 - MINIMUM_GAIN):
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'DOWN'
        else:
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'NEUTRAL'
            
#Now delete all columns that have a MonthTrend value of N/A
btc_df = btc_df[btc_df.MonthTrend != 'N/A']

#Data is now fully processed and ready to be trained on the machine learning model
#There are 3 values for MonthTrend: UP, DOWN, NEUTRAL
btc_df

2019-10-27 00:00:00
2020-08-03 00:00:00
2020-08-04 00:00:00


Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,MonthTrend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-09-04,10627.269531,10516.417969,10588.183594,10575.533203,1.455124e+10,10575.533203,DOWN
2019-09-05,10898.761719,10292.299805,10578.198242,10353.302734,1.953657e+10,10353.302734,DOWN
2019-09-06,10558.673828,10348.918945,10353.931641,10517.254883,1.530737e+10,10517.254883,DOWN
2019-09-07,10595.637695,10409.090820,10518.114258,10441.276367,1.367057e+10,10441.276367,DOWN
2019-09-08,10450.311523,10144.929688,10443.228516,10334.974609,1.759594e+10,10334.974609,DOWN
...,...,...,...,...,...,...,...
2020-07-30,11415.864258,10987.053711,11110.210938,11323.466797,2.316047e+10,11323.466797,UP
2020-07-31,11794.775391,11239.682617,11322.570312,11759.592773,2.607567e+10,11759.592773,UP
2020-08-01,12034.144531,11018.129883,11758.764648,11053.614258,2.741007e+10,11053.614258,UP
2020-08-02,11453.079102,11012.415039,11043.768555,11246.348633,2.027171e+10,11246.348633,UP


In [18]:
# selecting the important feature open close
btc = btc_df[['Close','Open','MonthTrend']]
btc.head(10)

Unnamed: 0_level_0,Close,Open,MonthTrend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-04,10575.533203,10588.183594,DOWN
2019-09-05,10353.302734,10578.198242,DOWN
2019-09-06,10517.254883,10353.931641,DOWN
2019-09-07,10441.276367,10518.114258,DOWN
2019-09-08,10334.974609,10443.228516,DOWN
2019-09-09,10115.975586,10336.408203,DOWN
2019-09-10,10178.37207,10123.035156,DOWN
2019-09-11,10410.126953,10176.819336,DOWN
2019-09-12,10360.546875,10415.362305,NEUTRAL
2019-09-13,10358.048828,10345.40332,NEUTRAL


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
Y = btc['MonthTrend']
X = btc.drop(['MonthTrend'], axis=1)
X_train_temp, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state=2)
X_train =StandardScaler().fit_transform(X_train_temp)

In [39]:
# decision tree
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier(random_state=45)
param_grid = {'splitter': ["best","random"],  
              'min_samples_split': [2,4,6,8],
             'criterion':["gini","entropy"],
             'max_features':["auto", "sqrt", "log2"],
              'max_depth':[6,8,10,20],
             'max_leaf_nodes':[10,50,None]}  
grid =  GridSearchCV(model_tree,param_grid,scoring="accuracy", cv=10, verbose=1,n_jobs=6)
grid.fit(X_train_temp,Y_train)
print(grid.best_params_)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.5s


{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': 10, 'min_samples_split': 8, 'splitter': 'random'}


[Parallel(n_jobs=6)]: Done 5688 tasks      | elapsed:    4.2s
[Parallel(n_jobs=6)]: Done 5760 out of 5760 | elapsed:    4.3s finished


In [51]:
# error rate
model_tree = DecisionTreeClassifier(criterion='entropy',min_samples_split=8,splitter="random",max_features="auto",
                                    max_leaf_nodes=10,max_depth=6)
model_tree.fit(X_train,Y_train)
tree_error_rate = 1 - model_tree.score(X_test,Y_test)
tree_error_rate

0.52

In [52]:
model_tree.score(X_train, Y_train)

0.575107296137339