In [126]:
import pandas as pd
from pandas_datareader import data
from pandas_datareader._utils import RemoteDataError
from datetime import timedelta

# My idea was to classify the data into three cateogries: "UP", "DOWN", "NEUTRAL"
# UP - the price of the stock is up MINIMUM_GAIN percent or more after LOOK_AHEAD_DAYS days.
# DOWN - the price of the stock is down MINIMUM_GAIN percent ore more after LOOK_AHEAD_DAYS days.
# NEUTRAL - the price of stock is did not rise or fall past the MINIMUM_GAIN threshhold
#
# EXAMPLE: LOOK_AHEAD_DAYS = 30, MINIMUM_GAIN = .05 (5%). If stock is up over 5% or more after 30 days, 
# we mark it as 'UP'. If the stock is down 5% or more after 30 days, we mark it as 'DOWN'. If the stock
# is neither up nor down 5%, then we mark it as 'NEUTRAL'

MINIMUM_GAIN = .05 #minimal gain to be considered up or down for classification
LOOK_AHEAD_DAYS = 30 #number of days into the future we are trying to predict

class Utilities:

    @staticmethod
    def getData(ticker, start_date, end_date):
        try:
            stock_data = data.DataReader(ticker,
                                         'yahoo',
                                         start_date,
                                         end_date)
            return stock_data
        except RemoteDataError:
            print('No data found for {t}'.format(t=ticker))

In [120]:
# Just get the btc price points from the past year
start_date = datetime.now() - timedelta(365)
end_date = datetime.now() - timedelta(1)
btc_df = Utilities.getData('BTC-USD', str(start_date.strftime('%Y-%m-%d')), str(end_date.strftime('%Y-%m-%d')))
btc_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-08-04,11895.091797,10960.735352,10960.735352,11805.653320,2.387599e+10,11805.653320
2019-08-05,12273.821289,11290.731445,11811.544922,11478.168945,2.363511e+10,11478.168945
2019-08-06,12036.990234,11433.701172,11476.193359,11941.968750,2.219499e+10,11941.968750
2019-08-07,11979.419922,11556.167969,11954.040039,11966.407227,1.948159e+10,11966.407227
2019-08-08,11970.458008,11709.745117,11953.469727,11862.936523,1.833999e+10,11862.936523
...,...,...,...,...,...,...
2020-07-29,11169.356445,10895.455078,11099.833008,11111.213867,2.285725e+10,11111.213867
2020-07-30,11415.864258,10987.053711,11110.210938,11323.466797,2.316047e+10,11323.466797
2020-07-31,11794.775391,11239.682617,11322.570312,11759.592773,2.607567e+10,11759.592773
2020-08-01,12034.144531,11018.129883,11758.764648,11053.614258,2.741007e+10,11053.614258


In [144]:
btc_df = btc_df.assign(MonthTrend=lambda x: "N/A") # Introduce new column that will be the trend we are predicting
btc_df = btc_df.loc[~btc_df.index.duplicated(keep='first')] # Delete all duplicate indices

# Populate the classification column MonthTrend
for index, row in btc_df.iterrows():
    current_date = index.to_pydatetime()
    if (end_date - current_date).days > LOOK_AHEAD_DAYS:
        look_ahead_date = current_date + timedelta(days=LOOK_AHEAD_DAYS)
        
        while True: #we need to keep adding days to the look_ahead_date because there is no index for that day
            if look_ahead_date in btc_df.index:
                break
            print(look_ahead_date)
            look_ahead_date = look_ahead_date + timedelta(days=1)
        
        #Here we check if the open price in the future is higher than today's open price.
        if btc_df.loc[pd.Timestamp(look_ahead_date)]['Open'] >= btc_df.loc[index]['Open'] * (1 + MINIMUM_GAIN):
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'UP'
        elif btc_df.loc[pd.Timestamp(look_ahead_date)]['Open'] <= btc_df.loc[index]['Open'] * (1 - MINIMUM_GAIN):
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'DOWN'
        else:
            btc_df.at[pd.Timestamp(look_ahead_date), 'MonthTrend'] = 'NEUTRAL'
            
# Now delete all columns that have a MonthTrend value of N/A
btc_df = btc_df[btc_df.MonthTrend != 'N/A']

# Data is now fully processed and ready to be trained on the machine learning model
# There are 3 values for MonthTrend: UP, DOWN, NEUTRAL
# Please be careful when testing the model of leakage. There should be a LOOK_AHEAD_DAYS day gap between the train and test data sets
# otherwise, the train data set will unrealistically know something in the future
btc_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,MonthTrend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01,7254.330566,7174.944336,7194.892090,7200.174316,1.856566e+10,7200.174316,NEUTRAL
2020-01-02,7212.155273,6935.270020,7202.551270,6985.470215,2.080208e+10,6985.470215,NEUTRAL
2020-01-03,7413.715332,6914.996094,6984.428711,7344.884277,2.811148e+10,7344.884277,NEUTRAL
2020-01-04,7427.385742,7309.514160,7345.375488,7410.656738,1.844427e+10,7410.656738,NEUTRAL
2020-01-05,7544.497070,7400.535645,7410.451660,7411.317383,1.972507e+10,7411.317383,NEUTRAL
...,...,...,...,...,...,...,...
2020-07-29,11169.356445,10895.455078,11099.833008,11111.213867,2.285725e+10,11111.213867,UP
2020-07-30,11415.864258,10987.053711,11110.210938,11323.466797,2.316047e+10,11323.466797,UP
2020-07-31,11794.775391,11239.682617,11322.570312,11759.592773,2.607567e+10,11759.592773,UP
2020-08-01,12034.144531,11018.129883,11758.764648,11053.614258,2.741007e+10,11053.614258,UP
