In [1]:
import pandas as pd
from pandas_datareader import data
from pandas_datareader._utils import RemoteDataError
from datetime import timedelta
import datetime

LOOK_BACK_DAYS = 30 #number of days into the past we would like to take into account
LOOK_AHEAD_DAYS = 1 #number of days into the future we are trying to predict

class Utilities:

    @staticmethod
    def getData(ticker, start_date, end_date):
        try:
            stock_data = data.DataReader(ticker,
                                         'yahoo',
                                         start_date,
                                         end_date)
            return stock_data
        except RemoteDataError:
            print('No data found for {t}'.format(t=ticker))
#just get the btc price points from the past year
start_date = datetime.datetime.now() - timedelta(80)
end_date = datetime.datetime.now()
btc_df = Utilities.getData('BTC-USD', str(start_date.strftime('%Y-%m-%d')), str(end_date.strftime('%Y-%m-%d')))
btc_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-12,9493.211914,9396.009766,9480.735352,9475.277344,1.756432e+10,9475.277344
2020-06-13,9482.270508,9347.593750,9477.553711,9386.788086,1.899173e+10,9386.788086
2020-06-14,9504.860352,8990.175781,9386.035156,9450.702148,2.669970e+10,9450.702148
2020-06-15,9579.430664,9400.445312,9454.266602,9538.024414,2.156554e+10,9538.024414
2020-06-16,9540.422852,9327.339844,9533.784180,9480.254883,2.017771e+10,9480.254883
...,...,...,...,...,...,...
2020-08-28,11585.640625,11466.292969,11541.054688,11506.865234,1.748560e+10,11506.865234
2020-08-29,11715.264648,11492.381836,11508.713867,11711.505859,1.976013e+10,11711.505859
2020-08-30,11768.876953,11598.318359,11713.306641,11680.820312,2.228593e+10,11680.820312
2020-08-31,12067.081055,11601.128906,11679.316406,11970.478516,2.731156e+10,11970.478516


In [2]:
def createLookBackCols():
    for i in range(LOOK_BACK_DAYS):
        num_rows= btc_df.shape[0]
        num_cols= btc_df.shape[1]
        btc_df.insert(loc = num_cols, column = 'High_Past_' + str(i + 1), value = ['N/A'] * num_rows)
        btc_df.insert(loc = num_cols, column = 'Low_Past_' + str(i + 1), value = ['N/A'] * num_rows)
        btc_df.insert(loc = num_cols, column = 'Open_Past_' + str(i + 1), value = ['N/A'] * num_rows)
        btc_df.insert(loc = num_cols, column = 'Close_Past_' + str(i + 1), value = ['N/A'] * num_rows)
        btc_df.insert(loc = num_cols, column = 'Volume_Past_' + str(i + 1), value = ['N/A'] * num_rows)
        btc_df.insert(loc = num_cols, column = 'Adj Close_Past_' + str(i + 1), value = ['N/A'] * num_rows)

        
createLookBackCols() #create additional look back columns
btc_df = btc_df.loc[~btc_df.index.duplicated(keep='first')] #delete all duplicate indices

for index, row in btc_df.iterrows():
    current_date = index.to_pydatetime()
    if (current_date - start_date).days >= LOOK_BACK_DAYS - 1:
        for i in range(LOOK_BACK_DAYS):
            index = i + 1
            look_back_date = current_date - timedelta(days=index)
            
            while True: #we need to keep subtracting days to the look_back_date because there is no index for that day
                if look_back_date in btc_df.index:
                    break
                look_back_date = look_back_date - timedelta(days=1)
            
            btc_df.at[pd.Timestamp(current_date), 'High_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'High']
            btc_df.at[pd.Timestamp(current_date), 'Low_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'Low']
            btc_df.at[pd.Timestamp(current_date), 'Open_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'Open']
            btc_df.at[pd.Timestamp(current_date), 'Close_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'Close']
            btc_df.at[pd.Timestamp(current_date), 'Volume_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'Volume']
            btc_df.at[pd.Timestamp(current_date), 'Adj Close_Past_' + str(index)] = btc_df.at[pd.Timestamp(look_back_date), 'Adj Close']
            
btc_df = btc_df[btc_df.High_Past_1 != 'N/A']
btc_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Adj Close_Past_1,Volume_Past_1,Close_Past_1,Open_Past_1,...,Close_Past_29,Open_Past_29,Low_Past_29,High_Past_29,Adj Close_Past_30,Volume_Past_30,Close_Past_30,Open_Past_30,Low_Past_30,High_Past_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-13,9283.841797,9171.661133,9238.703125,9243.213867,18085040000.0,9243.213867,9243.61,17519800000.0,9243.61,9277.21,...,9450.7,9386.04,8990.18,9504.86,9386.79,18991700000.0,9386.79,9477.55,9347.59,9482.27
2020-07-14,9275.325195,9171.933594,9241.897461,9192.836914,15844730000.0,9192.836914,9243.21,18085000000.0,9243.21,9238.7,...,9538.02,9454.27,9400.45,9579.43,9450.7,26699700000.0,9450.7,9386.04,8990.18,9504.86
2020-07-15,9214.3125,9088.947266,9191.980469,9132.227539,15713970000.0,9132.227539,9192.84,15844700000.0,9192.84,9241.9,...,9480.25,9533.78,9327.34,9540.42,9538.02,21565500000.0,9538.02,9454.27,9400.45,9579.43
2020-07-16,9182.253906,9089.202148,9131.8125,9151.392578,13944570000.0,9151.392578,9132.23,15714000000.0,9132.23,9191.98,...,9411.84,9481.57,9328.4,9482.78,9480.25,20177700000.0,9480.25,9533.78,9327.34,9540.42
2020-07-17,9230.983398,9100.824219,9151.183594,9159.040039,12252600000.0,9159.040039,9151.39,13944600000.0,9151.39,9131.81,...,9288.02,9410.29,9274.3,9440.88,9411.84,17770100000.0,9411.84,9481.57,9328.4,9482.78
2020-07-18,9201.398438,9097.632812,9158.005859,9185.817383,12939000000.0,9185.817383,9159.04,12252600000.0,9159.04,9151.18,...,9332.34,9290.96,9247.38,9394.97,9288.02,19632200000.0,9288.02,9410.29,9274.3,9440.88
2020-07-19,9214.270508,9137.509766,9187.220703,9164.231445,13755600000.0,9164.231445,9185.82,12939000000.0,9185.82,9158.01,...,9303.63,9330.93,9300.43,9401.11,9332.34,17130500000.0,9332.34,9290.96,9247.38,9394.97
2020-07-20,9407.262695,9149.389648,9162.514648,9374.887695,18069580000.0,9374.887695,9164.23,13755600000.0,9164.23,9187.22,...,9648.72,9300.92,9296.87,9655.07,9303.63,15324300000.0,9303.63,9330.93,9300.43,9401.11
2020-07-21,9530.518555,9319.65332,9375.080078,9525.363281,16532250000.0,9525.363281,9374.89,18069600000.0,9374.89,9162.51,...,9629.66,9644.08,9547.25,9670.54,9648.72,21104000000.0,9648.72,9300.92,9296.87,9655.07
2020-07-22,9610.24707,9483.003906,9527.141602,9581.072266,18146400000.0,9581.072266,9525.36,16532300000.0,9525.36,9375.08,...,9313.61,9632.15,9278.23,9680.37,9629.66,17006400000.0,9629.66,9644.08,9547.25,9670.54


In [3]:
btc_df = btc_df.assign(MonthTrend=lambda x: "N/A") #introduce new column that will be the trend we are predicting
btc_df = btc_df.loc[~btc_df.index.duplicated(keep='first')] #delete all duplicate indices

#Populate the classification column MonthTrend
for index, row in btc_df.iterrows():
    current_date = index.to_pydatetime()
   
    if (end_date - current_date).days >= LOOK_AHEAD_DAYS:
        look_ahead_date = current_date + timedelta(days=LOOK_AHEAD_DAYS)
        
        num_attempts = 0
        while True: #we need to keep adding days to the look_ahead_date because there is no index for that day
            if look_ahead_date in btc_df.index:
                break
            look_ahead_date = look_ahead_date + timedelta(days=1)
            num_attempts = num_attempts + 1
            if num_attempts > 10:
                break
        if num_attempts > 10:
            print('failure')
            break   
        
        #Here we check if the open price in the future is higher than today's open price.
        if btc_df.loc[pd.Timestamp(look_ahead_date)]['Open'] > btc_df.loc[index]['Open']:
            btc_df.at[pd.Timestamp(index), 'MonthTrend'] = 'UP'
        else:
            btc_df.at[pd.Timestamp(index), 'MonthTrend'] = 'DOWN'

In [4]:
def generateFeatures(features):
    complete_features = []
    for feature in features:
        complete_features.append(feature)
        for i in range(LOOK_BACK_DAYS):
            complete_features.append(feature + "_Past_" + str(i + 1))
    return complete_features

features = generateFeatures(['Close','Open'])
features.append('MonthTrend')

# selecting the important feature open close
btc = btc_df[features]
todays_row = btc_df.tail(1)
btc_df = btc_df[btc_df.MonthTrend != 'N/A']
btc = btc_df.tail(30)
btc

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Adj Close_Past_1,Volume_Past_1,Close_Past_1,Open_Past_1,...,Open_Past_29,Low_Past_29,High_Past_29,Adj Close_Past_30,Volume_Past_30,Close_Past_30,Open_Past_30,Low_Past_30,High_Past_30,MonthTrend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-08-01,12034.144531,11018.129883,11758.764648,11053.614258,27410070000.0,11053.614258,11759.6,26075700000.0,11759.6,11322.6,...,9084.23,9053.63,9183.3,9087.3,13079000000.0,9087.3,9124.84,9058.79,9202.34,DOWN
2020-08-02,11453.079102,11012.415039,11043.768555,11246.348633,20271710000.0,11246.348633,11053.6,27410100000.0,11053.6,11758.8,...,9126.09,8977.02,9162.18,9132.49,12290500000.0,9132.49,9084.23,9053.63,9183.3,UP
2020-08-04,11786.617188,11158.285156,11203.823242,11747.022461,24411250000.0,11747.022461,11246.3,20271700000.0,11246.3,11043.8,...,9349.16,9201.82,9360.62,9375.47,17889300000.0,9375.47,9072.85,9058.66,9375.47,UP
2020-08-05,11902.335938,11598.713867,11749.871094,11779.773438,23400740000.0,11779.773438,11747.0,24411300000.0,11747.0,11203.8,...,9253.02,9249.5,9450.34,9252.28,13839700000.0,9252.28,9349.16,9201.82,9360.62,UP
2020-08-06,11898.038086,11408.59375,11778.894531,11601.472656,23132310000.0,11601.472656,11779.8,23400700000.0,11779.8,11749.9,...,9427.99,9235.0,9431.38,9428.33,19702400000.0,9428.33,9253.02,9249.5,9450.34,DOWN
2020-08-07,11800.064453,11558.431641,11604.553711,11754.045898,17572060000.0,11754.045898,11601.5,23132300000.0,11601.5,11778.9,...,9273.36,9118.0,9287.47,9277.97,18000700000.0,9277.97,9427.99,9235.0,9431.38,UP
2020-08-08,11806.056641,11548.78418,11737.325195,11675.739258,17489610000.0,11675.739258,11754.0,17572100000.0,11754.0,11604.6,...,9277.51,9199.49,9293.53,9278.81,16860000000.0,9278.81,9273.36,9118.0,9287.47,DOWN
2020-08-09,12045.140625,11662.256836,11662.256836,11878.111328,26114110000.0,11878.111328,11675.7,17489600000.0,11675.7,11737.3,...,9241.05,9197.45,9319.42,9240.35,13249900000.0,9240.35,9277.51,9199.49,9293.53,UP
2020-08-10,11932.710938,11195.708984,11881.647461,11410.525391,27039780000.0,11410.525391,11878.1,26114100000.0,11878.1,11662.3,...,9277.21,9224.29,9306.41,9276.5,14452400000.0,9276.5,9241.05,9197.45,9319.42,DOWN
2020-08-11,11748.396484,11249.605469,11404.59668,11584.93457,25064550000.0,11584.93457,11410.5,27039800000.0,11410.5,11881.6,...,9238.7,9171.66,9283.84,9243.61,17519800000.0,9243.61,9277.21,9224.29,9306.41,UP


In [5]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np
Y = btc['MonthTrend'] # 1 is up, 0 is down
le =LabelEncoder()
Y = le.fit_transform(Y)
X = btc.drop(['MonthTrend'], axis=1)
X = StandardScaler().fit_transform(X)
todays_row = todays_row.drop(['MonthTrend'], axis=1)
todays_row = StandardScaler().fit_transform(todays_row)


In [6]:
Y

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1])

In [7]:
# logistic regression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
model_regression = LogisticRegression()
param_grid = {'C': [1,2,4,6,8,10],
             'penalty':["l1", "l2"],
             'fit_intercept':[True,False],
              'class_weight':["balanced",None],
              'warm_start':[True,False]}  #simplified the CV so it runs faster
regression_grid =  GridSearchCV(model_regression, param_grid, cv=TimeSeriesSplit(max_train_size=None, n_splits=10), verbose=1,n_jobs=6)
regression_grid.fit(X,Y)
model_regresssion= regression_grid.best_estimator_


Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  48 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 960 out of 960 | elapsed:    1.7s finished


In [8]:
model_regresssion.predict(todays_row)[0]

1