In [14]:
# import the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.regression import *
import pandas_datareader.data as pdr
import datetime as dt
import yfinance as yf
yf.pdr_override()


In [15]:
crypto_currency = 'BTC'
against_currency = 'USD'

start = dt.datetime(2020, 1, 1)
end = dt.datetime.now()

data = pdr.get_data_yahoo(f'{crypto_currency}-{against_currency}', start=start, end=end)
data = data.reset_index()
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-01 00:00:00+00:00,7194.892090,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
1,2020-01-02 00:00:00+00:00,7202.551270,7212.155273,6935.270020,6985.470215,6985.470215,20802083465
2,2020-01-03 00:00:00+00:00,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
3,2020-01-04 00:00:00+00:00,7345.375488,7427.385742,7309.514160,7410.656738,7410.656738,18444271275
4,2020-01-05 00:00:00+00:00,7410.451660,7544.497070,7400.535645,7411.317383,7411.317383,19725074095
...,...,...,...,...,...,...,...
1128,2023-02-02 00:00:00+00:00,23720.824219,24167.210938,23468.595703,23471.871094,23471.871094,32066936882
1129,2023-02-03 00:00:00+00:00,23469.412109,23678.103516,23279.955078,23449.322266,23449.322266,27083066007
1130,2023-02-04 00:00:00+00:00,23446.320312,23556.949219,23291.794922,23331.847656,23331.847656,15639298538
1131,2023-02-05 00:00:00+00:00,23332.248047,23423.435547,22841.759766,22955.666016,22955.666016,19564262605


In [16]:
#A variable for predicting one day into the future
future_days = 1
#Create a new column (the target or dependent variable) shifted 'n' units up, in this case n=1

data['Future_Price'] = data[['Close']].shift(-future_days)
#Choose the columns to keep

data = data[['Close', 'Future_Price']]
#show new data
data


Unnamed: 0,Close,Future_Price
0,7200.174316,6985.470215
1,6985.470215,7344.884277
2,7344.884277,7410.656738
3,7410.656738,7411.317383
4,7411.317383,7769.219238
...,...,...
1128,23471.871094,23449.322266
1129,23449.322266,23331.847656
1130,23331.847656,22955.666016
1131,22955.666016,22871.025391


In [17]:
#Make a copy of the dataset
df = data.copy()

#Crete the independent dataset
X = np.array(df[df.columns])

#Remove the last 'n' rows from the dataset where the 'n' is the future_days=1
X = X[:len(data)-future_days]

#Create the dependent dataset (y) and convert the  dataframe to a numpy array
y = np.array(df['Future_Price'])

#Get all of the y values except for the last 'n' rows which is the future_days=1
y = y[:-future_days]

#Split the dataset into 85% training and 15% testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0, shuffle=False)

In [18]:
#Get the train data and transform it into dataframe
train_data = pd.DataFrame(x_train, columns=df.columns)

#Show the first 7 rows of the data
train_data.head(7)

Unnamed: 0,Close,Future_Price
0,7200.174316,6985.470215
1,6985.470215,7344.884277
2,7344.884277,7410.656738
3,7410.656738,7411.317383
4,7411.317383,7769.219238
5,7769.219238,8163.692383
6,8163.692383,8079.862793


In [19]:
#get the test data and transform it into a dataframe
test_data = pd.DataFrame(x_test, columns=df.columns)

#Show the first 7 rows of the test data
test_data.head(7)

Unnamed: 0,Close,Future_Price
0,21166.060547,21534.121094
1,21534.121094,21398.908203
2,21398.908203,21528.087891
3,21528.087891,21395.019531
4,21395.019531,21600.904297
5,21600.904297,20260.019531
6,20260.019531,20041.738281


In [20]:
#Initializes the setup
regression_setup = setup(data=train_data, target='Future_Price', session_id= 123, use_gpu=True)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Future_Price
2,Target type,Regression
3,Original data shape,"(962, 2)"
4,Transformed data shape,"(962, 2)"
5,Transformed train set shape,"(673, 2)"
6,Transformed test set shape,"(289, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


In [21]:
#Train on all of the models and sort them by the R-squared metric aka (r2) and then store the model with the highest R-squared value
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,882.279,1947982.3062,1373.1744,0.9939,0.041,0.0284,0.041
llar,Lasso Least Angle Regression,884.9154,1948341.0375,1373.56,0.9939,0.0412,0.0287,0.033
lasso,Lasso Regression,882.2788,1947981.4062,1373.1741,0.9939,0.041,0.0284,0.038
br,Bayesian Ridge,882.2943,1947981.6312,1373.176,0.9939,0.041,0.0284,0.027
omp,Orthogonal Matching Pursuit,882.2785,1947981.6625,1373.1742,0.9939,0.041,0.0284,0.028
huber,Huber Regressor,879.6106,1945955.7646,1371.645,0.9939,0.0408,0.028,0.049
lar,Least Angle Regression,882.2785,1947981.6625,1373.1742,0.9939,0.041,0.0284,0.037
en,Elastic Net,882.2788,1947981.4062,1373.1741,0.9939,0.041,0.0284,0.035
ridge,Ridge Regression,882.2788,1947981.8562,1373.1743,0.9939,0.041,0.0284,0.03
lightgbm,Light Gradient Boosting Machine,996.5025,2264509.4346,1482.3327,0.9929,0.0478,0.0334,0.772


In [22]:
#Create the model and show its prediction metrics on the training data
model = create_model('huber')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,774.7646,1358720.7317,1165.6418,0.9962,0.0314,0.0236
1,726.5924,1243033.2579,1114.914,0.9962,0.034,0.0244
2,835.0774,1681305.378,1296.6516,0.996,0.036,0.0254
3,687.9011,1030946.221,1015.3552,0.9961,0.032,0.0228
4,1046.2051,2552189.2475,1597.5573,0.9927,0.0692,0.0362
5,982.6643,2084833.9023,1443.8954,0.9933,0.0406,0.0316
6,1011.9887,2555912.351,1598.7221,0.9912,0.041,0.0285
7,1154.4881,3625018.0099,1903.948,0.9886,0.0526,0.0347
8,806.8673,1659629.0229,1288.2659,0.9951,0.0368,0.0276
9,769.5566,1667969.5236,1291.4989,0.9938,0.0348,0.0249


In [11]:
#Evaluate the model
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [23]:
#Get the predictions
unseen_predictions = predict_model(model, data=test_data)

#Show the predictions
unseen_predictions

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,334.4935,280968.9539,530.065,0.9372,0.0276,0.0174


Unnamed: 0,Close,Future_Price,prediction_label
0,21166.060547,21534.121094,21195.459631
1,21534.121094,21398.908203,21563.218107
2,21398.908203,21528.087891,21428.116187
3,21528.087891,21395.019531,21557.189856
4,21395.019531,21600.904297,21424.230707
...,...,...,...
165,23723.769531,23471.871094,23751.069483
166,23471.871094,23449.322266,23499.377781
167,23449.322266,23331.847656,23476.847459
168,23331.847656,22955.666016,23359.469262


In [25]:
actual_prices = unseen_predictions['Future_Price']
prediction_prices = unseen_predictions['prediction_label']
now = len(actual_prices)
from_days = 9


percent_error = abs((actual_prices[(now - from_days):] - prediction_prices[(now - from_days)]) / actual_prices[now - from_days]) * 100
avg_percent_error = percent_error.mean()
avg_percent_error = round(avg_percent_error, 1)

print(f"Average percent error: {avg_percent_error}%")


Average percent error: 1.399999976158142%
