In [1]:
# import the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.regression import *
import pandas_datareader.data as pdr
import datetime as dt
import yfinance as yf
yf.pdr_override()


In [2]:
crypto_currency = 'BTC'
against_currency = 'USD'

start = dt.datetime(2020, 1, 1)
end = dt.datetime.now()

data = pdr.get_data_yahoo(f'{crypto_currency}-{against_currency}', start=start, end=end)
data = data.reset_index()
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-01 00:00:00+00:00,7194.892090,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
1,2020-01-02 00:00:00+00:00,7202.551270,7212.155273,6935.270020,6985.470215,6985.470215,20802083465
2,2020-01-03 00:00:00+00:00,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
3,2020-01-04 00:00:00+00:00,7345.375488,7427.385742,7309.514160,7410.656738,7410.656738,18444271275
4,2020-01-05 00:00:00+00:00,7410.451660,7544.497070,7400.535645,7411.317383,7411.317383,19725074095
...,...,...,...,...,...,...,...
1223,2023-05-08 00:00:00+00:00,28450.457031,28663.271484,27310.134766,27694.273438,27694.273438,19122903752
1224,2023-05-09 00:00:00+00:00,27695.068359,27821.400391,27375.601562,27658.775391,27658.775391,14128593256
1225,2023-05-10 00:00:00+00:00,27654.636719,28322.687500,26883.669922,27621.755859,27621.755859,20656025026
1226,2023-05-11 00:00:00+00:00,27621.085938,27621.941406,26781.826172,27000.789062,27000.789062,16724343943


In [3]:
#A variable for predicting one day into the future
future_days = 1
#Create a new column (the target or dependent variable) shifted 'n' units up, in this case n=1

data['Future_Price'] = data[['Close']].shift(-future_days)
#Choose the columns to keep

data = data[['Close', 'Future_Price']]
#show new data
data


Unnamed: 0,Close,Future_Price
0,7200.174316,6985.470215
1,6985.470215,7344.884277
2,7344.884277,7410.656738
3,7410.656738,7411.317383
4,7411.317383,7769.219238
...,...,...
1223,27694.273438,27658.775391
1224,27658.775391,27621.755859
1225,27621.755859,27000.789062
1226,27000.789062,26375.669922


In [4]:
#Make a copy of the dataset
df = data.copy()

#Crete the independent dataset
X = np.array(df[df.columns])

#Remove the last 'n' rows from the dataset where the 'n' is the future_days=1
X = X[:len(data)-future_days]

#Create the dependent dataset (y) and convert the  dataframe to a numpy array
y = np.array(df['Future_Price'])

#Get all of the y values except for the last 'n' rows which is the future_days=1
y = y[:-future_days]

#Split the dataset into 85% training and 15% testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0, shuffle=False)

In [5]:
#Get the train data and transform it into dataframe
train_data = pd.DataFrame(x_train, columns=df.columns)

#Show the first 7 rows of the data
train_data.head(7)

Unnamed: 0,Close,Future_Price
0,7200.174316,6985.470215
1,6985.470215,7344.884277
2,7344.884277,7410.656738
3,7410.656738,7411.317383
4,7411.317383,7769.219238
5,7769.219238,8163.692383
6,8163.692383,8079.862793


In [6]:
#get the test data and transform it into a dataframe
test_data = pd.DataFrame(x_test, columns=df.columns)

#Show the first 7 rows of the test data
test_data.head(7)

Unnamed: 0,Close,Future_Price
0,18541.271484,15880.780273
1,15880.780273,17586.771484
2,17586.771484,17034.292969
3,17034.292969,16799.185547
4,16799.185547,16353.365234
5,16353.365234,16618.199219
6,16618.199219,16884.613281


In [7]:
#Initializes the setup
regression_setup = setup(data=train_data, target='Future_Price', session_id= 123, use_gpu=True)

In [8]:
#Train on all of the models and sort them by the R-squared metric aka (r2) and then store the model with the highest R-squared value
best_model = compare_models()

In [9]:
#Create the model and show its prediction metrics on the training data
model = create_model('huber')

In [10]:
#Evaluate the model
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
#Get the predictions
unseen_predictions = predict_model(model, data=test_data)

#Show the predictions
unseen_predictions

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,410.5086,377327.2548,614.2697,0.9835,0.0277,0.0181


Unnamed: 0,Close,Future_Price,prediction_label
0,18541.271484,15880.780273,18563.035791
1,15880.780273,17586.771484,15905.696996
2,17586.771484,17034.292969,17609.666778
3,17034.292969,16799.185547,17057.842894
4,16799.185547,16353.365234,16823.014051
...,...,...,...
180,28454.978516,27694.273438,28464.996069
181,27694.273438,27658.775391,27705.192351
182,27658.775391,27621.755859,27669.736366
183,27621.755859,27000.789062,27632.760699


In [17]:
actual_prices = unseen_predictions['Future_Price']
prediction_prices = unseen_predictions['prediction_label']
now = len(actual_prices)
from_days = 5


percent_error = abs((actual_prices[(now - from_days):] - prediction_prices[(now - from_days)]) / actual_prices[now - from_days]) * 100
avg_percent_error = percent_error.mean()
avg_percent_error = round(float(avg_percent_error), 2)

print(f"Average percent error: {avg_percent_error}%")


Average percent error: 4.31%
