<a href="https://colab.research.google.com/github/dquerales/jupyter-automation-github-actions/blob/main/notebooks/pycaret_regression_time_series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to PyCaret - An open source low-code ML library

## Load libraries

In [234]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [235]:
from pycaret.regression import *
import pandas as pd
import numpy as np

## Load data

In [236]:
df = pd.read_csv('https://api.blockchain.info/charts/market-price?format=csv', header=None, names=['date', 'price'])

In [237]:
df.head()

Unnamed: 0,date,price
0,2022-05-07 00:00:00,36013.03
1,2022-05-08 00:00:00,35471.42
2,2022-05-09 00:00:00,34082.21
3,2022-05-10 00:00:00,30175.71
4,2022-05-11 00:00:00,31003.93


In [238]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    366 non-null    object 
 1   price   366 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB


## Data Preparation


In [239]:
df['date'] = pd.to_datetime(df['date'])
df['day'] = df["date"].dt.day
df['month'] = df["date"].dt.month
df['year'] = df["date"].dt.year

In [240]:
df['series'] = np.arange(1,len(df)+1)

In [241]:
df.head()

Unnamed: 0,date,price,day,month,year,series
0,2022-05-07,36013.03,7,5,2022,1
1,2022-05-08,35471.42,8,5,2022,2
2,2022-05-09,34082.21,9,5,2022,3
3,2022-05-10,30175.71,10,5,2022,4
4,2022-05-11,31003.93,11,5,2022,5


In [242]:
import plotly.express as px
fig = px.line(df, x="date", y="price", template = 'plotly_dark')
fig.show()

In [243]:
train_split = round(len(df)*0.7)
test_split = len(df) - train_split

In [244]:
train = df.head(train_split)
test = df.tail(test_split)

## Modelling

In [245]:
time_series = setup(data = train, 
                    test_data = test, 
                    target = 'price', 
                    fold_strategy = 'timeseries', 
                    # numeric_features = ['day', 'month', 'year', 'series'], ignore_features = ['date'], 
                    numeric_features = ['series'], ignore_features = ['day', 'month', 'year'], 
                    transform_target = True, 
                    session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,price
2,Target type,Regression
3,Original data shape,"(366, 6)"
4,Transformed data shape,"(366, 5)"
5,Transformed train set shape,"(256, 5)"
6,Transformed test set shape,"(110, 5)"
7,Ignore features,3
8,Numeric features,1
9,Date features,1


### Compare models

In [246]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1853.3572,8579176.6941,2247.0752,-6.3893,0.1034,0.091,0.176
et,Extra Trees Regressor,1981.3389,8801502.5024,2371.0812,-6.9149,0.1102,0.0999,0.309
rf,Random Forest Regressor,1803.2192,7748188.7482,2157.0314,-7.6299,0.1008,0.0894,0.577
dt,Decision Tree Regressor,1990.5171,10320342.8872,2427.7753,-8.138,0.1103,0.0981,0.129
ada,AdaBoost Regressor,1802.705,7247413.4882,2123.3031,-8.2656,0.1,0.0891,0.152
omp,Orthogonal Matching Pursuit,2113.5953,10715097.4677,2434.0677,-9.4094,0.1178,0.0996,0.215
lar,Least Angle Regression,2055.262,10065716.3363,2373.8168,-9.5768,0.1161,0.0971,0.158
ridge,Ridge Regression,2173.1562,10617629.3433,2477.2366,-9.811,0.123,0.1037,0.114
knn,K Neighbors Regressor,1843.9281,7685197.5547,2200.5449,-12.2209,0.1043,0.0935,0.213
lightgbm,Light Gradient Boosting Machine,2368.0159,11805418.2352,2764.5854,-16.4633,0.1267,0.1186,0.137


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [247]:
tuned_best_model = tune_model(best_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4724.2289,39965225.6643,6321.8056,-0.9356,0.2527,0.2223
1,1945.1995,5786520.5135,2405.5188,-8.7959,0.1167,0.096
2,2130.3486,5389658.4378,2321.5638,-7.6127,0.1061,0.092
3,1860.7973,5233550.9889,2287.6956,-0.8482,0.104,0.0892
4,3033.9359,10129593.7383,3182.7023,-9.953,0.1501,0.1545
5,3403.5338,11721471.2576,3423.6634,-84.2912,0.1631,0.176
6,2273.2351,5740896.0003,2396.0167,-9.0138,0.1145,0.1149
7,5197.3949,27165170.5461,5212.0217,-177.4163,0.274,0.3143
8,3578.7193,12906032.0456,3592.4966,-129.6279,0.1919,0.2107
9,2902.9185,9818277.3852,3133.4131,-3.2406,0.1688,0.1708


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


### Performance

In [248]:
# evaluate_model(best_model)

In [249]:
final_best_model = finalize_model(best_model)

## Test data

In [250]:
predictions = predict_model(best_model, data = test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,20485.6942,419744993.3636,20487.6791,-1.052413016880658e+21,9.3207,24547.904


In [251]:
predictions

Unnamed: 0,date,day,month,year,series,price,prediction_label
256,2023-01-18,18,1,2023,257,21145.179688,20894.516749
257,2023-01-19,19,1,2023,258,20672.779297,20213.282615
258,2023-01-20,20,1,2023,259,21087.759766,20128.466998
259,2023-01-21,21,1,2023,260,22693.449219,20128.466998
260,2023-01-22,22,1,2023,261,22772.500000,20128.466998
...,...,...,...,...,...,...,...
361,2023-05-03,3,5,2023,362,28678.289062,20365.698497
362,2023-05-04,4,5,2023,363,29007.660156,20365.698497
363,2023-05-05,5,5,2023,364,28851.839844,20765.575337
364,2023-05-06,6,5,2023,365,29535.380859,20765.575337


In [252]:
fig = px.line(predictions, x="date", y=["price", 'prediction_label'], template = 'plotly_dark')
fig.show()

## Predictions

In [253]:
start_date = df['date'].max()

In [254]:
end_date = df['date'].max() +  pd.DateOffset(days=10)

In [255]:
future_df = pd.DataFrame() 
future_dates = pd.date_range(start = start_date, end = end_date, freq = '1D')
future_df['date'] = future_dates  
future_df['day'] = future_df["date"].dt.day
future_df['month'] = future_df["date"].dt.month
future_df['year'] = future_df["date"].dt.year  
future_df['series'] = range(len(df), len(df)+len(future_dates))
future_df

Unnamed: 0,date,day,month,year,series
0,2023-05-07,7,5,2023,366
1,2023-05-08,8,5,2023,367
2,2023-05-09,9,5,2023,368
3,2023-05-10,10,5,2023,369
4,2023-05-11,11,5,2023,370
5,2023-05-12,12,5,2023,371
6,2023-05-13,13,5,2023,372
7,2023-05-14,14,5,2023,373
8,2023-05-15,15,5,2023,374
9,2023-05-16,16,5,2023,375


In [256]:
predictions_future = predict_model(final_best_model, data=future_df)
predictions_future.head()

Unnamed: 0,date,day,month,year,series,prediction_label
0,2023-05-07,7,5,2023,366,28935.857564
1,2023-05-08,8,5,2023,367,28935.857564
2,2023-05-09,9,5,2023,368,28696.319539
3,2023-05-10,10,5,2023,369,28529.369797
4,2023-05-11,11,5,2023,370,28703.204273


In [257]:
concat_df = pd.concat([df, predictions_future], axis=0).reset_index(drop=True)

In [258]:
concat_df

Unnamed: 0,date,price,day,month,year,series,prediction_label
0,2022-05-07,36013.03,7,5,2022,1,
1,2022-05-08,35471.42,8,5,2022,2,
2,2022-05-09,34082.21,9,5,2022,3,
3,2022-05-10,30175.71,10,5,2022,4,
4,2022-05-11,31003.93,11,5,2022,5,
...,...,...,...,...,...,...,...
372,2023-05-13,,13,5,2023,372,28703.204273
373,2023-05-14,,14,5,2023,373,29234.210861
374,2023-05-15,,15,5,2023,374,29207.413863
375,2023-05-16,,16,5,2023,375,29207.413863


In [259]:
px.line(concat_df, x=concat_df.index, y=["price", "prediction_label"], template = 'plotly_dark')
fig.show()

### Save model

In [260]:
save_model(best_model, 'model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('target_transformation',
                  TransformerWrapperWithInverse(transformer=TargetTransformer(estimator=PowerTransformer(standardize=False)))),
                 ('date_feature_extractor',
                  TransformerWrapper(include=['date'],
                                     transformer=ExtractDateTimeFeatures())),
                 ('numerical_imputer',
                  TransformerWrapper(include=['series'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model', GradientBoostingRegressor(random_state=123))]),
 'model.pkl')