In [8]:
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import load_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler

## Load the ANN model for performance check later

In [9]:
# Load the best model from training/testing (a Keras HDF5 checkpoint).
# It is used further down to predict on the scaled holdout features.
ann_model = load_model('../trained_models/ANN4_reg_nextday300Adam0-01relu64Int4_341.hdf5')


## Get the csv data to be used to fit scaler and test price pred

In [10]:
# Read the feature table and the price table from their CSV files.
# index_col=[0] turns the first CSV column into the row index, so positional
# column 0 of each frame is the Date column (see the .head() outputs below).
df = pd.read_csv('./data/final_feats_df.csv', index_col=[0])

price_df = pd.read_csv('./data/price_df.csv', index_col=[0])


In [11]:
# Preview the first rows of the price table (Date and priceUSD columns)
price_df.head()

Unnamed: 0,Date,priceUSD
0,2010/07/17,0.0495
1,2010/07/18,0.0726
2,2010/07/19,0.0859
3,2010/07/20,0.0783
4,2010/07/21,0.0767


## Create a holdout dataset of the 30 days used in the model training notebook

In [12]:
# Flag the rows whose date (first column) falls inside the holdout window,
# both end dates included, then keep only those rows of the feature table.
# Dates are 'YYYY/MM/DD' strings, so plain string ordering matches date ordering.
date_bool = df.iloc[:, 0].between('2021/09/09', '2021/10/08')
X_holdout = df.loc[date_bool]

In [13]:
# Independent copy of the feature table; X is narrowed to the training
# interval in a later cell, while df itself stays as loaded.
X = df.copy()
X.head()

Unnamed: 0,Date,median_transaction_fee3momUSD,fee_to_reward7momUSD,top100cap7mom,mining_profitability7rsi,top100cap14mom,price3wmaUSD,transactionvalue90emaUSD,difficulty30sma,fee_to_reward90smaUSD
0,2010/07/17,0.0,0.0,-0.508,0.0,-0.088,0.0,0.0,29.004,0.0
1,2010/07/18,0.0,0.0,-0.477,0.0,-0.327,0.0,0.0,34.476,0.0
2,2010/07/19,0.0,0.0,-0.184,0.0,-0.49,0.075,0.0,39.948,0.0
3,2010/07/20,0.0,0.0,0.005,0.0,-0.532,0.08,0.0,45.421,0.0
4,2010/07/21,0.0,0.0,0.163,0.0,-0.599,0.079,0.0,50.893,0.0


In [14]:
# Full daily price series; it is trimmed to the training interval further down
y = price_df['priceUSD'].copy()

# Add a column holding the following day's price (shift(-1) leaves the last row NaN)
price_df['next_day_price'] = price_df['priceUSD'].shift(-1)
# Select the price rows for the same date window as the feature holdout set
price_bool = price_df['Date'].between('2021/09/09', '2021/10/08')
holdout_y = price_df.loc[price_bool]

In [15]:
# Preview the holdout prices together with the next-day price column
holdout_y.head()

Unnamed: 0,Date,priceUSD,next_day_price
4072,2021/09/09,46406.0,45928.0
4073,2021/09/10,45928.0,45367.0
4074,2021/09/11,45367.0,45624.0
4075,2021/09/12,45624.0,44807.0
4076,2021/09/13,44807.0,46027.0


## Create the scaler using the same split random state as in the training notebook

In [16]:
# Select the Interval 4 rows straight from the untouched `df` instead of from
# `X` itself. The earlier version overwrote `X` with its own date-less slice,
# so running the cell a second time compared a numeric feature column against
# date strings; building from `df` makes the cell safe to re-run.
date_bool = (df.iloc[:, 0] >= '2013/04/01') & (df.iloc[:, 0] <= '2021/09/01')
# keep the interval rows and drop the leading Date column
X = df[date_bool].iloc[:, 1:]
# align the price series with the selected rows by index label
y = y.loc[X.index]

# split with a fixed random_state so the partition is reproducible
# (per the section heading, the same state as the training notebook)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

# chain the two scalers as (name, transformer) tuples: min-max scaling first,
# then robust scaling applied to the min-max output
estimators = [
    ('minmax', MinMaxScaler()),
    ('robust', RobustScaler()),
]
scaler = Pipeline(estimators, verbose=True)
# fit on the training split only, so test and holdout rows do not influence the scaler
scaler.fit(X_train)
# apply the fitted scaler to the training, testing, and holdout features
X_train_tran = scaler.transform(X_train)
X_test_tran = scaler.transform(X_test)
# the holdout frame still carries its Date column, so drop it before scaling
X_holdout_tran = scaler.transform(X_holdout.iloc[:, 1:])

[Pipeline] ............ (step 1 of 2) Processing minmax, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing robust, total=   0.0s


## Test the performance of the scaler

In [17]:
# Predict with the loaded ANN on the scaled holdout features
# (the output below shows one predicted value per holdout row)
pred_next_day_price = ann_model.predict(X_holdout_tran)

In [18]:
# Show the raw predictions for the holdout window
pred_next_day_price

array([[47190.62 ],
       [46391.094],
       [46268.383],
       [45928.793],
       [45854.363],
       [46305.07 ],
       [47603.21 ],
       [48215.492],
       [48753.285],
       [48586.21 ],
       [48387.203],
       [46863.777],
       [44517.766],
       [43828.266],
       [44344.992],
       [43995.19 ],
       [43532.945],
       [43552.953],
       [44078.562],
       [43460.08 ],
       [42964.117],
       [43913.12 ],
       [45426.953],
       [47740.203],
       [48561.45 ],
       [49103.617],
       [50111.14 ],
       [52371.492],
       [55127.883],
       [55803.266]], dtype=float32)

In [19]:
# R-squared of the predictions against the actual next-day prices in the holdout window.
# NOTE(review): this pairs rows by position, so it relies on holdout_y and
# X_holdout covering the same dates in the same order - confirm.
r2_score(holdout_y['next_day_price'],pred_next_day_price)

0.8053419618810123

## The performance is not as good as the model training showed
Here, using the same holdout data as in the training notebook, the performance is worse: an R-squared of 0.805 rather than 0.971.

# Save the Scaler

In [20]:
# Persist the fitted scaling pipeline so it can be reused without refitting.
scaler_filename = "../scaler/scaler.save"
# joblib.dump does not create missing folders, so make sure the target
# directory exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)
# last expression: displays the list of written file paths
joblib.dump(scaler, scaler_filename)


['../scaler/scaler.save']

## Check that the saved scaler loads and performs as expected on predictions

In [21]:
# Reload the scaler from the file written above, to check that the saved copy works

scale = joblib.load(scaler_filename) 

In [22]:
# Scale the holdout features (Date column dropped) with the reloaded scaler
saved_scaled_holdout = scale.transform(X_holdout.iloc[: , 1:])

In [23]:
# Predict again, this time from the features scaled by the reloaded scaler
saved_scaled_preds = ann_model.predict(saved_scaled_holdout)

In [24]:
# R-squared for the reloaded-scaler predictions; should equal the earlier score
r2_score(holdout_y['next_day_price'], saved_scaled_preds)

0.8053419618810123

## Consistent performance from the saved scaler
The scaler has been saved and, when loaded back, scales the data so that the model's predictions score the same as before (R-squared of 0.805).

## View the scaler values to know what is going on

In [25]:

# Pull the fitted per-feature scale factors out of each step of the reloaded
# pipeline; the robust step was fitted on the output of the min-max step
minmax_scales = scale.named_steps.minmax.scale_
robust_scales = scale.named_steps.robust.scale_

In [26]:
# Scale factors of the min-max step (the output shows nine values, matching the nine feature columns)
minmax_scales

array([3.63821582e-02, 2.10482004e-02, 2.38714760e-02, 1.50084798e-02,
       2.40274874e-02, 1.59020548e-05, 2.23485162e-06, 4.28164288e-14,
       5.19831575e-02])

In [27]:
# Scale factors of the robust step (the output shows nine values, matching the nine feature columns)
robust_scales

array([0.00146438, 0.01236056, 0.00432074, 0.16988849, 0.0071662 ,
       0.13195934, 0.07074367, 0.35705017, 0.27083225])