# Model Prediction using SAVIME and PYSAVIME

The goal of this notebook is to introduce the model execution and prediction feature, using the Savime system and the PySavime access API. The models and data used are the ones created in the first notebook: Part-01.

Check the variables `savime_host` and `savime_port`, which specify the host and port on which Savime is listening, respectively. We assume Savime is already running and that Tfx is listening on port 8501.

In [None]:
%load_ext autoreload
%reload_ext autoreload
%autoreload 2


import os
import sys

# Relative paths used below (e.g. data_file) resolve against the working
# directory. When no 'notebooks' entry is found in it, move one level up --
# presumably to the project root (TODO confirm the expected layout).
if 'notebooks' not in os.listdir('.'):
    current_dir = os.path.abspath(os.getcwd())
    parent_dir = os.path.dirname(current_dir)
    os.chdir(parent_dir)

# Path of the data file: a JSON storing information about
# the x and y partitions used in part-01.
data_file = 'saved_models_elastic_net/data.json'

# Host and port on which Savime is listening
savime_host = '127.0.0.1'
savime_port = 65000

Next we include the necessary modules. Note the pysavime package.

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Savime imports
import pysavime
from pysavime.util.converter import DataVariableBlockConverter
from pysavime.util.data_variable import DataVariableBlockOps

# Importing Python prediction client for Tfx
from src.predictor_consumer import PredictionConsumer
from src.util import read_numpy_array_from_disk, export_numpy_array_to_c_array

# Omitting tensorflow warnings
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

Here we load the data generated in Part-01.

In [None]:
# Load the JSON description stored in the data file
with open(data_file, 'r') as json_file:
    data = json.load(json_file)

# Every path below is built relative to the output directory named in the JSON
output_dir = data['output_dir']

# Locations of the stored x and y arrays
x_file_path = os.path.join(output_dir, data['x_file_name'])
y_file_path = os.path.join(output_dir, data['y_file_name'])

# Destinations of the exported copies that are later handed to Savime
x_c_file_path = os.path.join(output_dir, 'x_data')
y_c_file_path = os.path.join(output_dir, 'y_data')

# Read each array from disk and export it as 'float64' to its destination
x_array = read_numpy_array_from_disk(x_file_path)
export_numpy_array_to_c_array(x_array, 'float64', x_c_file_path)

y_array = read_numpy_array_from_disk(y_file_path)
export_numpy_array_to_c_array(y_array, 'float64', y_c_file_path)

print('X values:\n', x_array)
print('Y values:\n', y_array)

Now we define the queries which we will run in Savime to build the necessary structures to create and load our datasets: CREATE_DATASET, CREATE_TAR and LOAD_SUBTAR.

In [None]:
# Sizes that drive the dataset and TAR definitions
num_observations = x_array.shape[0]
num_features     = x_array.shape[1]

# y counts as a single column unless it is two-dimensional
if y_array.ndim == 2:
    y_num_columns = y_array.shape[1]
else:
    y_num_columns = 1

# Datasets backed by the exported x and y files
x_dataset = pysavime.define.file_dataset('x', x_c_file_path, 'double', length=num_features)
y_dataset = pysavime.define.file_dataset('y', y_c_file_path, 'double', length=y_num_columns)

# TAR: one implicit int32 dimension running from 1 to num_observations,
# with x and y as its attributes
index = pysavime.define.implicit_tar_dimension('index', 'int32', 1, num_observations)
x = pysavime.define.tar_attribute('x', 'double', num_features)
y = pysavime.define.tar_attribute('y', 'double', y_num_columns)
tar = pysavime.define.tar('tutorialtar', [index], [x, y])

# Subtar: spans the same index range and binds each attribute to its dataset
subtar_index = pysavime.define.ordered_subtar_dimension(index, 1, num_observations)
subtar_x = pysavime.define.subtar_attribute(x, x_dataset)
subtar_y = pysavime.define.subtar_attribute(y, y_dataset)
subtar = pysavime.define.subtar(tar, [subtar_index], [subtar_x, subtar_y])

# Print the query strings of the definitions above, one per line
for build_query in (
    x_dataset.create_query_str,
    y_dataset.create_query_str,
    tar.create_query_str,
    subtar.load_query_str,
):
    print(build_query())

Next, we run the previously defined commands on Savime:

1. We open the connection to Savime; the 'with' context closes it automatically
2. Creation of a command execution object, attached to the opened connection
3. Execution of the commands:
 1. Dataset creation
 2. Tar creation
 3. Loading the datasets into the subtar

In [None]:
# The connection is only open for the duration of the 'with' block
with pysavime.Client(host=savime_host, port=savime_port) as client:
    # Create both datasets and the TAR, in this order
    for definition in (x_dataset, y_dataset, tar):
        client.execute(pysavime.operator.create(definition))

    # Then load the subtar
    client.execute(pysavime.operator.load(subtar))

Next, for each saved model, we get the prediction's mean squared error on the data domain. To do so, we
execute the following steps:

1. Register the model on the system: `pysavime.operator.register_model`
2. Execute the predictive query: `pysavime.operator.predict`
3. We calculate the squared difference between the query output and the true y value:
`pysavime.operator.derive`
4. From this value, we calculate the mean squared error: `pysavime.operator.aggregate`

In [None]:
# Result of the aggregate query for each model, keyed by model name
mse = {}
registered_models = data['iid']

with pysavime.Client(host=savime_host, port=savime_port) as client:
    # dim_spec specifies the size of the predictive query window.
    # It's a list of pairs, in which the first element specifies the dimension,
    # and the second element specifies the number of observations
    dim_spec = [(index.name, num_observations)]

    # Only the model names are needed here; the values stored under them
    # in registered_models are not used.
    for model_name in registered_models:
        # A model is registered in Savime, i.e., we associate it with a Tar,
        # identify what is the input attribute and the format of the
        # multidimensional input array. In this case, we are sending the
        # complete observations array, but it's also possible to predict only
        # a section of it

        # Register the model that we will use
        register_cmd = pysavime.operator.register_model(
            model_name=model_name,
            model_tar=tar.name,
            input_attribute=x.name,
            dim_specification=dim_spec,
        )
        client.execute(register_cmd)

        # Calculate the mean squared error: predict, derive the squared
        # difference to the true y value, then average it
        predict_cmd = pysavime.operator.predict(tar=tar.name, model_name=model_name, input_attribute=x.name)
        derive_cmd = pysavime.operator.derive(predict_cmd, 'squared_difference', '(op_result - y)^2')
        aggregate_cmd = pysavime.operator.aggregate(derive_cmd, 'avg', 'squared_difference', 'mse')
        print(aggregate_cmd)
        mse[model_name] = client.execute(aggregate_cmd)

Next, we collect each model's error and build a data frame.

In [None]:
# Show one raw query result as a sample: the most recently inserted entry of
# mse. It is read from mse itself rather than through a loop variable left
# over from another cell, and skipped when no result was collected.
if mse:
    print(list(mse.values())[-1])

# Extract one value per model from its query result
# (presumably the scalar MSE sits at [0].attrs['mse'][0][0] -- TODO confirm
# against the pysavime response format)
d = {name: result[0].attrs['mse'][0][0] for name, result in mse.items()}

# One row per model; the single value column is named 0
df = pd.DataFrame.from_dict(d, orient='index')
print(d)

Finally, we display each model's mean squared error. 
Note that model 25 exhibits the best results, since it was trained on points from
all of the partitions of the domain.

In [None]:
# Sort the rows by the integer that follows the last underscore of each row label
model_ids = df.index.to_series().apply(lambda name: int(name.rsplit('_', 1)[-1]))
df = df.assign(index=model_ids).sort_values('index')

# Bar chart of column 0 against that integer, on a 10 x 6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='index', y=0, data=df, ax=ax, color='darkblue')

# Rotate the tick labels and name both axes
plt.xticks(rotation=90)
ax.set_xlabel('Models')
_ = ax.set_ylabel('MSE')