# A Grammar for the Automated Visual Presentation of Computations on Data

This notebook generates visualizations using the grammar described in submission 1404 for the 2023 IEEE VIS Full Papers CFP.

## Use Case 1: Model metrics in a computational notebook

In this use case, we consider a data scientist who has built a model and is trying to communicate the quality of the model using performance metrics.  Using our Python package `specmetric`, the data scientist is able to generate visualizations of different metrics.


In [1]:
# Setup - load data and model
# Starting from https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
# Then changing one-D linear regression plot to r2 plot
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the diabetes dataset as a (features, targets) pair of arrays
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# All features are kept.  To fall back to a single feature, uncomment:
# diabetes_X = diabetes_X[:, np.newaxis, 2]

# Number of trailing samples held out for testing.  Named once so that the
# feature split and the target split below cannot drift apart.
N_TEST = 20

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-N_TEST]
diabetes_X_test = diabetes_X[-N_TEST:]
# Labelled copy of the test features; the column labels are supplied by hand.
df_X_test = pd.DataFrame(data=diabetes_X_test, columns=['age', 'sex', 'bmi', 'bp', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu'])

# Split the targets into training/testing sets
diabetes_y_train = diabetes_y[:-N_TEST]
diabetes_y_test = diabetes_y[-N_TEST:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# Calculate a baseline - always predict the mean label of training set
# (same shape and dtype as the model's predictions)
mean_train_labels_baseline = np.full_like(diabetes_y_pred, np.mean(diabetes_y_train))


In [2]:
# Point notebook to local directory to pull in specmetric
import os
import sys
from pathlib import Path
# Directory two levels above the current working directory
# (`os.path.abspath('')` resolves to the working directory).  Presumably this
# is where the local `specmetric` checkout lives -- adjust if the notebook is
# moved.  The former single-argument `os.path.join(...)` wrapper was a no-op
# and has been dropped.
module_path = os.path.dirname(os.path.dirname(os.path.abspath('')))

# Guarded so that re-running the cell does not append duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)

# Load up specmetric
from specmetric.parser import ComputationTreeParser
from specmetric.computation_tree import ComputationNode
from specmetric.renderer import AltairRenderer
from specmetric.visualization_container import VisualizationContainer


Now that the model is trained and the libraries are loaded, the data scientist calculates some metrics.  We write out these metrics explicitly as a computation tree.  In practice, it would be possible to write a function that builds this tree automatically from the Python abstract syntax tree (AST) of the scoring functions written by the data scientist.  For this use case, we assume that the AST has already been parsed and written into the DSL that specmetric expects.

In [3]:
# r2
########### BEGIN COMPUTATION GRAPH ##########
## Everything below can be extracted from the abstract syntax tree
# Inject CSS: stretch `.container` to full width and give scrolled output
# areas (`div.output_scroll`) a height of 44em.
from IPython.display import display, HTML
display(
    HTML("<style>.container { width:100% !important; }</style>"),
    HTML("<style>div.output_scroll { height: 44em; display: block;}</style>"),
)
# Optional tweak, currently disabled:
# display(HTML("<style>.output { flex-direction: row; }</style>"))

# Inputs: held-out targets, model predictions, test features, and a running
# index over the test samples.
y_i = diabetes_y_test
ids = np.arange(len(diabetes_y_test))
y_hat_i = diabetes_y_pred
X = diabetes_X_test

# Total sum of squares: squared deviations of y_i from its own mean.
y_bar_scalar = np.mean(y_i)
y_bar_vector = np.full(y_i.shape, y_bar_scalar)
y_i_minus_y_bar = y_i - y_bar_vector
y_i_minus_y_bar_squared = np.square(y_i_minus_y_bar)
ss_tot = np.sum(y_i_minus_y_bar_squared)

# Residual sum of squares: squared deviations of y_i from the predictions.
y_i_minus_y_hat_i = y_i - y_hat_i
y_i_minus_y_hat_i_squared = np.square(y_i_minus_y_hat_i)
ss_res = np.sum(y_i_minus_y_hat_i_squared)

# r2 = one - ss_res / ss_tot; the literal is named so it can be stored too.
one = 1
ss_res_ss_tot_ratio = ss_res / ss_tot
r2 = one - ss_res_ss_tot_ratio

# Every intermediate value, keyed by the name of the variable that holds it.
data_dict = dict(
    ids=ids,
    y_i=y_i,
    y_hat_i=y_hat_i,
    X=X,
    y_bar_scalar=y_bar_scalar,
    y_bar_vector=y_bar_vector,
    y_i_minus_y_hat_i=y_i_minus_y_hat_i,
    y_i_minus_y_bar=y_i_minus_y_bar,
    y_i_minus_y_hat_i_squared=y_i_minus_y_hat_i_squared,
    y_i_minus_y_bar_squared=y_i_minus_y_bar_squared,
    ss_res=ss_res,
    ss_tot=ss_tot,
    one=one,
    ss_res_ss_tot_ratio=ss_res_ss_tot_ratio,
    r2=r2,
)

# Each test feature is stored as a one-column DataFrame under its column
# label, and that label is also registered as an input variable.
data_dict.update({name: df_X_test[[name]] for name in df_X_test.columns})
input_vars = ['y_i', 'y_hat_i']
input_vars.extend(df_X_test.columns)

# Computation tree for r2 = one - ss_res / ss_tot, declared from the root down.
# As used in this cell, each ComputationNode call passes: a node name, a
# previously created node (None for the root), a node-type string, and the
# names of the values the node reads (`input_data`) and writes (`output_data`).
# Those names match keys of `data_dict`.
# NOTE(review): reading the second positional argument as "parent node" is
# inferred from usage here -- confirm against specmetric.computation_tree.
# Statement order is kept as-is in case child order depends on creation order.

# Root: r2 = one - ss_res_ss_tot_ratio
minus_scalar = ComputationNode('minus_scalar', None, 'scalar_diff', input_data=['one', 'ss_res_ss_tot_ratio'], output_data='r2')
# NOTE(review): this rebinds `one` (the int 1 assigned earlier in the cell and
# already stored in data_dict) to a node object.
one = ComputationNode('one', minus_scalar, 'scalar', input_data=[], output_data='one')
ratio = ComputationNode('ratio', minus_scalar, 'scalar_ratio', input_data=['ss_res', 'ss_tot'], output_data='ss_res_ss_tot_ratio')
# Two sums feeding the ratio: residual and total sums of squares
vector_sum_ss_res = ComputationNode('ss_res', ratio, 'vector_sum',input_data=['y_i_minus_y_hat_i_squared'], output_data='ss_res')
vector_sum_ss_tot = ComputationNode('ss_tot', ratio, 'vector_sum', input_data=['y_i_minus_y_bar_squared'], output_data='ss_tot')
# Element-wise squares feeding each sum
square_residuals = ComputationNode('square_residuals', vector_sum_ss_res, 'vector_square', input_data=['y_i_minus_y_hat_i'], output_data='y_i_minus_y_hat_i_squared')
square_variances = ComputationNode('square_variances', vector_sum_ss_tot, 'vector_square', input_data=['y_i_minus_y_bar'], output_data='y_i_minus_y_bar_squared')
# Element-wise differences feeding each square
vector_difference_residuals = ComputationNode('vector_difference_residuals', square_residuals, 'vector_diff', input_data=['y_i', 'y_hat_i'], output_data='y_i_minus_y_hat_i')
vector_difference_variances = ComputationNode('vector_difference_variances', square_variances, 'vector_diff', input_data=['y_i', 'y_bar_vector'], output_data='y_i_minus_y_bar')
# Leaves.  `y_i` is declared three times, once under each node that reads it
# (the variance difference, the mean, and the residual difference).
y_i_var_node = ComputationNode('literal_yi_var', vector_difference_variances, 'vector', output_data='y_i')
broadcast = ComputationNode('broadcast_mean', vector_difference_variances, 'broadcast', input_data=['y_bar_scalar', 'y_i'], output_data='y_bar_vector')
mean_y = ComputationNode('mean_y', broadcast, 'mean', input_data=['y_i'], output_data='y_bar_scalar')
y_i_mean_node = ComputationNode('literal_yi_mean', mean_y, 'vector', output_data='y_i')
y_i_res_node = ComputationNode('literal_yi_res', vector_difference_residuals, 'vector', output_data='y_i')
y_hat_node = ComputationNode('literal_yhat', vector_difference_residuals, 'vector', output_data='y_hat_i')

# Parse the tree starting at its root and collect the resulting
# visualization containers.
parser = ComputationTreeParser(minus_scalar)
parser.parse_computation_tree()
vis_containers = parser.visualization_containers
########### END COMPUTATION GRAPH ##########

# Hand the containers, the computed values and the input variable names to
# the Altair renderer, then display the charts it returns.
r = AltairRenderer(vis_containers, data_dict, input_vars=input_vars)
charts = r.convert_to_charts()

charts.display()

input_data[0] y_i  is not in self.encodings {}
dot plot is  alt.Chart(...)
square plot is  alt.Chart(...)
yequalsx plot is  alt.Chart(...)
dot plot is  alt.Chart(...)
square plot is  alt.Chart(...)
yequalsx plot is  alt.Chart(...)
bar chart plot is  alt.Chart(...)
bar chart spacefilling plot is  alt.Chart(...)
bar chart plot is  alt.Chart(...)
result_chart is  alt.LayerChart(...)  and curr is  alt.LayerChart(...)
result_chart is  alt.HConcatChart(...)  and curr is  alt.LayerChart(...)
result_chart is  alt.HConcatChart(...)  and curr is  alt.Chart(...)


In [4]:
# mean absolute error
########### BEGIN COMPUTATION GRAPH ##########
## Everything below can be extracted from the abstract syntax tree

# Per-sample absolute residuals and their mean (the mean absolute error).
# `y_i_minus_y_hat_i` and `data_dict` come from the r2 cell above.
abs_y_i_minus_y_hat_i = np.absolute(y_i_minus_y_hat_i)
mean_abs_error = np.mean(abs_y_i_minus_y_hat_i)
data_dict.update(
    abs_y_i_minus_y_hat_i=abs_y_i_minus_y_hat_i,
    mean_abs_error=mean_abs_error,
)

# Computation tree for the mean absolute error, declared from the root down:
#   mean_abs_error = mean(abs(y_i - y_hat_i))
# Statement order is kept as-is in case child order depends on creation order.
# NOTE(review): `vector_difference_residuals`, `y_i_res_node` and `y_hat_node`
# rebind names that the r2 cell also assigns.
mean = ComputationNode('mean', None, 'mean', input_data=['abs_y_i_minus_y_hat_i'], output_data='mean_abs_error')
abs_y_i_minus_y_hat_i_node = ComputationNode('abs_y_i_minus_y_hat_i', mean, 'vector_abs', input_data=['y_i_minus_y_hat_i'], output_data='abs_y_i_minus_y_hat_i')
vector_difference_residuals = ComputationNode('vector_difference_residuals', abs_y_i_minus_y_hat_i_node, 'vector_diff', input_data=['y_i', 'y_hat_i'], output_data='y_i_minus_y_hat_i')
# Leaves: the two input vectors of the difference
y_i_res_node = ComputationNode('literal_yi_res', vector_difference_residuals, 'vector', output_data='y_i')
y_hat_node = ComputationNode('literal_yhat', vector_difference_residuals, 'vector', output_data='y_hat_i')

# Parse the tree starting at its root and collect the resulting
# visualization containers.
parser = ComputationTreeParser(mean)
parser.parse_computation_tree()
vis_containers = parser.visualization_containers
########### END COMPUTATION GRAPH ##########
# Debug aid: call `pp()` on every visualization container (presumably a
# pretty-printer).  A plain loop is used because only the side effect is
# wanted; the previous list comprehension built a throwaway list.
for vc in vis_containers:
    vc.pp()

# Hand the containers, the computed values and the input variable names to
# the Altair renderer, then display the charts it returns.
r = AltairRenderer(vis_containers, data_dict, input_vars=input_vars)
charts = r.convert_to_charts()

# NOTE(review): the output saved with this notebook ends this cell with
# "ValueError: Only Chart objects can be concatenated.", right after the log
# line "result_chart is alt.LayerChart(...) and curr is None".  The failure
# originates in the rendering library, not in this cell's inputs; the cell
# will not complete under Restart & Run All until that is resolved.
charts.display()





mean encoding was found
dot plot is  alt.Chart(...)
 we are creating a bar plot.  I hope!
appending bar_plot  alt.Chart(...)
yequalsx plot is  alt.Chart(...)
result_chart is  alt.LayerChart(...)  and curr is  None


ValueError: Only Chart objects can be concatenated.

In [None]:
# mean squared error
########### BEGIN COMPUTATION GRAPH ##########
## Everything below can be extracted from the abstract syntax tree

# Per-sample squared residuals and their mean (the mean squared error).
# `y_i_minus_y_hat_i` and `data_dict` come from the r2 cell above.
square_y_i_minus_y_hat_i = np.square(y_i_minus_y_hat_i)
mean_square_error = np.mean(square_y_i_minus_y_hat_i)
data_dict.update(
    square_y_i_minus_y_hat_i=square_y_i_minus_y_hat_i,
    mean_square_error=mean_square_error,
)

# Computation tree for the mean squared error, declared from the root down:
#   mean_square_error = mean(square(y_i - y_hat_i))
# Statement order is kept as-is in case child order depends on creation order.
# NOTE(review): `mean`, `square_residuals`, `vector_difference_residuals`,
# `y_i_res_node` and `y_hat_node` rebind names assigned by earlier cells.
mean = ComputationNode('mean', None, 'mean', input_data=['square_y_i_minus_y_hat_i'], output_data='mean_square_error')
square_residuals = ComputationNode('square_y_i_minus_y_hat_i', mean, 'vector_square', input_data=['y_i_minus_y_hat_i'], output_data='square_y_i_minus_y_hat_i')
vector_difference_residuals = ComputationNode('vector_difference_residuals', square_residuals, 'vector_diff', input_data=['y_i', 'y_hat_i'], output_data='y_i_minus_y_hat_i')
# Leaves: the two input vectors of the difference
y_i_res_node = ComputationNode('literal_yi_res', vector_difference_residuals, 'vector', output_data='y_i')
y_hat_node = ComputationNode('literal_yhat', vector_difference_residuals, 'vector', output_data='y_hat_i')

# Parse the tree starting at its root and collect the resulting
# visualization containers.
parser = ComputationTreeParser(mean)
parser.parse_computation_tree()
vis_containers = parser.visualization_containers
########### END COMPUTATION GRAPH ##########
# Debug output, currently disabled:
# print("vis_containers is ")
# [vc.pp() for vc in vis_containers]
# Hand the containers, the computed values and the input variable names to
# the Altair renderer, then display the charts it returns.
r = AltairRenderer(vis_containers, data_dict, input_vars=input_vars)
charts = r.convert_to_charts()

charts.display()

In [None]:
# root mean squared error
########### BEGIN COMPUTATION GRAPH ##########
## Everything below can be extracted from the abstract syntax tree

# Root mean squared error: square root of `mean_square_error`.
# NOTE(review): `mean_square_error` and the 'square_y_i_minus_y_hat_i' entry
# of `data_dict` are produced by the mean-squared-error cell, so that cell
# must run before this one.
root_mean_square_error = np.sqrt(mean_square_error)
data_dict['root_mean_square_error'] = root_mean_square_error

# Computation tree, declared from the root down:
#   root_mean_square_error = sqrt(mean(square(y_i - y_hat_i)))
# Statement order is kept as-is in case child order depends on creation order.
root = ComputationNode('root', None, 'scalar_sqrt', input_data=['mean_square_error'], output_data='root_mean_square_error')
mean = ComputationNode('mean', root, 'mean', input_data=['square_y_i_minus_y_hat_i'], output_data='mean_square_error')
square_residuals = ComputationNode('square_y_i_minus_y_hat_i', mean, 'vector_square', input_data=['y_i_minus_y_hat_i'], output_data='square_y_i_minus_y_hat_i')
vector_difference_residuals = ComputationNode('vector_difference_residuals', square_residuals, 'vector_diff', input_data=['y_i', 'y_hat_i'], output_data='y_i_minus_y_hat_i')
# Leaves: the two input vectors of the difference
y_i_res_node = ComputationNode('literal_yi_res', vector_difference_residuals, 'vector', output_data='y_i')
y_hat_node = ComputationNode('literal_yhat', vector_difference_residuals, 'vector', output_data='y_hat_i')

# Parse the tree starting at its root and collect the resulting
# visualization containers.
parser = ComputationTreeParser(root)
parser.parse_computation_tree()
vis_containers = parser.visualization_containers
########### END COMPUTATION GRAPH ##########
# Debug output, currently disabled:
# print("vis_containers is ")
# [vc.pp() for vc in vis_containers]
# Hand the containers, the computed values and the input variable names to
# the Altair renderer, then display the charts it returns.
r = AltairRenderer(vis_containers, data_dict, input_vars=input_vars)
charts = r.convert_to_charts()

charts.display()

In [None]:
# mean absolute percentage error

########### BEGIN COMPUTATION GRAPH ##########
## Everything below can be extracted from the abstract syntax tree

# Floor applied to the denominator so the ratio below never divides by a
# value smaller than this.
EPSILON = 1e-1

# Denominator of the percentage error: |y_i|, floored at EPSILON.
# The absolute value is taken explicitly so the data matches both the name
# `abs_y_i` and the 'vector_abs' node that declares it; for non-negative
# targets the result is unchanged.  EPSILON is passed as a scalar and
# broadcast, which also avoids truncating it to y_i's dtype.
abs_y_i = np.maximum(np.abs(y_i), EPSILON)

# Absolute percentage error per sample, and its mean (the MAPE).
# `abs_y_i_minus_y_hat_i` is defined by the mean-absolute-error cell.
ape = abs_y_i_minus_y_hat_i / abs_y_i
mape = np.mean(ape)
data_dict['abs_y_i'] = abs_y_i
data_dict['ape'] = ape
data_dict['mape'] = mape

# Computation tree for the mean absolute percentage error, declared from the
# root down:
#   mape = mean(abs(y_i - y_hat_i) / abs(y_i))
# Statement order is kept as-is in case child order depends on creation order.
mean = ComputationNode('mean', None, 'mean', input_data=['ape'], output_data='mape')
# The ratio reads two `data_dict` entries, numerator first -- the same
# operand order the r2 cell uses for its 'scalar_ratio' node.  The previous
# list named 'abs_y_i_node', which is a Python variable, not a data key.
ape_node = ComputationNode('ape', mean, 'vector_ratio', input_data=['abs_y_i_minus_y_hat_i', 'abs_y_i'], output_data='ape')

# Numerator branch: abs(y_i - y_hat_i)
abs_y_i_minus_y_hat_i_node = ComputationNode('abs_y_i_minus_y_hat_i', ape_node, 'vector_abs', input_data=['y_i_minus_y_hat_i'], output_data='abs_y_i_minus_y_hat_i')
vector_difference_residuals = ComputationNode('vector_difference_residuals', abs_y_i_minus_y_hat_i_node, 'vector_diff', input_data=['y_i', 'y_hat_i'], output_data='y_i_minus_y_hat_i')
y_i_res_node = ComputationNode('literal_yi_res', vector_difference_residuals, 'vector', output_data='y_i')
y_hat_node = ComputationNode('literal_yhat', vector_difference_residuals, 'vector', output_data='y_hat_i')

# Denominator branch: abs(y_i)
abs_y_i_node = ComputationNode('abs_y_i', ape_node, 'vector_abs', input_data=['y_i'], output_data='abs_y_i')
y_i_den_node = ComputationNode('literal_yi_den', abs_y_i_node, 'vector', output_data='y_i')

# Parse the tree starting at its root and collect the resulting
# visualization containers.
parser = ComputationTreeParser(mean)
parser.parse_computation_tree()
vis_containers = parser.visualization_containers
########### END COMPUTATION GRAPH ##########
# Debug aid: call `pp()` on every visualization container (presumably a
# pretty-printer).  A plain loop is used because only the side effect is
# wanted; the previous list comprehension built a throwaway list.
for vc in vis_containers:
    vc.pp()

# Hand the containers, the computed values and the input variable names to
# the Altair renderer, then display the charts it returns.
r = AltairRenderer(vis_containers, data_dict, input_vars=input_vars)
charts = r.convert_to_charts()

charts.display()
