# Schedule TPC-DS1 Statistical Recommendation Evaluation

This experiment is intended to quantify the statistical recommendation technique through a comparison of two query streams. The query streams are denoted as follows:

* Prior Stream - Denotes a sequence of baseline query plans, against which comparison will be made.
* Upcoming Stream - Denotes a sequence of upcoming query plans. Queries found within the upcoming stream mirror those established in the Prior Stream, with a number of exceptions. These exceptions are considered as query variants, and contain a degree of change from the original queries taken from the prior stream.

Query variants are listed below, and are therefore eligible to be flagged during the evaluation phase:

* Query 5  
* Query 10
* Query 14
* Query 18
* Query 22
* Query 27
* Query 35
* Query 36
* Query 51
* Query 67
* Query 70
* Query 77
* Query 80
* Query 86

In [None]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib.pyplot as plt
# sklearn
import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
#
# AnyTree
from anytree import Node, RenderTree, PostOrderIter

### Configuration Cell

Tweak the parameters in this cell to influence the outcome of the experiment.

In [None]:
# Experiment Config
tpcds = 'TPCDS1'  # Schema upon which to operate test
test_split = 0.2
# Plan columns listed as y labels for the experiment.
y_labels = [
    'COST',
    'CARDINALITY',
    'BYTES',
    'CPU_COST',
    'IO_COST',
    'TEMP_SPACE',
    'TIME',
]
# Columns which will be ignored during type conversion, and later used for aggregation
black_list = [
    'TIMESTAMP',
    'SQL_ID',
    'OPERATION',
    'OPTIONS',
    'OBJECT_NAME',
    'OBJECT_OWNER',
    'PARTITION_STOP',
    'PARTITION_START',
]
nrows = 10000  # Upper bound on the number of rows read from the plan CSV

### Read data from file into pandas dataframes

In [None]:
# Root path
base_dir = 'C:/Users/gabriel.sammut/University/'
# Alternative location: base_dir = 'D:/Projects/ICS5200/'
root_dir = '{}Data_ICS5200/Schedule/{}'.format(base_dir, tpcds)
src_dir = '{}ICS5200/src/sql/Runtime/TPC-DS/{}/Variants/'.format(base_dir, tpcds)

rep_vsql_plan_path = '/'.join([root_dir, 'rep_vsql_plan.csv'])

# Numeric plan metrics are parsed as 64-bit integers; the two descriptive
# columns are forced to strings.
dtype = dict.fromkeys(
    ('COST', 'CARDINALITY', 'BYTES', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME'),
    'int64')
dtype.update({'OPERATION': 'str', 'OBJECT_NAME': 'str'})

# Only the first `nrows` rows of the file are loaded.
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path, dtype=dtype, nrows=nrows)
print(rep_vsql_plan_df.head())
#
def prettify_header(headers):
    """
    Cleans header list from unwanted character strings.

    Every occurrence of "(", ")", "'" and "," is removed from each header.

    :param headers: iterable of header strings
    :return: new list holding the cleaned headers, in input order
    """
    # Build the result directly rather than appending from inside a
    # comprehension that is evaluated only for its side effect.
    return [
        header.replace("(", "").replace(")", "").replace("'", "").replace(",", "")
        for header in headers
    ]
#
# Swap the raw column labels for their cleaned counterparts, then show them.
cleaned_columns = prettify_header(rep_vsql_plan_df.columns.values)
rep_vsql_plan_df.columns = cleaned_columns
print('------------------------------------------')
print(rep_vsql_plan_df.columns)

### Read outlier data from file into pandas dataframes and concatenate

In [None]:
#
# CSV Outlier Paths
# Each variant family (hints / predicates / rownum) has its per-query CSV
# files under '<src_dir><family>/output/'.
hints_output_dir = src_dir + 'hints/output/'
predicates_output_dir = src_dir + 'predicates/output/'
rownum_output_dir = src_dir + 'rownum/output/'
#
# Hint variants
outlier_hints_q5_path = hints_output_dir + 'query_5.csv'
outlier_hints_q10_path = hints_output_dir + 'query_10.csv'
outlier_hints_q14_path = hints_output_dir + 'query_14.csv'
outlier_hints_q18_path = hints_output_dir + 'query_18.csv'
outlier_hints_q22_path = hints_output_dir + 'query_22.csv'
outlier_hints_q27_path = hints_output_dir + 'query_27.csv'
outlier_hints_q35_path = hints_output_dir + 'query_35.csv'
outlier_hints_q36_path = hints_output_dir + 'query_36.csv'
outlier_hints_q51_path = hints_output_dir + 'query_51.csv'
outlier_hints_q67_path = hints_output_dir + 'query_67.csv'
outlier_hints_q70_path = hints_output_dir + 'query_70.csv'
outlier_hints_q77_path = hints_output_dir + 'query_77.csv'
outlier_hints_q80_path = hints_output_dir + 'query_80.csv'
outlier_hints_q86_path = hints_output_dir + 'query_86.csv'
#
# Predicate variants
outlier_predicates_q5_path = predicates_output_dir + 'query_5.csv'
outlier_predicates_q10_path = predicates_output_dir + 'query_10.csv'
outlier_predicates_q14_path = predicates_output_dir + 'query_14.csv'
outlier_predicates_q18_path = predicates_output_dir + 'query_18.csv'
outlier_predicates_q22_path = predicates_output_dir + 'query_22.csv'
outlier_predicates_q27_path = predicates_output_dir + 'query_27.csv'
outlier_predicates_q35_path = predicates_output_dir + 'query_35.csv'
outlier_predicates_q36_path = predicates_output_dir + 'query_36.csv'
outlier_predicates_q51_path = predicates_output_dir + 'query_51.csv'
outlier_predicates_q67_path = predicates_output_dir + 'query_67.csv'
outlier_predicates_q70_path = predicates_output_dir + 'query_70.csv'
outlier_predicates_q77_path = predicates_output_dir + 'query_77.csv'
outlier_predicates_q80_path = predicates_output_dir + 'query_80.csv'
outlier_predicates_q86_path = predicates_output_dir + 'query_86.csv'
#
# Rownum variants
outlier_rownum_q5_path = rownum_output_dir + 'query_5.csv'
outlier_rownum_q10_path = rownum_output_dir + 'query_10.csv'
outlier_rownum_q14_path = rownum_output_dir + 'query_14.csv'
outlier_rownum_q18_path = rownum_output_dir + 'query_18.csv'
outlier_rownum_q22_path = rownum_output_dir + 'query_22.csv'
outlier_rownum_q27_path = rownum_output_dir + 'query_27.csv'
outlier_rownum_q35_path = rownum_output_dir + 'query_35.csv'
outlier_rownum_q36_path = rownum_output_dir + 'query_36.csv'
outlier_rownum_q51_path = rownum_output_dir + 'query_51.csv'
outlier_rownum_q67_path = rownum_output_dir + 'query_67.csv'
outlier_rownum_q70_path = rownum_output_dir + 'query_70.csv'
outlier_rownum_q77_path = rownum_output_dir + 'query_77.csv'
outlier_rownum_q80_path = rownum_output_dir + 'query_80.csv'
outlier_rownum_q86_path = rownum_output_dir + 'query_86.csv'
#
# Read CSV Paths
def _read_outlier_csv(csv_path):
    """Load one outlier CSV with every column read as a string."""
    return pd.read_csv(csv_path, dtype=str)
#
# Hint variants
outlier_hints_q5_df = _read_outlier_csv(outlier_hints_q5_path)
outlier_hints_q10_df = _read_outlier_csv(outlier_hints_q10_path)
outlier_hints_q14_df = _read_outlier_csv(outlier_hints_q14_path)
outlier_hints_q18_df = _read_outlier_csv(outlier_hints_q18_path)
outlier_hints_q22_df = _read_outlier_csv(outlier_hints_q22_path)
outlier_hints_q27_df = _read_outlier_csv(outlier_hints_q27_path)
outlier_hints_q35_df = _read_outlier_csv(outlier_hints_q35_path)
outlier_hints_q36_df = _read_outlier_csv(outlier_hints_q36_path)
outlier_hints_q51_df = _read_outlier_csv(outlier_hints_q51_path)
outlier_hints_q67_df = _read_outlier_csv(outlier_hints_q67_path)
outlier_hints_q70_df = _read_outlier_csv(outlier_hints_q70_path)
outlier_hints_q77_df = _read_outlier_csv(outlier_hints_q77_path)
outlier_hints_q80_df = _read_outlier_csv(outlier_hints_q80_path)
outlier_hints_q86_df = _read_outlier_csv(outlier_hints_q86_path)
#
# Predicate variants
outlier_predicates_q5_df = _read_outlier_csv(outlier_predicates_q5_path)
outlier_predicates_q10_df = _read_outlier_csv(outlier_predicates_q10_path)
outlier_predicates_q14_df = _read_outlier_csv(outlier_predicates_q14_path)
outlier_predicates_q18_df = _read_outlier_csv(outlier_predicates_q18_path)
outlier_predicates_q22_df = _read_outlier_csv(outlier_predicates_q22_path)
outlier_predicates_q27_df = _read_outlier_csv(outlier_predicates_q27_path)
outlier_predicates_q35_df = _read_outlier_csv(outlier_predicates_q35_path)
outlier_predicates_q36_df = _read_outlier_csv(outlier_predicates_q36_path)
outlier_predicates_q51_df = _read_outlier_csv(outlier_predicates_q51_path)
outlier_predicates_q67_df = _read_outlier_csv(outlier_predicates_q67_path)
outlier_predicates_q70_df = _read_outlier_csv(outlier_predicates_q70_path)
outlier_predicates_q77_df = _read_outlier_csv(outlier_predicates_q77_path)
outlier_predicates_q80_df = _read_outlier_csv(outlier_predicates_q80_path)
outlier_predicates_q86_df = _read_outlier_csv(outlier_predicates_q86_path)
#
# Rownum variants
outlier_rownum_q5_df = _read_outlier_csv(outlier_rownum_q5_path)
outlier_rownum_q10_df = _read_outlier_csv(outlier_rownum_q10_path)
outlier_rownum_q14_df = _read_outlier_csv(outlier_rownum_q14_path)
outlier_rownum_q18_df = _read_outlier_csv(outlier_rownum_q18_path)
outlier_rownum_q22_df = _read_outlier_csv(outlier_rownum_q22_path)
outlier_rownum_q27_df = _read_outlier_csv(outlier_rownum_q27_path)
outlier_rownum_q35_df = _read_outlier_csv(outlier_rownum_q35_path)
outlier_rownum_q36_df = _read_outlier_csv(outlier_rownum_q36_path)
outlier_rownum_q51_df = _read_outlier_csv(outlier_rownum_q51_path)
outlier_rownum_q67_df = _read_outlier_csv(outlier_rownum_q67_path)
outlier_rownum_q70_df = _read_outlier_csv(outlier_rownum_q70_path)
outlier_rownum_q77_df = _read_outlier_csv(outlier_rownum_q77_path)
outlier_rownum_q80_df = _read_outlier_csv(outlier_rownum_q80_path)
outlier_rownum_q86_df = _read_outlier_csv(outlier_rownum_q86_path)
#
# Merge dataframes into a single pandas matrix (one per variant family).
# Each family is concatenated in a single call, in query order, so the rows
# are stacked once instead of being re-copied on every pairwise concat.
# sort=False keeps the columns in their order of first appearance.
df_hints_outliers = pd.concat(
    [
        outlier_hints_q5_df,
        outlier_hints_q10_df,
        outlier_hints_q14_df,
        outlier_hints_q18_df,
        outlier_hints_q22_df,
        outlier_hints_q27_df,
        outlier_hints_q35_df,
        outlier_hints_q36_df,
        outlier_hints_q51_df,
        outlier_hints_q67_df,
        outlier_hints_q70_df,
        outlier_hints_q77_df,
        outlier_hints_q80_df,
        outlier_hints_q86_df,
    ],
    sort=False,
)
#
# NOTE: the q5/q10 frames are named `outlier_predicates_*` (plural) like the
# rest of the family; the singular spelling used previously was undefined
# and raised a NameError.
df_predicate_outliers = pd.concat(
    [
        outlier_predicates_q5_df,
        outlier_predicates_q10_df,
        outlier_predicates_q14_df,
        outlier_predicates_q18_df,
        outlier_predicates_q22_df,
        outlier_predicates_q27_df,
        outlier_predicates_q35_df,
        outlier_predicates_q36_df,
        outlier_predicates_q51_df,
        outlier_predicates_q67_df,
        outlier_predicates_q70_df,
        outlier_predicates_q77_df,
        outlier_predicates_q80_df,
        outlier_predicates_q86_df,
    ],
    sort=False,
)
#
df_rownum_outliers = pd.concat(
    [
        outlier_rownum_q5_df,
        outlier_rownum_q10_df,
        outlier_rownum_q14_df,
        outlier_rownum_q18_df,
        outlier_rownum_q22_df,
        outlier_rownum_q27_df,
        outlier_rownum_q35_df,
        outlier_rownum_q36_df,
        outlier_rownum_q51_df,
        outlier_rownum_q67_df,
        outlier_rownum_q70_df,
        outlier_rownum_q77_df,
        outlier_rownum_q80_df,
        outlier_rownum_q86_df,
    ],
    sort=False,
)
#
# Show the shape and the first rows of each merged outlier frame, with a
# separator line between consecutive frames.
merged_outlier_frames = (df_hints_outliers, df_predicate_outliers, df_rownum_outliers)
for frame_position, outlier_frame in enumerate(merged_outlier_frames):
    if frame_position > 0:
        print('------------------------------------------')
    print(outlier_frame.shape)
    print(outlier_frame.head())