# Schedule TPC-DS100 Plan Comparison (Evaluation)

This experiment is intended at quantifying the statistical recommendation technique, through comparison of two query streams. The original query template is compared with the following variants:

* Hint Variants
* Predicate Variants
* Rownum Variants

Query variants are denoted below:

* Query 5  
* Query 10
* Query 14
* Query 18
* Query 22
* Query 27
* Query 35
* Query 36
* Query 51
* Query 67
* Query 70
* Query 77
* Query 80
* Query 86



In [15]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib.pyplot as plt
# sklearn
import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
#
# AnyTree
from anytree import Node, RenderTree, PostOrderIter

pandas: 0.24.1
numpy: 1.16.1


### Configuration Cell

Tweak parametric changes from this cell to influence outcome of experiment

In [16]:
# Experiment Config
tpcds='TPCDS100' # Schema upon which to operate test
test_split=.2
y_labels = ['COST',
            'CARDINALITY',
            'BYTES',
            'CPU_COST',
            'IO_COST',
            'TEMP_SPACE',
            'TIME']
black_list = ['TIMESTAMP',
              'SQL_ID',
              'TIME',
              'IO_COST',
              'OPERATION',
              'OPTIONS',
              'OBJECT_NAME',
              'OBJECT_OWNER',
              'OBJECT_TYPE',
              'PARTITION_STOP',
              'PARTITION_START',
              'CPU_COST'] # Columns which will be ignored during type conversion, and later used for aggregation
nrows = 10000

### Read data from file into pandas dataframes

In [17]:
# Root path
base_dir = 'C:/Users/gabriel.sammut/University/'
#base_dir = 'D:/Projects/'
root_dir = base_dir + 'Data_ICS5200/Schedule/' + tpcds
src_dir = base_dir + 'ICS5200/src/sql/Runtime/TPC-DS/' + tpcds + '/Variants/'

dtype={'COST':'int64',
       'CARDINALITY':'int64',
       'BYTES':'int64',
       'CPU_COST':'int64',
       'IO_COST':'int64',
       'TEMP_SPACE':'int64',
       'TIME':'int64',
       'OPERATION':'str',
       'OBJECT_NAME':'str'}

### Read outlier data from file into pandas dataframes and concatenate

In [18]:
#
# CSV Outlier Paths
outlier_hints_q5_path = src_dir + 'hints/output/query_5.csv'
outlier_hints_q10_path = src_dir + 'hints/output/query_10.csv'
outlier_hints_q14_path = src_dir + 'hints/output/query_14.csv'
outlier_hints_q18_path = src_dir + 'hints/output/query_18.csv'
outlier_hints_q22_path = src_dir + 'hints/output/query_22.csv'
outlier_hints_q27_path = src_dir + 'hints/output/query_27.csv'
outlier_hints_q35_path = src_dir + 'hints/output/query_35.csv'
outlier_hints_q36_path = src_dir + 'hints/output/query_36.csv'
outlier_hints_q51_path = src_dir + 'hints/output/query_51.csv'
outlier_hints_q67_path = src_dir + 'hints/output/query_67.csv'
outlier_hints_q70_path = src_dir + 'hints/output/query_70.csv'
outlier_hints_q77_path = src_dir + 'hints/output/query_77.csv'
outlier_hints_q80_path = src_dir + 'hints/output/query_80.csv'
outlier_hints_q86_path = src_dir + 'hints/output/query_86.csv'
#
outlier_predicates_q5_path = src_dir + 'predicates/output/query_5.csv'
outlier_predicates_q10_path = src_dir + 'predicates/output/query_10.csv'
outlier_predicates_q14_path = src_dir + 'predicates/output/query_14.csv'
outlier_predicates_q18_path = src_dir + 'predicates/output/query_18.csv'
outlier_predicates_q22_path = src_dir + 'predicates/output/query_22.csv'
outlier_predicates_q27_path = src_dir + 'predicates/output/query_27.csv'
outlier_predicates_q35_path = src_dir + 'predicates/output/query_35.csv'
outlier_predicates_q36_path = src_dir + 'predicates/output/query_36.csv'
outlier_predicates_q51_path = src_dir + 'predicates/output/query_51.csv'
outlier_predicates_q67_path = src_dir + 'predicates/output/query_67.csv'
outlier_predicates_q70_path = src_dir + 'predicates/output/query_70.csv'
outlier_predicates_q77_path = src_dir + 'predicates/output/query_77.csv'
outlier_predicates_q80_path = src_dir + 'predicates/output/query_80.csv'
outlier_predicates_q86_path = src_dir + 'predicates/output/query_86.csv'
#
outlier_rownum_q5_path = src_dir + 'rownum/output/query_5.csv'
outlier_rownum_q10_path = src_dir + 'rownum/output/query_10.csv'
outlier_rownum_q14_path = src_dir + 'rownum/output/query_14.csv'
outlier_rownum_q18_path = src_dir + 'rownum/output/query_18.csv'
outlier_rownum_q22_path = src_dir + 'rownum/output/query_22.csv'
outlier_rownum_q27_path = src_dir + 'rownum/output/query_27.csv'
outlier_rownum_q35_path = src_dir + 'rownum/output/query_35.csv'
outlier_rownum_q36_path = src_dir + 'rownum/output/query_36.csv'
outlier_rownum_q51_path = src_dir + 'rownum/output/query_51.csv'
outlier_rownum_q67_path = src_dir + 'rownum/output/query_67.csv'
outlier_rownum_q70_path = src_dir + 'rownum/output/query_70.csv'
outlier_rownum_q77_path = src_dir + 'rownum/output/query_77.csv'
outlier_rownum_q80_path = src_dir + 'rownum/output/query_80.csv'
outlier_rownum_q86_path = src_dir + 'rownum/output/query_86.csv'
#
outlier_original_q5_path = src_dir + 'original/output/query_5.csv'
outlier_original_q10_path = src_dir + 'original/output/query_10.csv'
outlier_original_q14_path = src_dir + 'original/output/query_14.csv'
outlier_original_q18_path = src_dir + 'original/output/query_18.csv'
outlier_original_q22_path = src_dir + 'original/output/query_22.csv'
outlier_original_q27_path = src_dir + 'original/output/query_27.csv'
outlier_original_q35_path = src_dir + 'original/output/query_35.csv'
outlier_original_q36_path = src_dir + 'original/output/query_36.csv'
outlier_original_q51_path = src_dir + 'original/output/query_51.csv'
outlier_original_q67_path = src_dir + 'original/output/query_67.csv'
outlier_original_q70_path = src_dir + 'original/output/query_70.csv'
outlier_original_q77_path = src_dir + 'original/output/query_77.csv'
outlier_original_q80_path = src_dir + 'original/output/query_80.csv'
outlier_original_q86_path = src_dir + 'original/output/query_86.csv'
#
# Read CSV Paths
outlier_hints_q5_df = pd.read_csv(outlier_hints_q5_path,dtype=str)
outlier_hints_q10_df = pd.read_csv(outlier_hints_q10_path,dtype=str)
outlier_hints_q14_df = pd.read_csv(outlier_hints_q14_path,dtype=str)
outlier_hints_q18_df = pd.read_csv(outlier_hints_q18_path,dtype=str)
outlier_hints_q22_df = pd.read_csv(outlier_hints_q22_path,dtype=str)
outlier_hints_q27_df = pd.read_csv(outlier_hints_q27_path,dtype=str)
outlier_hints_q35_df = pd.read_csv(outlier_hints_q35_path,dtype=str)
outlier_hints_q36_df = pd.read_csv(outlier_hints_q36_path,dtype=str)
outlier_hints_q51_df = pd.read_csv(outlier_hints_q51_path,dtype=str)
outlier_hints_q67_df = pd.read_csv(outlier_hints_q67_path,dtype=str)
outlier_hints_q70_df = pd.read_csv(outlier_hints_q70_path,dtype=str)
outlier_hints_q77_df = pd.read_csv(outlier_hints_q77_path,dtype=str)
outlier_hints_q80_df = pd.read_csv(outlier_hints_q80_path,dtype=str)
outlier_hints_q86_df = pd.read_csv(outlier_hints_q86_path,dtype=str)
#
outlier_predicates_q5_df = pd.read_csv(outlier_predicates_q5_path,dtype=str)
outlier_predicates_q10_df = pd.read_csv(outlier_predicates_q10_path,dtype=str)
outlier_predicates_q14_df = pd.read_csv(outlier_predicates_q14_path,dtype=str)
outlier_predicates_q18_df = pd.read_csv(outlier_predicates_q18_path,dtype=str)
outlier_predicates_q22_df = pd.read_csv(outlier_predicates_q22_path,dtype=str)
outlier_predicates_q27_df = pd.read_csv(outlier_predicates_q27_path,dtype=str)
outlier_predicates_q35_df = pd.read_csv(outlier_predicates_q35_path,dtype=str)
outlier_predicates_q36_df = pd.read_csv(outlier_predicates_q36_path,dtype=str)
outlier_predicates_q51_df = pd.read_csv(outlier_predicates_q51_path,dtype=str)
outlier_predicates_q67_df = pd.read_csv(outlier_predicates_q67_path,dtype=str)
outlier_predicates_q70_df = pd.read_csv(outlier_predicates_q70_path,dtype=str)
outlier_predicates_q77_df = pd.read_csv(outlier_predicates_q77_path,dtype=str)
outlier_predicates_q80_df = pd.read_csv(outlier_predicates_q80_path,dtype=str)
outlier_predicates_q86_df = pd.read_csv(outlier_predicates_q86_path,dtype=str)
#
outlier_rownum_q5_df = pd.read_csv(outlier_rownum_q5_path,dtype=str)
outlier_rownum_q10_df = pd.read_csv(outlier_rownum_q10_path,dtype=str)
outlier_rownum_q14_df = pd.read_csv(outlier_rownum_q14_path,dtype=str)
outlier_rownum_q18_df = pd.read_csv(outlier_rownum_q18_path,dtype=str)
outlier_rownum_q22_df = pd.read_csv(outlier_rownum_q22_path,dtype=str)
outlier_rownum_q27_df = pd.read_csv(outlier_rownum_q27_path,dtype=str)
outlier_rownum_q35_df = pd.read_csv(outlier_rownum_q35_path,dtype=str)
outlier_rownum_q36_df = pd.read_csv(outlier_rownum_q36_path,dtype=str)
outlier_rownum_q51_df = pd.read_csv(outlier_rownum_q51_path,dtype=str)
outlier_rownum_q67_df = pd.read_csv(outlier_rownum_q67_path,dtype=str)
outlier_rownum_q70_df = pd.read_csv(outlier_rownum_q70_path,dtype=str)
outlier_rownum_q77_df = pd.read_csv(outlier_rownum_q77_path,dtype=str)
outlier_rownum_q80_df = pd.read_csv(outlier_rownum_q80_path,dtype=str)
outlier_rownum_q86_df = pd.read_csv(outlier_rownum_q86_path,dtype=str)
#
outlier_original_q5_df = pd.read_csv(outlier_original_q5_path,dtype=str)
outlier_original_q10_df = pd.read_csv(outlier_original_q10_path,dtype=str)
outlier_original_q14_df = pd.read_csv(outlier_original_q14_path,dtype=str)
outlier_original_q18_df = pd.read_csv(outlier_original_q18_path,dtype=str)
outlier_original_q22_df = pd.read_csv(outlier_original_q22_path,dtype=str)
outlier_original_q27_df = pd.read_csv(outlier_original_q27_path,dtype=str)
outlier_original_q35_df = pd.read_csv(outlier_original_q35_path,dtype=str)
outlier_original_q36_df = pd.read_csv(outlier_original_q36_path,dtype=str)
outlier_original_q51_df = pd.read_csv(outlier_original_q51_path,dtype=str)
outlier_original_q67_df = pd.read_csv(outlier_original_q67_path,dtype=str)
outlier_original_q70_df = pd.read_csv(outlier_original_q70_path,dtype=str)
outlier_original_q77_df = pd.read_csv(outlier_original_q77_path,dtype=str)
outlier_original_q80_df = pd.read_csv(outlier_original_q80_path,dtype=str)
outlier_original_q86_df = pd.read_csv(outlier_original_q86_path,dtype=str)
#
# Merge dataframes into a single pandas matrix
df_hints = pd.concat([outlier_hints_q5_df, outlier_hints_q10_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q14_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q18_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q22_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q27_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q35_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q36_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q51_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q67_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q70_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q77_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q80_df], sort=False)
df_hints = pd.concat([df_hints, outlier_hints_q86_df], sort=False)
#
df_predicates = pd.concat([outlier_predicates_q5_df, outlier_predicates_q10_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q14_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q18_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q22_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q27_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q35_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q36_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q51_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q67_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q70_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q77_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q80_df], sort=False)
df_predicates = pd.concat([df_predicates, outlier_predicates_q86_df], sort=False)
#
df_rownum = pd.concat([outlier_rownum_q5_df, outlier_rownum_q10_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q10_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q14_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q18_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q22_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q27_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q35_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q36_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q51_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q67_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q70_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q77_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q80_df], sort=False)
df_rownum = pd.concat([df_rownum, outlier_rownum_q86_df], sort=False)
#
df_original = pd.concat([outlier_original_q5_df, outlier_original_q10_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q14_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q18_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q22_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q27_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q35_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q36_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q51_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q67_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q70_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q77_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q80_df], sort=False)
df_original = pd.concat([df_original, outlier_original_q86_df], sort=False)
#
print('Hint Outliers')
print(df_hints.shape)
print(df_hints.head())
print(df_hints.columns)
print('------------------------------------------')
print('Predicate Outliers')
print(df_predicates.shape)
print(df_predicates.head())
print(df_predicates.columns)
print('------------------------------------------')
print('Rownum Outliers')
print(df_rownum.shape)
print(df_rownum.head())
print(df_rownum.columns)
print('------------------------------------------')
print('Original')
print(df_original.shape)
print(df_original.head())
print(df_original.columns)

Hint Outliers
(461, 35)
  PLAN_ID            TIMESTAMP REMARKS         OPERATION          OPTIONS  \
0   12447  11/20/2018 09:56:46     NaN  SELECT STATEMENT              NaN   
1   12447  11/20/2018 09:56:46     NaN             COUNT          STOPKEY   
2   12447  11/20/2018 09:56:46     NaN              VIEW              NaN   
3   12447  11/20/2018 09:56:46     NaN              SORT  GROUP BY ROLLUP   
4   12447  11/20/2018 09:56:46     NaN              VIEW              NaN   

  OBJECT_NODE OBJECT_OWNER OBJECT_NAME                OBJECT_ALIAS  \
0         NaN          NaN         NaN                         NaN   
1         NaN          NaN         NaN                         NaN   
2         NaN     TPCDS100         NaN  from$_subquery$_018@SEL$11   
3         NaN          NaN         NaN                         NaN   
4         NaN     TPCDS100         NaN                    X@SEL$12   

  OBJECT_INSTANCE  ...                                          OTHER_XML  \
0             N

### Read Evaluation Files

These files consist of manually flagged optimum access paths for each of the query variants. These will be used as correct baselines, and will be used to gauge of the proposed method.

In [19]:
eval_hint_path = src_dir + 'hints/evaluation.csv'
eval_predicates_path = src_dir + 'predicates/evaluation.csv'
eval_rownum_path = src_dir + 'rownum/evaluation.csv'

# Read CSV Paths
eval_hint_df = pd.read_csv(eval_hint_path,dtype=str)
eval_predicates_df = pd.read_csv(eval_predicates_path,dtype=str)
eval_rownum_df = pd.read_csv(eval_rownum_path,dtype=str)

# Structure evaluation metrics into list of lists
def convert_pandas_to_dict(df):
    eval_dict = {}
    for i in range(1, 15):
        temp_df = df[df['id'] == str(i)]
        temp_list = []
        for index, row in temp_df.iterrows():
            temp_list.append([row['object_type'],row['object_name']])
        eval_dict[i] = temp_list
    return eval_dict

eval_hint_dict = convert_pandas_to_dict(df=eval_hint_df)
eval_predicates_dict = convert_pandas_to_dict(df=eval_predicates_df)
eval_rownum_dict = convert_pandas_to_dict(df=eval_rownum_df)
print(eval_hint_dict)
print('-'*30)
print(eval_predicates_dict)
print('-'*30)
print(eval_rownum_dict)

{1: [['INDEX', 'SS_SOLD_TIME_SK_INDEX'], ['TABLE', 'STORE_SALES'], ['TABLE', 'STORE_RETURNS']], 2: [['INDEX', 'C_CURRENT_ADDR_SK_INDEX'], ['TABLE', 'CUSTOMER'], ['TABLE', 'CUSTOMER ADDRESS']], 3: [['INDEX', 'SS_SOLD_DATE_SK_INDEX'], ['TABLE', 'DATE_DIM']], 4: [['TABLE', 'CUSTOMER_ADDRESS'], ['TABLE', 'CATALOG_SALES'], ['INDEX', 'CS_SOLD_DATE_SK_INDEX'], ['TABLE', 'CUSTOMER']], 5: [['TABLE', 'DATE_DIM'], ['TABLE', 'INVENTORY'], ['TABLE', 'ITEM']], 6: [['TABLE', 'CUSTOMER_DEMOGRAPHICS'], ['TABLE', 'STORE_SALES'], ['INDEX', 'SS_SOLD_DATE_SK_INDEX'], ['TABLE', 'DATE_DIM']], 7: [['TABLE', 'DATE_DIM'], ['INDEX', 'CS_SOLD_DATE_SK_INDEX'], ['TABLE', 'CATALOG_SALES']], 8: [['TABLE', 'STORE'], ['TABLE', 'ITEM'], ['TABLE', 'DATE_DIM'], ['INDEX', 'SS_SOLD_DATE_SK_INDEX'], ['TABLE', 'STORE_SALES']], 9: [['INDEX', 'WS_SOLD_DATE_SK_INDEX'], ['TABLE', 'WEB_SALES'], ['TABLE', 'DATE_DIM']], 10: [['TABLE', 'DATE_DIM'], ['INDEX', 'SS_SOLD_DATE_SK_INDEX']], 11: [['INDEX', 'SS_SOLD_DATE_SK_INDEX'], ['TABLE'

### Dealing with empty values

In [20]:
def get_na_columns(df, headers):
    """
    Return columns which consist of NAN values
    """
    na_list = []
    for head in headers:
        if df[head].isnull().values.any():
            na_list.append(head)
    return na_list

print('N/A Columns\n')
print('Original Features ' + str(len(df_original.columns)) + ': ' + str(get_na_columns(df=df_original,headers=df_original.columns)) + "\n")
print('Rownum Features ' + str(len(df_rownum.columns)) + ': ' + str(get_na_columns(df=df_rownum,headers=df_rownum.columns)) + "\n")
print('Predicates Features ' + str(len(df_predicates.columns)) + ': ' + str(get_na_columns(df=df_predicates,headers=df_predicates.columns)) + "\n")
print('Hints Features ' + str(len(df_hints.columns)) + ': ' + str(get_na_columns(df=df_hints,headers=df_hints.columns)) + "\n")

def fill_na(df):
    """
    Replaces NA columns with 0s
    """
    return df.fillna(0)

# Populating NaN values with amount '0'
df_original = fill_na(df=df_original)
df_rownum = fill_na(df=df_rownum)
df_predicates = fill_na(df=df_predicates)
df_hints = fill_na(df=df_hints)

N/A Columns

Original Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'OPTIMIZER', 'SEARCH_COLUMNS', 'PARENT_ID', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'OTHER_XML', 'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME']

Rownum Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'OPTIMIZER', 'SEARCH_COLUMNS', 'PARENT_ID', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'OTHER_XML', 'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME']

Predicates Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJ

### Type conversion

Each column is converted into a column of type values which are Integer64.

In [21]:
def handle_numeric_overflows(x):
    """
    Accepts a dataframe column, and 
    """
    try:
        #df = df.astype('int64')
        x1 = pd.DataFrame([x],dtype='int64')
    except ValueError:
        x = 9223372036854775807 # Max int size
    return x

for col in df_original.columns:
    try:
        if col in black_list:
            continue
        df_original[col] = df_original[col].astype('int64')
    except OverflowError:
        
        # Handles numeric overflow conversions by replacing such values with max value inside the dataset.
        df_original[col] = df_original[col].apply(handle_numeric_overflows)
        df_original[col] = df_original[col].astype('int64')
    except Exception as e:
        df_original.drop(columns=col, inplace=True)
        print('Dropped column [' + col + ']')

print('Original:')
print(df_original.columns)

print('-------------------------------------------------------------')

for col in df_predicates.columns:
    try:
        if col in black_list:
            continue
        df_predicates[col] = df_predicates[col].astype('int64')
    except OverflowError:
        
        # Handles numeric overflow conversions by replacing such values with max value inside the dataset.
        df_predicates[col] = df_predicates[col].apply(handle_numeric_overflows)
        df_predicates[col] = df_predicates[col].astype('int64')
    except Exception as e:
        df_predicates.drop(columns=col, inplace=True)
        print('Dropped column [' + col + ']')

print('Predicates:')
print(df_predicates.columns)

print('-------------------------------------------------------------')

for col in df_hints.columns:
    try:
        if col in black_list:
            continue
        df_hints[col] = df_hints[col].astype('int64')
    except OverflowError:
        
        # Handles numeric overflow conversions by replacing such values with max value inside the dataset.
        df_hints[col] = df_hints[col].apply(handle_numeric_overflows)
        df_hints[col] = df_hints[col].astype('int64')
    except Exception as e:
        df_hints.drop(columns=col, inplace=True)
        print('Dropped column [' + col + ']')

print('Hints:')
print(df_hints.columns)

print('-------------------------------------------------------------')

for col in df_rownum.columns:
    try:
        if col in black_list:
            continue
        df_rownum[col] = df_rownum[col].astype('int64')
    except OverflowError:
        
        # Handles numeric overflow conversions by replacing such values with max value inside the dataset.
        df_rownum[col] = df_rownum[col].apply(handle_numeric_overflows)
        df_rownum[col] = df_rownum[col].astype('int64')
    except Exception as e:
        df_rownum.drop(columns=col, inplace=True)
        print('Dropped column [' + col + ']')
print('Rownum:')
print(df_rownum.columns)

Dropped column [OBJECT_ALIAS]
Dropped column [OPTIMIZER]
Dropped column [OTHER_XML]
Dropped column [ACCESS_PREDICATES]
Dropped column [FILTER_PREDICATES]
Dropped column [PROJECTION]
Dropped column [QBLOCK_NAME]
Original:
Index(['PLAN_ID', 'TIMESTAMP', 'REMARKS', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_INSTANCE',
       'OBJECT_TYPE', 'SEARCH_COLUMNS', 'ID', 'PARENT_ID', 'DEPTH', 'POSITION',
       'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START',
       'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'DISTRIBUTION', 'CPU_COST',
       'IO_COST', 'TEMP_SPACE', 'TIME'],
      dtype='object')
-------------------------------------------------------------
Dropped column [OBJECT_ALIAS]
Dropped column [OPTIMIZER]
Dropped column [OTHER_XML]
Dropped column [ACCESS_PREDICATES]
Dropped column [FILTER_PREDICATES]
Dropped column [PROJECTION]
Dropped column [QBLOCK_NAME]
Predicates:
Index(['PLAN_ID', 'TIMESTAMP', 'REMARKS', 'OPERATION', 'OPTIONS'

### Feature Selection

In this step, redundant features are dropped. Features are considered redundant if exhibit a standard devaition of 0 (meaning no change in value).

In [22]:
def drop_flatline_columns(df):
    columns = df.columns
    flatline_features = []
    for i in range(len(columns)):
        try:
            
            if columns[i] in black_list:
                continue
            
            std = df[columns[i]].std()
            if std == 0:
                flatline_features.append(columns[i])
        except:
            pass
    
    #print('Features which are considered flatline:\n')
    #for col in flatline_features:
    #    print(col)
    print('\nShape before changes: [' + str(df.shape) + ']')
    df = df.drop(columns=flatline_features)
    print('Shape after changes: [' + str(df.shape) + ']')
    print('Dropped a total [' + str(len(flatline_features)) + ']')
    return df

df_original = drop_flatline_columns(df=df_original)
df_predicates = drop_flatline_columns(df=df_predicates)
df_rownum = drop_flatline_columns(df=df_rownum)
df_hints = drop_flatline_columns(df=df_hints)

print('\nAfter original flatline column drop:')
print(df_original.shape)
print(df_original.columns)

print('\nAfter predicates flatline column drop:')
print(df_predicates.shape)
print(df_predicates.columns)

print('\nAfter rownum flatline column drop:')
print(df_rownum.shape)
print(df_rownum.columns)

print('\nAfter hints flatline column drop:')
print(df_hints.shape)
print(df_hints.columns)


Shape before changes: [(621, 28)]
Shape after changes: [(621, 22)]
Dropped a total [6]

Shape before changes: [(489, 28)]
Shape after changes: [(489, 22)]
Dropped a total [6]

Shape before changes: [(510, 28)]
Shape after changes: [(510, 22)]
Dropped a total [6]

Shape before changes: [(461, 28)]
Shape after changes: [(461, 22)]
Dropped a total [6]

After original flatline column drop:
(621, 22)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'SEARCH_COLUMNS', 'ID',
       'PARENT_ID', 'DEPTH', 'POSITION', 'COST', 'CARDINALITY', 'BYTES',
       'PARTITION_START', 'PARTITION_STOP', 'CPU_COST', 'IO_COST',
       'TEMP_SPACE', 'TIME'],
      dtype='object')

After predicates flatline column drop:
(489, 22)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'SEARCH_COLUMNS', 'ID',
       'PARENT_ID', 'DEPTH', 'POSITION', 'COST', 'CARDIN

### Scaling columns

This section attempts to process a number of data columns through a MinMax Scaler. This is done, to normalize data on a similar scaler, particularly before comparing column measurements using a euclidean based measure. The following columns will be targetted:

* CARDINALITY
* BYTES
* PARTITION_START
* PARTITION_STOP
* CPU_COST
* IO_COST
* TEMP_SPACE
* TIME

In [23]:
scaler = preprocessing.MinMaxScaler()
scaled_columns = ['CARDINALITY',
                'BYTES',
                'PARTITION_START',
                'PARTITION_STOP',
                'CPU_COST',
                'IO_COST',
                'TEMP_SPACE',
                'TIME']
scaler.fit(df_original[scaled_columns])
df_original[scaled_columns] = scaler.transform(df_original[scaled_columns])
df_predicates[scaled_columns] = scaler.transform(df_predicates[scaled_columns])
df_rownum[scaled_columns] = scaler.transform(df_rownum[scaled_columns])
df_hints[scaled_columns] = scaler.transform(df_hints[scaled_columns])
print("Minimal Vector Points: " + str(scaler.data_min_))
print("Maximal Vector Points: " + str(scaler.data_max_))
#
print('\nAfter scaled column transformation:')
print(df_original.shape)
print(df_original.columns)
#
print('--------------------------------------------------------')
print('\nAfter predicates scaled column transformation:')
print(df_predicates.shape)
print(df_predicates.columns)
#
print('--------------------------------------------------------')
print('\nAfter rownum scaled column transformation:')
print(df_rownum.shape)
print(df_rownum.columns)
#
print('--------------------------------------------------------')
print('\nAfter hints scaled column transformation:')
print(df_hints.shape)
print(df_hints.columns)

Minimal Vector Points: [0. 0. 0. 0. 0. 0. 0. 0.]
Maximal Vector Points: [3.11714680e+07 8.10458168e+08 0.00000000e+00 0.00000000e+00
 7.57081846e+09 1.34901000e+05 4.92670000e+07 6.00000000e+00]

After scaled column transformation:
(621, 22)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'SEARCH_COLUMNS', 'ID',
       'PARENT_ID', 'DEPTH', 'POSITION', 'COST', 'CARDINALITY', 'BYTES',
       'PARTITION_START', 'PARTITION_STOP', 'CPU_COST', 'IO_COST',
       'TEMP_SPACE', 'TIME'],
      dtype='object')
--------------------------------------------------------

After predicates scaled column transformation:
(489, 22)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'SEARCH_COLUMNS', 'ID',
       'PARENT_ID', 'DEPTH', 'POSITION', 'COST', 'CARDINALITY', 'BYTES',
       'PARTITION_START', 'PARTITION_STOP', 'CPU_COST', 'IO_COST',
       'T

### Adding Grouping Column

An extra column is added to allow access plans to be isolated per instance

In [24]:
# Adds a columns per SQL_ID, PLAN_HASH_VALUE grouping, which can be used to group instances together
def add_grouping_column(df, column_identifier):
    """
    Receives a pandas dataframe, and adds a new column which allows dataframe to be aggregated per 
    SQL_ID, PLAN_HASH_VALUE combination.
    
    :param: df                - Pandas Dataframe
    :param: column_identifier - String denoting matrix column to group by
    
    :return: Pandas Dataframe, with added column    
    """
    print('Shape before transformation: ' + str(df.shape))
    new_grouping_col = []
    counter = 0
    last_sql_id = df[column_identifier].iloc(0) # Starts with first SQL_ID
    for index, row in df.iterrows():
        if column_identifier == 'SQL_ID':
            if last_sql_id != row.SQL_ID:
                last_sql_id = row.SQL_ID
                counter += 1
        elif column_identifier == 'PLAN_ID':
            if last_sql_id != row.PLAN_ID:
                last_sql_id = row.PLAN_ID
                counter += 1
        else:
            raise ValueError('Column does not exist!')
        new_grouping_col.append(counter)
    #
    # Append list as new column
    new_col = pd.Series(new_grouping_col)
    df['PLAN_INSTANCE'] = new_col.values
    print('Shape after transformation: ' + str(df.shape))
    return df
#
df_original = add_grouping_column(df=df_original,column_identifier='PLAN_ID')
df_predicates = add_grouping_column(df=df_predicates,column_identifier='PLAN_ID')
df_rownum = add_grouping_column(df=df_rownum,column_identifier='PLAN_ID')
df_hints = add_grouping_column(df=df_hints,column_identifier='PLAN_ID')

Shape before transformation: (621, 22)
Shape after transformation: (621, 23)
Shape before transformation: (489, 22)
Shape after transformation: (489, 23)
Shape before transformation: (510, 22)
Shape after transformation: (510, 23)
Shape before transformation: (461, 22)
Shape after transformation: (461, 23)


### Tree Formatting

Constructs the tree plan structure

In [25]:
class PlanTreeModeller:
    """
    This class simulates an access plan in the form of a tree structure
    """
    
    @staticmethod
    def __create_node(node_name, parent=None):
        """
        Builds a node which will be added to the tree. If the parent is 'None', it is assumed that this
        node will be used as the root/parent Node.
        
        :param: node_name - String specifying node name.
        :param: parent    - Parent node specifying parent node name.
        
        :return: anytree object
        """
        if node_name is None:
            raise ValueError('Node name was not specified!')
        
        if parent is None:
            node = Node(node_name)
        else:
            node = Node(node_name, parent=parent)
        
        return node
    
    @staticmethod
    def build_tree(df):
        """
        This method receives a pandas dataframe, and converts it into a searchable python tree
        
        :param: df - Pandas Dataframe, pertaining to input access plan
        
        :return: Dictionary object, consisting of node objects (which are linked in a tree fashion)
        """
        parent_node = None
        node_dict = {}
        for index, row in df.iterrows():
            
            # Build Node and add to parent
            row_id = int(row['ID'])
            parent_id = int(row['PARENT_ID'])
            
            if row_id == 0:
                node = PlanTreeModeller.__create_node(node_name=row_id)
            else:
                parent_node = node_dict[parent_id]
                node = PlanTreeModeller.__create_node(node_name=row_id, parent=parent_node)
            node_dict[row_id] = node
        
        return node_dict # Dictionary consisting of tree nodes
    
    @staticmethod
    def __retrieve_plan_details(df, node_name):
        """
        Accepts a dataframe, and the node_name. Retrieves features pertaining to the row id in the access plan
        
        :param: df - Dataframe consisting of access plan features
        :param: id - String id denoting which row to retrieve from the parameter dataframe
        
        :return: Dictionary consisting of access plan attributes
        """
        operation = str(df[df['ID'] == node_name]['OPERATION'].iloc[0])
        options = str(df[df['ID'] == node_name]['OPTIONS'].iloc[0])
        object_name = str(df[df['ID'] == node_name]['OBJECT_NAME'].iloc[0])
        try:
            object_type = str(df[df['ID'] == node_name]['OBJECT_TYPE'].iloc[0])
        except KeyError: # This is required because variant query plans do not have this node.
            object_type = None
        cardinality = int(df[df['ID'] == node_name]['CARDINALITY'].iloc[0])
        bytess = int(df[df['ID'] == node_name]['BYTES'].iloc[0])
        partition_delta = int(df[df['ID'] == node_name]['PARTITION_STOP'].iloc[0]) - int(df[df['ID'] == node_name]['PARTITION_START'].iloc[0])
        cpu_cost = int(df[df['ID'] == node_name]['CPU_COST'].iloc[0])
        io_cost = int(df[df['ID'] == node_name]['IO_COST'].iloc[0])
        temp_space = int(df[df['ID'] == node_name]['TEMP_SPACE'].iloc[0])
        time = int(df[df['ID'] == node_name]['TIME'].iloc[0]) 
        
        return {'OPERATION':operation,
                'OPTIONS':options,
                'OBJECT_NAME':object_name,
                'OBJECT_TYPE':object_type,
                'CARDINALITY':cardinality,
                'BYTES':bytess,
                'PARTITION_DELTA':partition_delta,
                'CPU_COST':cpu_cost,
                'IO_COST':io_cost,
                'TEMP_SPACE':temp_space,
                'TIME':time}
    
    @staticmethod
    def __tree_node_euclidean(tree1, tree2):
        """
        This method calculates the eucldiean distance between two vectors.
        
        :param: tree_dict1 - Dictionary denoting a single node within plan / tree 1
        :param: tree_dict2 - Dictionary denoting a single node within plan / tree 2
        
        :return: List denoting euclidean distance
        """
        labels = ['CARDINALITY','BYTES', 'CPU_COST','IO_COST','TEMP_SPACE','TIME']
        
        tree_sum_1, tree_sum_2 = [], []
        for index, row in tree1[labels].iterrows():
            tree_sum_1.append(sum(row))
            
        for index, row in tree2[labels].iterrows():
            tree_sum_2.append(sum(row))
        
        max_size = max(len(tree_sum_1), len(tree_sum_2))
        if len(tree_sum_1) != max_size:
            for i in range(max_size-len(tree_sum_1)):
                tree_sum_1.append(0)
        if len(tree_sum_2) != max_size:
            for i in range(max_size-len(tree_sum_2)):
                tree_sum_2.append(0) 
        
        euc_distance = euclidean_distances([tree_sum_1],[tree_sum_2])
        return euc_distance[0][0]
    
    @staticmethod
    def render_tree(tree, df):
        """
        Renders Tree by printing to screen
        
        :param: tree - AnyTree object, representing tree modelled access plan
        :param: df   - Pandas dataframe representatnt of the access plan about to be rendered
        
        :return: None
        """
        for pre, fill, node in RenderTree(tree):
            
            access_plan_dict = PlanTreeModeller.__retrieve_plan_details(df=df,
                                                                        node_name = node.name)
            
            if access_plan_dict['OBJECT_NAME'] == '0':
                print("%s%s > %s" % (pre, node.name, access_plan_dict['OPERATION']))
            else:
                if access_plan_dict['OPTIONS'] == '0': 
                    print("%s%s > %s (%s)" % (pre, node.name, access_plan_dict['OPERATION'], access_plan_dict['OBJECT_NAME']))
                else:
                    print("%s%s > %s | %s (%s)" % (pre, node.name, access_plan_dict['OPERATION'], access_plan_dict['OPTIONS'], access_plan_dict['OBJECT_NAME']))
    
    @staticmethod
    def __postorder(tree):
        """
        Accepts a tree, and iterates in post order fashion (left,right,root)
        
        :param: tree - Dictionary consisting of AnyTree Nodes
        
        :return: List consisting of tree traversal order
        """
        post_order_traversal = [node.name for node in PostOrderIter(tree[0])]
        return post_order_traversal
    
    @staticmethod
    def tree_compare(tree1, tree2, df1, df2):
        """
        Accepts two trees of type 'AnyTree', along with respective dataframe denoting each respective access
        path.
        
        :param: tree1 - Dictionary consisting of 'AnyTree' nodes, belonging to tree 1
        :param: tree2 - Dictionary consisting of 'AnyTree' nodes, belonging to tree 2
        :param: df1   - Pandas dataframe consisting of access plan instructions opted for by tree 1
        :param: df2   - Pandas dataframe consisting of access plan instructions opted for by tree 2
        
        :return: None
        """
        
        # Retrieves traversal order for both trees
        operator_tracker = []
        post_order_traversal1 = PlanTreeModeller.__postorder(tree1)
        post_order_traversal2 = PlanTreeModeller.__postorder(tree2)
        
        # Iterates over traversal order, until a change is encountered
        max_range = max(len(post_order_traversal1),len(post_order_traversal2))
        delta_flag = True
        for i in range(0,max_range):
            
            # Retrive prior, current, and next nodes
            try:
                id_1_prev = post_order_traversal1[i-1]
                id_2_prev = post_order_traversal2[i-1]
            except IndexError:
                id_1_prev = None
                id_2_prev = None
            try:
                id_1 = post_order_traversal1[i]
                id_2 = post_order_traversal2[i]
            except IndexError:
                id_1 = None
                id_2 = None
            try:
                id_1_next = post_order_traversal1[i+1]
                id_2_next = post_order_traversal2[i+1]
            except IndexError:
                id_1_next = None
                id_2_next = None

            if id_1_prev is not None and id_2_prev is not None:
                pd_tree1_prev = PlanTreeModeller.__retrieve_plan_details(df=df1, node_name=id_1_prev)
                pd_tree2_prev = PlanTreeModeller.__retrieve_plan_details(df=df2, node_name=id_2_prev)
            if id_1 is not None and id_2 is not None:
                pd_tree1 = PlanTreeModeller.__retrieve_plan_details(df=df1, node_name=id_1)
                pd_tree2 = PlanTreeModeller.__retrieve_plan_details(df=df2, node_name=id_2)
            if id_1_next is not None and id_2_next is not None:
                pd_tree1_next = PlanTreeModeller.__retrieve_plan_details(df=df1, node_name=id_1_next)
                pd_tree2_next = PlanTreeModeller.__retrieve_plan_details(df=df2, node_name=id_2_next)
            
            if (pd_tree1['OPERATION'] != pd_tree2['OPERATION'] or pd_tree1['OBJECT_NAME'] != pd_tree2['OBJECT_NAME'] or pd_tree1['OPTIONS'] != pd_tree2['OPTIONS']) and delta_flag:
                print('Access Predicate Difference detected!')
                print('Tree 1 difference at node [' + str(id_1) + '] operator > ' + str(pd_tree1['OPERATION']) + '(' + str(pd_tree1['OPTIONS']) + ') on object [' + pd_tree1['OBJECT_NAME'] + ']')
                print('Tree 2 difference at node [' + str(id_2) + '] operator > ' + str(pd_tree2['OPERATION']) + '(' + str(pd_tree2['OPTIONS']) + ') on object [' + pd_tree2['OBJECT_NAME'] + ']')
                PlanTreeModeller.render_tree(tree=tree1[0], df=df1) # Tree rendederer uses root node and traverses downwards
                PlanTreeModeller.render_tree(tree=tree2[0], df=df2) # Tree rendederer uses root node and traverses downwards
                
                encountered_recommendations = []
                print('Stat Recommendation: ')
                display_counter = 1
                if pd_tree1['OBJECT_TYPE'] != '0' and pd_tree1['OBJECT_NAME'] not in encountered_recommendations:
                    print(str(display_counter) + ') Collect [' + str(pd_tree1['OBJECT_TYPE']) + '] stats on [' + str(pd_tree1['OBJECT_NAME']) + ']')
                    encountered_recommendations.append(pd_tree1['OBJECT_NAME'])
                    display_counter += 1
                    operator_tracker.append([pd_tree1['OBJECT_TYPE'], pd_tree1['OBJECT_NAME']])
                if pd_tree2['OBJECT_TYPE'] != '0' and pd_tree2['OBJECT_NAME'] not in encountered_recommendations:
                    print(str(display_counter) + ') Collect [' + str(pd_tree2['OBJECT_TYPE']) + '] stats on [' + str(pd_tree2['OBJECT_NAME']) + ']')
                    encountered_recommendations.append(pd_tree2['OBJECT_NAME'])
                    display_counter += 1
                    operator_tracker.append([pd_tree2['OBJECT_TYPE'], pd_tree2['OBJECT_NAME']])
                if pd_tree1_prev['OBJECT_TYPE'] != '0' and pd_tree1_prev['OBJECT_NAME'] not in encountered_recommendations:
                    print(str(display_counter) + ') Collect [' + pd_tree1_prev['OBJECT_TYPE'] + '] stats on [' + pd_tree1_prev['OBJECT_NAME'] + ']')
                    encountered_recommendations.append(pd_tree1_prev['OBJECT_NAME'])
                    display_counter += 1
                    operator_tracker.append([pd_tree1_prev['OBJECT_TYPE'], pd_tree1_prev['OBJECT_NAME']])
                if pd_tree2_prev['OBJECT_TYPE'] != '0' and pd_tree2_prev['OBJECT_NAME'] not in encountered_recommendations:
                    print(str(display_counter) + ') Collect [' + pd_tree2_prev['OBJECT_TYPE'] + '] stats on [' + pd_tree2_prev['OBJECT_NAME'] + ']')
                    encountered_recommendations.append(pd_tree2_prev['OBJECT_NAME'])
                    display_counter += 1
                    operator_tracker.append([pd_tree2_prev['OBJECT_TYPE'], pd_tree2_prev['OBJECT_NAME']])
                if pd_tree1_next['OBJECT_TYPE'] != '0' and pd_tree1_next['OBJECT_NAME'] not in encountered_recommendations:
                    print(str(display_counter) + ') Collect [' + str(pd_tree1_next['OBJECT_TYPE']) + '] stats on [' + str(pd_tree1_next['OBJECT_NAME']) + ']')
                    encountered_recommendations.append(pd_tree1_next['OBJECT_NAME'])
                    display_counter += 1
                    operator_tracker.append([pd_tree1_next['OBJECT_TYPE'], pd_tree1_next['OBJECT_NAME']])
#                 if pd_tree2_next['OBJECT_TYPE'] != '0' and pd_tree2_next['OBJECT_NAME'] not in encountered_recommendations:
#                     print(str(display_counter) + ') Collect [' + str(pd_tree2_next['OBJECT_TYPE']) + '] stats on [' + str(pd_tree2_next['OBJECT_NAME'])+ ']')
#                     encountered_recommendations.append(pd_tree2_prev['OBJECT_NAME'])
#                     display_counter += 1
#                     operator_tracker.append([pd_tree2_prev['OBJECT_TYPE'], pd_tree2_prev['OBJECT_NAME']])
                delta_flag = False
            
        # Calculate Node Euclidean Measure
        euc_distance = PlanTreeModeller.__tree_node_euclidean(tree1=df1,
                                                              tree2=df2)
            
        if delta_flag is not False and euc_distance != 0:
            print('Access Predicate Difference detected!')
            print('Plan structure was the same, but an operator difference was detected with delta score [' + str(euc_distance)  + ']')
            PlanTreeModeller.render_tree(tree=tree1[0], df=df1) # Tree rendederer uses root node and traverses downwards
            PlanTreeModeller.render_tree(tree=tree2[0], df=df2) # Tree rendederer uses root node and traverses downwards
        
        if delta_flag:
            print('No plan differences detected.')
            
        if len(operator_tracker) == 0:
            operator_tracker.append(['No plan differences detected.','No plan differences detected.'])
        
        print('Total computed delta score [' + str(euc_distance) + ']')
        
        return operator_tracker

### Captured Outlier Plans

This section contains metrics pertaining to outlier plans. There are three categories of captured outliers denoted below, each assigned a total of 14 queries

* Hint Enhanced Queries
* Predicate Enhanced Queries
* Rownum Stopkey Enhanced Queries

In [26]:
# Retrieve Unique set of Plans (Hints)
np_plan_hints_id, np_plan_hints_instance = pd.unique(df_hints['PLAN_ID']), pd.unique(df_hints['PLAN_INSTANCE'])
print(np_plan_hints_id)
print(type(np_plan_hints_id))
print(np_plan_hints_instance)
print(type(np_plan_hints_instance))
print('-'*100)

# Iterate over each PLAN_HASH_VALUE, and retrieve PLAN subset                                                                                                                 
for plan_instance in np_plan_hints_instance:
    
    # Retrieve only a single instance of the plan (as annotated at beginning of experiment)
    df_temp_plan = df_hints[df_hints['PLAN_INSTANCE'] == plan_instance]
    #
    # This step ensures that only TPC-DS related queries are displayed
    tpc_check = df_temp_plan['OBJECT_OWNER'].tolist()
    if tpcds not in tpc_check:
        continue
    
    # Discards plans with double entries - Due to the parallel nature of the throughput test for 
    # TPC-DS, multiple threads may execute the same query at the same time, resulting in sql access
    # plans with the same SQL_ID, same PLAN_HASH_VALUE, and same TIMESTAMP. Such occurances are skipped.
    df_temp_count = df_temp_plan[df_temp_plan['ID'] == 0]
    if df_temp_count.shape[0] != 1:
        continue
    
    # Sorts by ID ascending - This clause may be redundant due to the natural order of the data capture tool
    df_temp_plan = df_temp_plan.sort_values(by='ID', ascending=True)
    
    # Builds Tree
    tree = PlanTreeModeller.build_tree(df=df_temp_plan)
    
    # Renders Tree
    print('PLAN_ID [' + str(df_temp_plan['PLAN_ID'].iloc[0]) + ']\n')
    PlanTreeModeller.render_tree(tree=tree[0], df=df_temp_plan) # Tree rendederer uses root node and traverses downwards
    print('-'*100) 

# Retrieve Unique set of Plans (Rownum)
np_plan_rownum_id, np_plan_rownum_instance = pd.unique(df_rownum['PLAN_ID']), pd.unique(df_rownum['PLAN_INSTANCE'])
print(np_plan_rownum_id)
print(type(np_plan_rownum_id))
print(np_plan_rownum_instance)
print(type(np_plan_rownum_instance))
print('-'*100)

# Iterate over each PLAN_HASH_VALUE, and retrieve PLAN subset                                                                                                                 
for plan_instance in np_plan_rownum_instance:
    
    # Retrieve only a single instance of the plan (as annotated at beginning of experiment)
    df_temp_plan = df_rownum[df_rownum['PLAN_INSTANCE'] == plan_instance]
    
    # This step ensures that only TPC-DS related queries are displayed
    tpc_check = df_temp_plan['OBJECT_OWNER'].tolist()
    if tpcds not in tpc_check:
        continue
    
    # Discards plans with double entries - Due to the parallel nature of the throughput test for 
    # TPC-DS, multiple threads may execute the same query at the same time, resulting in sql access
    # plans with the same SQL_ID, same PLAN_HASH_VALUE, and same TIMESTAMP. Such occurances are skipped.
    df_temp_count = df_temp_plan[df_temp_plan['ID'] == 0]
    if df_temp_count.shape[0] != 1:
        continue
    
    # Sorts by ID ascending - This clause may be redundant due to the natural order of the data capture tool
    df_temp_plan = df_temp_plan.sort_values(by='ID', ascending=True)
    
    # Builds Tree
    tree = PlanTreeModeller.build_tree(df=df_temp_plan)
    
    # Renders Tree
    print('PLAN_ID [' + str(df_temp_plan['PLAN_ID'].iloc[0]) + ']\n')
    PlanTreeModeller.render_tree(tree=tree[0], df=df_temp_plan) # Tree rendederer uses root node and traverses downwards
    print('-'*100)
    
# Retrieve Unique set of Plans (Predicates)
np_plan_predicates_id, np_plan_predicates_instance = pd.unique(df_predicates['PLAN_ID']), pd.unique(df_predicates['PLAN_INSTANCE'])
print(np_plan_predicates_id)
print(type(np_plan_predicates_id))
print(np_plan_predicates_instance)
print(type(np_plan_predicates_instance))
print('-'*100)

# Iterate over each PLAN_HASH_VALUE, and retrieve PLAN subset                                                                                                                 
for plan_instance in np_plan_predicates_instance:
    
    # Retrieve only a single instance of the plan (as annotated at beginning of experiment)
    df_temp_plan = df_predicates[df_predicates['PLAN_INSTANCE'] == plan_instance]
    
    # This step ensures that only TPC-DS related queries are displayed
    tpc_check = df_temp_plan['OBJECT_OWNER'].tolist()
    if tpcds not in tpc_check:
        continue
    
    # Discards plans with double entries - Due to the parallel nature of the throughput test for 
    # TPC-DS, multiple threads may execute the same query at the same time, resulting in sql access
    # plans with the same SQL_ID, same PLAN_HASH_VALUE, and same TIMESTAMP. Such occurances are skipped.
    df_temp_count = df_temp_plan[df_temp_plan['ID'] == 0]
    if df_temp_count.shape[0] != 1:
        continue
    
    # Sorts by ID ascending - This clause may be redundant due to the natural order of the data capture tool
    df_temp_plan = df_temp_plan.sort_values(by='ID', ascending=True)
    
    # Builds Tree
    tree = PlanTreeModeller.build_tree(df=df_temp_plan)
    
    # Renders Tree
    print('PLAN_ID [' + str(df_temp_plan['PLAN_ID'].iloc[0]) + ']\n')
    PlanTreeModeller.render_tree(tree=tree[0], df=df_temp_plan) # Tree rendederer uses root node and traverses downwards
    print('-'*100)

[12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458
 12459 12460]
<class 'numpy.ndarray'>
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14]
<class 'numpy.ndarray'>
----------------------------------------------------------------------------------------------------
PLAN_ID [12447]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > VIEW
                └── 5 > UNION-ALL
                    ├── 6 > HASH
                    │   └── 7 > NESTED LOOPS
                    │       ├── 8 > NESTED LOOPS
                    │       │   ├── 9 > HASH JOIN
                    │       │   │   ├── 10 > TABLE ACCESS | FULL (DATE_DIM)
                    │       │   │   └── 11 > VIEW
                    │       │   │       └── 12 > UNION-ALL
                    │       │   │           ├── 13 > TABLE ACCESS | FULL (STORE_SALES)
                    │       │   │           └── 14 > TABLE ACCESS | FULL (STORE_RETURNS)
                    │       │   └

                        │   │       ├── 55 > TABLE ACCESS | FULL (ITEM)
                        │   │       └── 56 > HASH JOIN
                        │   │           ├── 57 > HASH JOIN
                        │   │           │   ├── 58 > NESTED LOOPS
                        │   │           │   │   ├── 59 > NESTED LOOPS
                        │   │           │   │   │   ├── 60 > STATISTICS COLLECTOR
                        │   │           │   │   │   │   └── 61 > TABLE ACCESS | FULL (DATE_DIM)
                        │   │           │   │   │   └── 62 > INDEX | RANGE SCAN (SS_SOLD_DATE_SK_INDEX)
                        │   │           │   │   └── 63 > TABLE ACCESS | BY INDEX ROWID (STORE_SALES)
                        │   │           │   └── 64 > TABLE ACCESS | FULL (STORE_SALES)
                        │   │           └── 65 > VIEW (VW_NSO_1)
                        │   │               └── 66 > VIEW
                        │   │                   └── 67 > TABLE ACCESS | FULL (SYS_TEM

                            ├── 8 > VIEW
                            │   └── 9 > WINDOW
                            │       └── 10 > SORT
                            │           └── 11 > HASH JOIN
                            │               ├── 12 > TABLE ACCESS | FULL (DATE_DIM)
                            │               └── 13 > TABLE ACCESS | FULL (STORE_SALES)
                            └── 14 > VIEW
                                └── 15 > WINDOW
                                    └── 16 > SORT
                                        └── 17 > NESTED LOOPS
                                            ├── 18 > NESTED LOOPS
                                            │   ├── 19 > TABLE ACCESS | FULL (DATE_DIM)
                                            │   └── 20 > INDEX | RANGE SCAN (WS_SOLD_DATE_SK_INDEX)
                                            └── 21 > TABLE ACCESS | BY INDEX ROWID (WEB_SALES)
---------------------------------------------------------------------------------

                    │               ├── 31 > NESTED LOOPS
                    │               │   ├── 32 > STATISTICS COLLECTOR
                    │               │   │   └── 33 > HASH JOIN
                    │               │   │       ├── 34 > TABLE ACCESS | FULL (ITEM)
                    │               │   │       └── 35 > HASH JOIN
                    │               │   │           ├── 36 > NESTED LOOPS
                    │               │   │           │   ├── 37 > NESTED LOOPS
                    │               │   │           │   │   ├── 38 > STATISTICS COLLECTOR
                    │               │   │           │   │   │   └── 39 > TABLE ACCESS | FULL (DATE_DIM)
                    │               │   │           │   │   └── 40 > INDEX | RANGE SCAN (CS_SOLD_DATE_SK_INDEX)
                    │               │   │           │   └── 41 > TABLE ACCESS | BY INDEX ROWID (CATALOG_SALES)
                    │               │   │           └── 42 > TABLE ACCESS | FULL (CATALOG

    │                       ├── 24 > TABLE ACCESS | FULL (ITEM)
    │                       └── 25 > NESTED LOOPS
    │                           ├── 26 > NESTED LOOPS
    │                           │   ├── 27 > TABLE ACCESS | FULL (DATE_DIM)
    │                           │   └── 28 > INDEX | RANGE SCAN (WS_SOLD_DATE_SK_INDEX)
    │                           └── 29 > TABLE ACCESS | BY INDEX ROWID (WEB_SALES)
    ├── 30 > LOAD AS SELECT (SYS_TEMP_0FD9F17C0_141942F5)
    │   └── 31 > SORT
    │       └── 32 > VIEW
    │           └── 33 > UNION-ALL
    │               ├── 34 > HASH JOIN
    │               │   ├── 35 > TABLE ACCESS | FULL (DATE_DIM)
    │               │   └── 36 > TABLE ACCESS | FULL (STORE_SALES)
    │               ├── 37 > NESTED LOOPS
    │               │   ├── 38 > NESTED LOOPS
    │               │   │   ├── 39 > TABLE ACCESS | FULL (DATE_DIM)
    │               │   │   └── 40 > INDEX | RANGE SCAN (CS_SOLD_DATE_SK_INDEX)
    │               │   └── 41 > TABLE

                │           └── 15 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                │               └── 16 > INDEX | UNIQUE SCAN (SYS_C0021186)
                └── 17 > VIEW (VW_SQ_1)
                    └── 18 > UNION-ALL
                        ├── 19 > HASH JOIN
                        │   ├── 20 > NESTED LOOPS
                        │   │   ├── 21 > NESTED LOOPS
                        │   │   │   ├── 22 > STATISTICS COLLECTOR
                        │   │   │   │   └── 23 > TABLE ACCESS | FULL (DATE_DIM)
                        │   │   │   └── 24 > INDEX | RANGE SCAN (CS_SOLD_DATE_SK_INDEX)
                        │   │   └── 25 > TABLE ACCESS | BY INDEX ROWID (CATALOG_SALES)
                        │   └── 26 > TABLE ACCESS | FULL (CATALOG_SALES)
                        └── 27 > HASH JOIN
                            ├── 28 > NESTED LOOPS
                            │   ├── 29 > NESTED LOOPS
                            │   │   ├── 30 > STATISTICS COLLECTOR
             

                        │               │   └── 46 > INDEX | RANGE SCAN (WS_SOLD_DATE_SK_INDEX)
                        │               └── 47 > TABLE ACCESS | BY INDEX ROWID (WEB_SALES)
                        └── 48 > VIEW
                            └── 49 > HASH
                                └── 50 > NESTED LOOPS
                                    ├── 51 > NESTED LOOPS
                                    │   ├── 52 > TABLE ACCESS | FULL (DATE_DIM)
                                    │   └── 53 > TABLE ACCESS | BY INDEX ROWID BATCHED (WEB_RETURNS)
                                    │       └── 54 > INDEX | RANGE SCAN (WR_RETURNED_DATE_SK_INDEX)
                                    └── 55 > INDEX | UNIQUE SCAN (SYS_C0021223)
----------------------------------------------------------------------------------------------------
PLAN_ID [12487]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > VIEW
                └── 5 > UNION-ALL
          

                            └── 47 > TABLE ACCESS | BY INDEX ROWID (WEB_SITE)
----------------------------------------------------------------------------------------------------
PLAN_ID [12462]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > FILTER
                ├── 5 > HASH JOIN
                │   ├── 6 > NESTED LOOPS
                │   │   ├── 7 > NESTED LOOPS
                │   │   │   ├── 8 > TABLE ACCESS | FULL (CUSTOMER_ADDRESS)
                │   │   │   └── 9 > INDEX | RANGE SCAN (C_CURRENT_ADDR_SK_INDEX)
                │   │   └── 10 > TABLE ACCESS | BY INDEX ROWID (CUSTOMER)
                │   └── 11 > TABLE ACCESS | FULL (CUSTOMER_DEMOGRAPHICS)
                ├── 12 > NESTED LOOPS
                │   ├── 13 > TABLE ACCESS | BY INDEX ROWID BATCHED (STORE_SALES)
                │   │   └── 14 > INDEX | RANGE SCAN (SS_CUSTOMER_SK_INDEX)
                │   └── 15 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                │

    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > SORT
                └── 5 > HASH JOIN
                    ├── 6 > TABLE ACCESS | FULL (ITEM)
                    └── 7 > HASH JOIN
                        ├── 8 > INDEX | FAST FULL SCAN (SYS_C0021183)
                        └── 9 > HASH JOIN
                            ├── 10 > TABLE ACCESS | FULL (CUSTOMER_ADDRESS)
                            └── 11 > HASH JOIN
                                ├── 12 > TABLE ACCESS | FULL (CUSTOMER)
                                └── 13 > HASH JOIN
                                    ├── 14 > TABLE ACCESS | FULL (CUSTOMER_DEMOGRAPHICS)
                                    └── 15 > NESTED LOOPS
                                        ├── 16 > NESTED LOOPS
                                        │   ├── 17 > TABLE ACCESS | FULL (DATE_DIM)
                                        │   └── 18 > INDEX | RANGE SCAN (CS_SOLD_DATE_SK_INDEX)
                                        └── 19 > TABLE ACCESS |

                    │   │               │   ├── 13 > TABLE ACCESS | FULL (DATE_DIM)
                    │   │               │   └── 14 > INDEX | RANGE SCAN (SS_SOLD_DATE_SK_INDEX)
                    │   │               └── 15 > TABLE ACCESS | BY INDEX ROWID (STORE_SALES)
                    │   └── 16 > VIEW
                    │       └── 17 > HASH
                    │           └── 18 > NESTED LOOPS
                    │               ├── 19 > NESTED LOOPS
                    │               │   ├── 20 > TABLE ACCESS | FULL (DATE_DIM)
                    │               │   └── 21 > TABLE ACCESS | BY INDEX ROWID BATCHED (STORE_RETURNS)
                    │               │       └── 22 > INDEX | RANGE SCAN (SR_RETURNED_DATE_SK_INDEX)
                    │               └── 23 > INDEX | UNIQUE SCAN (SYS_C0021206)
                    ├── 24 > MERGE JOIN
                    │   ├── 25 > VIEW
                    │   │   └── 26 > HASH
                    │   │       └── 27 > HASH JOIN
 

                            │   │       │   │   │       └── 72 > INDEX | UNIQUE SCAN (SYS_C0021186)
                            │   │       │   │   └── 73 > TABLE ACCESS | FULL (DATE_DIM)
                            │   │       │   └── 74 > TABLE ACCESS | BY INDEX ROWID (WEB_SITE)
                            │   │       │       └── 75 > INDEX | UNIQUE SCAN (SYS_C0021215)
                            │   │       └── 76 > TABLE ACCESS | BY INDEX ROWID (PROMOTION)
                            │   │           └── 77 > INDEX | UNIQUE SCAN (SYS_C0021226)
                            │   └── 78 > TABLE ACCESS | BY INDEX ROWID (WEB_RETURNS)
                            │       └── 79 > INDEX | UNIQUE SCAN (SYS_C0021239)
                            └── 80 > TABLE ACCESS | FULL (WEB_RETURNS)
----------------------------------------------------------------------------------------------------
PLAN_ID [12474]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > 

### Captured Original Plans

This section contains metrics pertaining to the original plans, which will serve as out baseline against which we compare.

In [27]:
# Retrieve Unique set of PLAN_HASH_VALUES
np_plan_original_id, np_plan_original_instance = pd.unique(df_original['PLAN_ID']), pd.unique(df_original['PLAN_INSTANCE'])
print(np_plan_original_id)
print(type(np_plan_original_id))
print(np_plan_original_instance)
print(type(np_plan_original_instance))
print('-'*100)
#
# Iterate over each PLAN_HASH_VALUE, and retrieve PLAN subset                                                                                                                 
for plan_instance in np_plan_original_instance:
    
    # Retrieve only a single instance of the plan (as annotated at beginning of experiment)
    df_temp_plan = df_original[df_original['PLAN_INSTANCE'] == plan_instance]
    #
    # This step ensures that only TPC-DS related queries are displayed
    tpc_check = df_temp_plan['OBJECT_OWNER'].tolist()
    if tpcds not in tpc_check:
        continue
    
    # Discards plans with double entries - Due to the parallel nature of the throughput test for 
    # TPC-DS, multiple threads may execute the same query at the same time, resulting in sql access
    # plans with the same SQL_ID, same PLAN_HASH_VALUE, and same TIMESTAMP. Such occurances are skipped.
    df_temp_count = df_temp_plan[df_temp_plan['ID'] == 0]
    if df_temp_count.shape[0] != 1:
        continue
    
    # Sorts by ID ascending - This clause may be redundant due to the natural order of the data capture tool
    df_temp_plan = df_temp_plan.sort_values(by='ID', ascending=True)
    
    # Builds Tree
    tree = PlanTreeModeller.build_tree(df=df_temp_plan)
    
    # Renders Tree
    print('PLAN_ID [' + str(df_temp_plan['PLAN_ID'].iloc[0]) + ']\n')
    PlanTreeModeller.render_tree(tree=tree[0], df=df_temp_plan) # Tree rendederer uses root node and traverses downwards
    print('-'*100)

[13068 13069 13040 13056 13070 13057 13071 13072 13073 13060 13074 13075
 13076 13077 13078 13079 13080]
<class 'numpy.ndarray'>
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
<class 'numpy.ndarray'>
----------------------------------------------------------------------------------------------------
PLAN_ID [13068]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > VIEW
                └── 5 > UNION-ALL
                    ├── 6 > COUNT
                    │   └── 7 > VIEW
                    │       └── 8 > SORT
                    │           └── 9 > COUNT
                    │               └── 10 > HASH JOIN
                    │                   ├── 11 > TABLE ACCESS | FULL (STORE)
                    │                   └── 12 > NESTED LOOPS
                    │                       ├── 13 > TABLE ACCESS | FULL (DATE_DIM)
                    │                       └── 14 > VIEW
                    │                          

                            └── 33 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
----------------------------------------------------------------------------------------------------
PLAN_ID [13070]

0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > SORT
                └── 5 > COUNT
                    └── 6 > NESTED LOOPS
                        ├── 7 > NESTED LOOPS
                        │   ├── 8 > NESTED LOOPS
                        │   │   ├── 9 > NESTED LOOPS
                        │   │   │   ├── 10 > NESTED LOOPS
                        │   │   │   │   ├── 11 > NESTED LOOPS
                        │   │   │   │   │   ├── 12 > HASH JOIN
                        │   │   │   │   │   │   ├── 13 > TABLE ACCESS | BY INDEX ROWID BATCHED (DATE_DIM)
                        │   │   │   │   │   │   │   └── 14 > INDEX | RANGE SCAN (SYS_C0021186)
                        │   │   │   │   │   │   └── 15 > TABLE ACCESS | BY INDEX ROWID BATCHED (CATALOG_

                                │                   │   │               └── 22 > SORT
                                │                   │   │                   └── 23 > INDEX | RANGE SCAN (WS_SOLD_DATE_SK_INDEX)
                                │                   │   └── 24 > INDEX | UNIQUE SCAN (SYS_C0021186)
                                │                   └── 25 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                                └── 26 > VIEW
                                    └── 27 > WINDOW
                                        └── 28 > SORT
                                            └── 29 > COUNT
                                                └── 30 > NESTED LOOPS
                                                    ├── 31 > NESTED LOOPS
                                                    │   ├── 32 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                                                    │   │   └── 33 > INDEX | RANGE SCAN (SYS_C0021186)
                    

                            │   └── 52 > HASH
                            │       └── 53 > COUNT
                            │           └── 54 > NESTED LOOPS
                            │               ├── 55 > NESTED LOOPS
                            │               │   ├── 56 > TABLE ACCESS | BY INDEX ROWID BATCHED (DATE_DIM)
                            │               │   │   └── 57 > INDEX | RANGE SCAN (SYS_C0021186)
                            │               │   └── 58 > TABLE ACCESS | BY INDEX ROWID BATCHED (WEB_SALES)
                            │               │       └── 59 > INDEX | RANGE SCAN (WS_SOLD_DATE_SK_INDEX)
                            │               └── 60 > INDEX | UNIQUE SCAN (SYS_C0021223)
                            └── 61 > VIEW
                                └── 62 > HASH
                                    └── 63 > COUNT
                                        └── 64 > NESTED LOOPS
                                            ├── 65 > NESTED LOOPS
        

                                        │   │   │   └── 80 > NESTED LOOPS
                                        │   │   │       ├── 81 > HASH JOIN
                                        │   │   │       │   ├── 82 > NESTED LOOPS
                                        │   │   │       │   │   ├── 83 > STATISTICS COLLECTOR
                                        │   │   │       │   │   │   └── 84 > HASH JOIN
                                        │   │   │       │   │   │       ├── 85 > NESTED LOOPS
                                        │   │   │       │   │   │       │   ├── 86 > STATISTICS COLLECTOR
                                        │   │   │       │   │   │       │   │   └── 87 > HASH JOIN
                                        │   │   │       │   │   │       │   │       ├── 88 > NESTED LOOPS
                                        │   │   │       │   │   │       │   │       │   ├── 89 > STATISTICS COLLECTOR
                                        │   │   │       │   │   │

### Access Plan / Tree Comparison (DIFFERENT PLAN COMPARISON) Hints

In this section we compare each variant SQL to the actual SQL, and we attempt to denote any plan differences. The technique is evaluated by comparing the output suggestions to a record of 'manual' identified suggestions. Comparison is made in the following order:

* Hint Variants vs Original

In [28]:
print(np_plan_original_id)
print(np_plan_hints_id)
np_plan_original_id = np_plan_original_id.tolist()
# remove_indices = [2, 3, 5, 9]  # Data Anamoly
remove_indices = [2, 3, 5, 9]  # Data Anamoly
np_plan_original_id = [i for j, i in enumerate(np_plan_original_id) if j not in remove_indices]
np_plan_original_id = np.array(np_plan_original_id)
print(np_plan_original_id)
print(np_plan_hints_id)

tot_y_score, tot_yhat_score = [], []
for i in range(len(np_plan_original_id)):
    # Retrieve Original Plan
    df_orig_plan = df_original[df_original['PLAN_ID'] == np_plan_original_id[i]]
    df_orig_plan = df_orig_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 1
    tree1 = PlanTreeModeller.build_tree(df=df_orig_plan)

    # Retrieve Variant Plan
    df_variant_plan = df_hints[df_hints['PLAN_ID'] == np_plan_hints_id[i]]
    df_variant_plan = df_variant_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 2
    tree2 = PlanTreeModeller.build_tree(df=df_variant_plan)

    # Renders Trees
    print('Variant ' + str(i+1))
    print('Tree 1 with PLAN_ID [' + str(df_orig_plan['PLAN_ID'].iloc[0]) + ']')
    print('Tree 2 with PLAN_ID [' + str(df_variant_plan['PLAN_ID'].iloc[0]) + ']')

    # Compares both plans
    operator_tracker = PlanTreeModeller.tree_compare(tree1=tree1, 
                                                     tree2=tree2, 
                                                     df1=df_orig_plan, 
                                                     df2=df_variant_plan)
    y_score, yhat_score = 0, 0
    for y in eval_hint_dict[i+1]:
        for yhat in operator_tracker:
            if yhat[0].lower() == y[0].lower() and yhat[1].lower() == y[1].lower():
                yhat_score += 1
        y_score += 1

    print('Calculate Score (1 is best, 0 is worst): [' + str(yhat_score / y_score) + ']')
    tot_y_score.append(y_score)
    tot_yhat_score.append(yhat_score)
    print('-'*100)
    print('\n\n\n')
print('Total Score (1 is best, 0 is worst): [' + str(sum(tot_yhat_score) / sum(tot_y_score)) + ']')

[13068 13069 13040 13056 13070 13057 13071 13072 13073 13060 13074 13075
 13076 13077 13078 13079 13080]
[12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458
 12459 12460]
[13068 13069 13040 13070 13071 13072 13073 13074 13075 13076 13077 13078
 13079 13080]
[12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458
 12459 12460]
Variant 1
Tree 1 with PLAN_ID [13068]
Tree 2 with PLAN_ID [12447]
Access Predicate Difference detected!
Tree 1 difference at node [11] operator > TABLE ACCESS(FULL) on object [STORE]
Tree 2 difference at node [10] operator > TABLE ACCESS(FULL) on object [DATE_DIM]
0 > SELECT STATEMENT
└── 1 > COUNT
    └── 2 > VIEW
        └── 3 > SORT
            └── 4 > VIEW
                └── 5 > UNION-ALL
                    ├── 6 > COUNT
                    │   └── 7 > VIEW
                    │       └── 8 > SORT
                    │           └── 9 > COUNT
                    │               └── 10 > HASH JOIN
                    │   

                    │   │   │   │           ├── 12 > TABLE ACCESS | BY INDEX ROWID BATCHED (STORE_SALES)
                    │   │   │   │           │   ├── 13 > INDEX | RANGE SCAN (SS_CUSTOMER_SK_INDEX)
                    │   │   │   │           │   └── 13 > INDEX | RANGE SCAN (SS_CUSTOMER_SK_INDEX)
                    │   │   │   │           ├── 14 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                    │   │   │   │           └── 14 > TABLE ACCESS | BY INDEX ROWID (DATE_DIM)
                    │   │   │   │               ├── 15 > INDEX | UNIQUE SCAN (SYS_C0021186)
                    │   │   │   │               └── 15 > INDEX | UNIQUE SCAN (SYS_C0021186)
                    │   │   │   ├── 16 > TABLE ACCESS | BY INDEX ROWID (CUSTOMER_ADDRESS)
                    │   │   │   └── 16 > TABLE ACCESS | BY INDEX ROWID (CUSTOMER_ADDRESS)
                    │   │   │       ├── 17 > INDEX | UNIQUE SCAN (SYS_C0021181)
                    │   │   │       └── 17 > INDEX | UNIQUE SCAN 

KeyError: 33

### Access Plan / Tree Comparison (DIFFERENT PLAN COMPARISON) Predicates

In this section we compare each variant SQL to the actual SQL, and we attempt to denote any plan differences. The technique is evaluated by comparing the output suggestions to a record of 'manual' identified suggestions. Comparison is made in the following order:

* Predicate Variants vs Original

In [None]:
tot_y_score, tot_yhat_score = [], []
for i in range(len(np_plan_original_id)):
    
    # Retrieve Original Plan
    df_orig_plan = df_original[df_original['PLAN_ID'] == np_plan_original_id[i]]
    df_orig_plan = df_orig_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 1
    tree1 = PlanTreeModeller.build_tree(df=df_orig_plan)
    
    # Retrieve Variant Plan
    df_variant_plan = df_predicates[df_predicates['PLAN_ID'] == np_plan_predicates_id[i]]
    df_variant_plan = df_variant_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 2
    tree2 = PlanTreeModeller.build_tree(df=df_variant_plan)
    
    # Renders Trees
    print('Variant ' + str(i+1))
    print('Tree 1 with PLAN_ID [' + str(df_orig_plan['PLAN_ID'].iloc[0]) + ']')
    print('Tree 2 with PLAN_ID [' + str(df_variant_plan['PLAN_ID'].iloc[0]) + ']')
    
    # Compares both plans
    operator_tracker = PlanTreeModeller.tree_compare(tree1=tree1, 
                                                     tree2=tree2, 
                                                     df1=df_orig_plan, 
                                                     df2=df_variant_plan)
    
    y_score, yhat_score = 0, 0
    for y in eval_predicates_dict[i+1]:
        for yhat in operator_tracker:
            if yhat[0].lower() == y[0].lower() and yhat[1].lower() == y[1].lower():
                yhat_score += 1
        y_score += 1
            
    print('Calculate Score (1 is best, 0 is worst): [' + str(yhat_score / y_score) + ']')
    tot_y_score.append(y_score)
    tot_yhat_score.append(yhat_score)
    
    print('-'*100)
    print('\n\n\n')
    
print('Total Score (1 is best, 0 is worst): [' + str(sum(tot_yhat_score) / sum(tot_y_score)) + ']')

### Access Plan / Tree Comparison (DIFFERENT PLAN COMPARISON) Rownum

In this section we compare each variant SQL to the actual SQL, and we attempt to denote any plan differences. The technique is evaluated by comparing the output suggestions to a record of 'manual' identified suggestions. Comparison is made in the following order:

* Rownum Variants vs Original

In [None]:
tot_y_score, tot_yhat_score = [], []
for i in range(len(np_plan_original_id)):
    
    # Retrieve Original Plan
    df_orig_plan = df_original[df_original['PLAN_ID'] == np_plan_original_id[i]]
    df_orig_plan = df_orig_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 1
    tree1 = PlanTreeModeller.build_tree(df=df_orig_plan)
    
    # Retrieve Variant Plan
    df_variant_plan = df_rownum[df_rownum['PLAN_ID'] == np_plan_rownum_id[i]]
    df_variant_plan = df_variant_plan.sort_values(by='ID', ascending=True)
    # Builds Tree 2
    tree2 = PlanTreeModeller.build_tree(df=df_variant_plan)
    
    # Renders Trees
    print('Variant ' + str(i+1))
    print('Tree 1 with PLAN_ID [' + str(df_orig_plan['PLAN_ID'].iloc[0]) + ']')
    print('Tree 2 with PLAN_ID [' + str(df_variant_plan['PLAN_ID'].iloc[0]) + ']')
    
    # Compares both plans
    operator_tracker = PlanTreeModeller.tree_compare(tree1=tree1, 
                                                     tree2=tree2, 
                                                     df1=df_orig_plan, 
                                                     df2=df_variant_plan)
    
    y_score, yhat_score = 0, 0
    for y in eval_rownum_dict[i+1]:
        for yhat in operator_tracker:
            if yhat[0].lower() == y[0].lower() and yhat[1].lower() == y[1].lower():
                yhat_score += 1
        y_score += 1
            
    print('Calculate Score (1 is best, 0 is worst): [' + str(yhat_score / y_score) + ']')
    tot_y_score.append(y_score)
    tot_yhat_score.append(yhat_score)
    
    print('-'*100)
    print('\n\n\n')

print('Total Score (1 is best, 0 is worst): [' + str(sum(tot_yhat_score) / sum(tot_y_score)) + ']')