# Schedule TPC-DS1 Statistical Recommendation Evaluation

This experiment aims to quantify the statistical recommendation technique by comparing two query streams. The query streams are denoted as follows:

* Expected Stream - Denotes a sequence of baseline query plans, against which comparison will be made.
* Variation Stream - Denotes a sequence of upcoming query plans. Queries found within the upcoming stream mirror those established in the Expected Stream, with a number of exceptions. These exceptions are considered as query variants, and contain a degree of change from the original queries taken from the prior stream.

The query variants are listed below, and are therefore eligible to be flagged during the evaluation phase:

* Query 5  
* Query 10
* Query 14
* Query 18
* Query 22
* Query 27
* Query 35
* Query 36
* Query 51
* Query 67
* Query 70
* Query 77
* Query 80
* Query 86

In [10]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib.pyplot as plt
# sklearn
import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
#
# AnyTree
from anytree import Node, RenderTree, PostOrderIter

pandas: 0.24.1
numpy: 1.16.1


### Configuration Cell

Tweak the parameters in this cell to influence the outcome of the experiment.

In [11]:
# Experiment Config
# `tpcds` is appended to the data and source directory paths built in the
# file-reading cell.
tpcds='TPCDS1' # Schema upon which to operate test
# NOTE(review): test_split and y_labels are not referenced in the visible part
# of this notebook - presumably consumed by later cells; confirm.
test_split=.2
y_labels = ['COST',
            'CARDINALITY',
            'BYTES',
            'CPU_COST',
            'IO_COST',
            'TEMP_SPACE',
            'TIME']
# Columns named here are skipped by the int64 conversion loops and by the
# flatline (zero standard deviation) check.
black_list = ['TIMESTAMP',
              'SQL_ID',
              'OPERATION',
              'OPTIONS',
              'OBJECT_NAME',
              'OBJECT_OWNER',
              'PARTITION_STOP',
              'PARTITION_START'] # Columns which will be ignored during type conversion, and later used for aggregation
# Passed to pd.read_csv as the maximum number of rows read from rep_vsql_plan.csv
nrows = 10000

### Read data from file into pandas dataframes

In [12]:
# Root path (machine specific - adjust base_dir when running elsewhere)
base_dir = 'C:/Users/gabriel.sammut/University/'
#base_dir = 'D:/Projects/ICS5200/'
# root_dir holds the baseline plan extract; src_dir holds the query variant CSVs
root_dir = base_dir + 'Data_ICS5200/Schedule/' + tpcds
src_dir = base_dir + 'ICS5200/src/sql/Runtime/TPC-DS/' + tpcds + '/Variants/'

rep_vsql_plan_path = root_dir + '/rep_vsql_plan.csv'

# Requested column types for the baseline plan extract.
# NOTE(review): the recorded head() output of this cell shows raw headers such
# as "('COST',)", so these plain keys may not match any column at read time -
# verify that the requested types are actually applied.
dtype={'COST':'int64',
       'CARDINALITY':'int64',
       'BYTES':'int64',
       'CPU_COST':'int64',
       'IO_COST':'int64',
       'TEMP_SPACE':'int64',
       'TIME':'int64',
       'OPERATION':'str',
       'OBJECT_NAME':'str'}
# Only the first `nrows` rows of the file are loaded
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path, nrows=nrows, dtype=dtype)
print(rep_vsql_plan_df.head())
#
def prettify_header(headers):
    """
    Strips tuple-style punctuation from every header name.

    All occurrences of "(", ")", "'" and "," are removed from each header.
    The cleaned names are returned as a new list, in the original order.
    """
    # Translation table which deletes the four unwanted characters
    unwanted = str.maketrans("", "", "()',")
    return [header.translate(unwanted) for header in headers]
#
# Replace the raw column labels with their cleaned equivalents, then show them
rep_vsql_plan_df.columns = prettify_header(rep_vsql_plan_df.columns.values)
print('------------------------------------------')
print(rep_vsql_plan_df.columns)

    ('DBID',)    ('SQL_ID',)  ('PLAN_HASH_VALUE',)  ('ID',)    ('OPERATION',)  \
0  2634225673  dxv968j0352kb             103598129        0  SELECT STATEMENT   
1  2634225673  dxv968j0352kb             103598129        1              SORT   
2  2634225673  dxv968j0352kb             103598129        2    PX COORDINATOR   
3  2634225673  dxv968j0352kb             103598129        3           PX SEND   
4  2634225673  dxv968j0352kb             103598129        4              SORT   

  ('OPTIONS',) ('OBJECT_NODE',)  ('OBJECT#',) ('OBJECT_OWNER',)  \
0          NaN              NaN           NaN               NaN   
1     GROUP BY              NaN           NaN               NaN   
2          NaN              NaN           NaN               NaN   
3  QC (RANDOM)           :Q1001           NaN               SYS   
4     GROUP BY           :Q1001           NaN               NaN   

  ('OBJECT_NAME',)  ... ('ACCESS_PREDICATES',) ('FILTER_PREDICATES',)  \
0              NaN  ...              

### Read outlier data from file into pandas dataframes and concatenate

In [13]:
#
# CSV Outlier Paths
# One CSV per flagged query and per variant category, all located under src_dir.
#
# Category: hints
outlier_hints_q5_path = src_dir + 'hints/output/query_5.csv'
outlier_hints_q10_path = src_dir + 'hints/output/query_10.csv'
outlier_hints_q14_path = src_dir + 'hints/output/query_14.csv'
outlier_hints_q18_path = src_dir + 'hints/output/query_18.csv'
outlier_hints_q22_path = src_dir + 'hints/output/query_22.csv'
outlier_hints_q27_path = src_dir + 'hints/output/query_27.csv'
outlier_hints_q35_path = src_dir + 'hints/output/query_35.csv'
outlier_hints_q36_path = src_dir + 'hints/output/query_36.csv'
outlier_hints_q51_path = src_dir + 'hints/output/query_51.csv'
outlier_hints_q67_path = src_dir + 'hints/output/query_67.csv'
outlier_hints_q70_path = src_dir + 'hints/output/query_70.csv'
outlier_hints_q77_path = src_dir + 'hints/output/query_77.csv'
outlier_hints_q80_path = src_dir + 'hints/output/query_80.csv'
outlier_hints_q86_path = src_dir + 'hints/output/query_86.csv'
#
# Category: predicates
outlier_predicates_q5_path = src_dir + 'predicates/output/query_5.csv'
outlier_predicates_q10_path = src_dir + 'predicates/output/query_10.csv'
outlier_predicates_q14_path = src_dir + 'predicates/output/query_14.csv'
outlier_predicates_q18_path = src_dir + 'predicates/output/query_18.csv'
outlier_predicates_q22_path = src_dir + 'predicates/output/query_22.csv'
outlier_predicates_q27_path = src_dir + 'predicates/output/query_27.csv'
outlier_predicates_q35_path = src_dir + 'predicates/output/query_35.csv'
outlier_predicates_q36_path = src_dir + 'predicates/output/query_36.csv'
outlier_predicates_q51_path = src_dir + 'predicates/output/query_51.csv'
outlier_predicates_q67_path = src_dir + 'predicates/output/query_67.csv'
outlier_predicates_q70_path = src_dir + 'predicates/output/query_70.csv'
outlier_predicates_q77_path = src_dir + 'predicates/output/query_77.csv'
outlier_predicates_q80_path = src_dir + 'predicates/output/query_80.csv'
outlier_predicates_q86_path = src_dir + 'predicates/output/query_86.csv'
#
# Category: rownum
outlier_rownum_q5_path = src_dir + 'rownum/output/query_5.csv'
outlier_rownum_q10_path = src_dir + 'rownum/output/query_10.csv'
outlier_rownum_q14_path = src_dir + 'rownum/output/query_14.csv'
outlier_rownum_q18_path = src_dir + 'rownum/output/query_18.csv'
outlier_rownum_q22_path = src_dir + 'rownum/output/query_22.csv'
outlier_rownum_q27_path = src_dir + 'rownum/output/query_27.csv'
outlier_rownum_q35_path = src_dir + 'rownum/output/query_35.csv'
outlier_rownum_q36_path = src_dir + 'rownum/output/query_36.csv'
outlier_rownum_q51_path = src_dir + 'rownum/output/query_51.csv'
outlier_rownum_q67_path = src_dir + 'rownum/output/query_67.csv'
outlier_rownum_q70_path = src_dir + 'rownum/output/query_70.csv'
outlier_rownum_q77_path = src_dir + 'rownum/output/query_77.csv'
outlier_rownum_q80_path = src_dir + 'rownum/output/query_80.csv'
outlier_rownum_q86_path = src_dir + 'rownum/output/query_86.csv'
#
# Read CSV Paths
# dtype=str: every column of every outlier file is loaded as text; numeric
# conversion is attempted later, in the type conversion cell.
#
# Category: hints
outlier_hints_q5_df = pd.read_csv(outlier_hints_q5_path,dtype=str)
outlier_hints_q10_df = pd.read_csv(outlier_hints_q10_path,dtype=str)
outlier_hints_q14_df = pd.read_csv(outlier_hints_q14_path,dtype=str)
outlier_hints_q18_df = pd.read_csv(outlier_hints_q18_path,dtype=str)
outlier_hints_q22_df = pd.read_csv(outlier_hints_q22_path,dtype=str)
outlier_hints_q27_df = pd.read_csv(outlier_hints_q27_path,dtype=str)
outlier_hints_q35_df = pd.read_csv(outlier_hints_q35_path,dtype=str)
outlier_hints_q36_df = pd.read_csv(outlier_hints_q36_path,dtype=str)
outlier_hints_q51_df = pd.read_csv(outlier_hints_q51_path,dtype=str)
outlier_hints_q67_df = pd.read_csv(outlier_hints_q67_path,dtype=str)
outlier_hints_q70_df = pd.read_csv(outlier_hints_q70_path,dtype=str)
outlier_hints_q77_df = pd.read_csv(outlier_hints_q77_path,dtype=str)
outlier_hints_q80_df = pd.read_csv(outlier_hints_q80_path,dtype=str)
outlier_hints_q86_df = pd.read_csv(outlier_hints_q86_path,dtype=str)
#
# Category: predicates
outlier_predicates_q5_df = pd.read_csv(outlier_predicates_q5_path,dtype=str)
outlier_predicates_q10_df = pd.read_csv(outlier_predicates_q10_path,dtype=str)
outlier_predicates_q14_df = pd.read_csv(outlier_predicates_q14_path,dtype=str)
outlier_predicates_q18_df = pd.read_csv(outlier_predicates_q18_path,dtype=str)
outlier_predicates_q22_df = pd.read_csv(outlier_predicates_q22_path,dtype=str)
outlier_predicates_q27_df = pd.read_csv(outlier_predicates_q27_path,dtype=str)
outlier_predicates_q35_df = pd.read_csv(outlier_predicates_q35_path,dtype=str)
outlier_predicates_q36_df = pd.read_csv(outlier_predicates_q36_path,dtype=str)
outlier_predicates_q51_df = pd.read_csv(outlier_predicates_q51_path,dtype=str)
outlier_predicates_q67_df = pd.read_csv(outlier_predicates_q67_path,dtype=str)
outlier_predicates_q70_df = pd.read_csv(outlier_predicates_q70_path,dtype=str)
outlier_predicates_q77_df = pd.read_csv(outlier_predicates_q77_path,dtype=str)
outlier_predicates_q80_df = pd.read_csv(outlier_predicates_q80_path,dtype=str)
outlier_predicates_q86_df = pd.read_csv(outlier_predicates_q86_path,dtype=str)
#
# Category: rownum
outlier_rownum_q5_df = pd.read_csv(outlier_rownum_q5_path,dtype=str)
outlier_rownum_q10_df = pd.read_csv(outlier_rownum_q10_path,dtype=str)
outlier_rownum_q14_df = pd.read_csv(outlier_rownum_q14_path,dtype=str)
outlier_rownum_q18_df = pd.read_csv(outlier_rownum_q18_path,dtype=str)
outlier_rownum_q22_df = pd.read_csv(outlier_rownum_q22_path,dtype=str)
outlier_rownum_q27_df = pd.read_csv(outlier_rownum_q27_path,dtype=str)
outlier_rownum_q35_df = pd.read_csv(outlier_rownum_q35_path,dtype=str)
outlier_rownum_q36_df = pd.read_csv(outlier_rownum_q36_path,dtype=str)
outlier_rownum_q51_df = pd.read_csv(outlier_rownum_q51_path,dtype=str)
outlier_rownum_q67_df = pd.read_csv(outlier_rownum_q67_path,dtype=str)
outlier_rownum_q70_df = pd.read_csv(outlier_rownum_q70_path,dtype=str)
outlier_rownum_q77_df = pd.read_csv(outlier_rownum_q77_path,dtype=str)
outlier_rownum_q80_df = pd.read_csv(outlier_rownum_q80_path,dtype=str)
outlier_rownum_q86_df = pd.read_csv(outlier_rownum_q86_path,dtype=str)
#
# Merge dataframes into a single pandas matrix
# Each variant category is stacked with a single concat call over all of its
# per-query frames (in query order), instead of re-copying the growing result
# once per query. Row order, row index and column order are unchanged.
df_hints_outliers = pd.concat([outlier_hints_q5_df,
                               outlier_hints_q10_df,
                               outlier_hints_q14_df,
                               outlier_hints_q18_df,
                               outlier_hints_q22_df,
                               outlier_hints_q27_df,
                               outlier_hints_q35_df,
                               outlier_hints_q36_df,
                               outlier_hints_q51_df,
                               outlier_hints_q67_df,
                               outlier_hints_q70_df,
                               outlier_hints_q77_df,
                               outlier_hints_q80_df,
                               outlier_hints_q86_df], sort=False)
#
df_predicate_outliers = pd.concat([outlier_predicates_q5_df,
                                   outlier_predicates_q10_df,
                                   outlier_predicates_q14_df,
                                   outlier_predicates_q18_df,
                                   outlier_predicates_q22_df,
                                   outlier_predicates_q27_df,
                                   outlier_predicates_q35_df,
                                   outlier_predicates_q36_df,
                                   outlier_predicates_q51_df,
                                   outlier_predicates_q67_df,
                                   outlier_predicates_q70_df,
                                   outlier_predicates_q77_df,
                                   outlier_predicates_q80_df,
                                   outlier_predicates_q86_df], sort=False)
#
df_rownum_outliers = pd.concat([outlier_rownum_q5_df,
                                outlier_rownum_q10_df,
                                outlier_rownum_q14_df,
                                outlier_rownum_q18_df,
                                outlier_rownum_q22_df,
                                outlier_rownum_q27_df,
                                outlier_rownum_q35_df,
                                outlier_rownum_q36_df,
                                outlier_rownum_q51_df,
                                outlier_rownum_q67_df,
                                outlier_rownum_q70_df,
                                outlier_rownum_q77_df,
                                outlier_rownum_q80_df,
                                outlier_rownum_q86_df], sort=False)
#
# Report the size and first rows of each merged category
print(df_hints_outliers.shape)
print(df_hints_outliers.head())
print('------------------------------------------')
print(df_predicate_outliers.shape)
print(df_predicate_outliers.head())
print('------------------------------------------')
print(df_rownum_outliers.shape)
print(df_rownum_outliers.head())

(465, 35)
  PLAN_ID            TIMESTAMP REMARKS         OPERATION          OPTIONS  \
0   12354  11/20/2018 08:23:55     NaN  SELECT STATEMENT              NaN   
1   12354  11/20/2018 08:23:55     NaN             COUNT          STOPKEY   
2   12354  11/20/2018 08:23:55     NaN              VIEW              NaN   
3   12354  11/20/2018 08:23:55     NaN              SORT  GROUP BY ROLLUP   
4   12354  11/20/2018 08:23:55     NaN              VIEW              NaN   

  OBJECT_NODE OBJECT_OWNER OBJECT_NAME                OBJECT_ALIAS  \
0         NaN          NaN         NaN                         NaN   
1         NaN          NaN         NaN                         NaN   
2         NaN       TPCDS1         NaN  from$_subquery$_018@SEL$11   
3         NaN          NaN         NaN                         NaN   
4         NaN       TPCDS1         NaN                    X@SEL$12   

  OBJECT_INSTANCE  ...                                          OTHER_XML  \
0             NaN  ...       

### Dealing with empty values

In [14]:
def get_na_columns(df, headers):
    """
    Lists the headers whose column holds at least one null value.

    Headers are checked in the order given, and the returned list keeps
    that order.
    """
    return [head for head in headers if df[head].isnull().values.any()]
#
print('N/A Columns\n')
# One report line per dataframe: feature count, then the columns containing nulls
for banner, frame in (('\nREP_VSQL_PLAN Features ', rep_vsql_plan_df),
                      ('\nDF_HINT_OUTLIERS Features ', df_hints_outliers),
                      ('\nDF_PREDICATE_OUTLIERS Features ', df_predicate_outliers),
                      ('\nDF_ROWNUM_OUTLIERS Features ', df_rownum_outliers)):
    na_cols = get_na_columns(df=frame, headers=frame.columns)
    print(banner + str(len(frame.columns)) + ': ' + str(na_cols) + "\n")
#
def fill_na(df):
    """
    Returns the dataframe with every missing value replaced by 0.
    """
    zero_filled = df.fillna(value=0)
    return zero_filled
#
# Populating NaN values with amount '0'
# The filled baseline frame is bound to the new name `df`; rep_vsql_plan_df
# itself is left untouched. The outlier frames are rebound to their filled versions.
df = fill_na(df=rep_vsql_plan_df)
df_hints_outliers = fill_na(df=df_hints_outliers)
df_predicate_outliers = fill_na(df=df_predicate_outliers)
df_rownum_outliers = fill_na(df=df_rownum_outliers)

N/A Columns


REP_VSQL_PLAN Features 39: ['OPTIONS', 'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'COST', 'CARDINALITY', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'DISTRIBUTION', 'IO_COST', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME', 'REMARKS', 'OTHER_XML']


DF_HINT_OUTLIERS Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'OPTIMIZER', 'SEARCH_COLUMNS', 'PARENT_ID', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'OTHER_XML', 'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME']


DF_PREDICATE_OUTLIERS Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'O

### Type conversion

Each column that is not listed in `black_list` is converted to the `int64` type. Columns which cannot be converted are dropped.

In [15]:
def handle_numeric_overflows(x):
    """
    Accepts a single cell value, and returns it unchanged if it can be stored
    as int64. Otherwise the maximum int64 value is returned in its place.

    The value is probed by building a one-cell pandas DataFrame with dtype
    'int64'. The probe result itself is discarded; only its success matters.

    :param: x - Single value (one cell of a dataframe column)

    :return: x itself when the probe succeeds, else 9223372036854775807
    """
    try:
        pd.DataFrame([x], dtype='int64')
    except (ValueError, OverflowError):
        # OverflowError is raised for integers too large for int64, which is
        # exactly the case this function exists to absorb.
        return 9223372036854775807 # Max int size
    return x
#
# Convert every non-blacklisted column of the baseline dataframe to int64.
for col in df.columns:
    if col in black_list:
        continue
    try:
        # Cells which cannot be held as int64 are first replaced with the int64 maximum
        df[col] = df[col].apply(handle_numeric_overflows)
        # astype returns a new Series (it has no in-place mode), so the
        # converted column must be assigned back for the conversion to stick.
        df[col] = df[col].astype('int64')
    except Exception:
        # Best effort: a column which still cannot be converted is removed
        df.drop(columns=col, inplace=True)
        print('Dropped column [' + col + ']')
#
print('-------------------------------------------------------------')
#
def _coerce_outlier_columns(frame):
    """
    Converts, in place, every non-blacklisted column of an outlier dataframe to int64.

    A column whose conversion overflows has its offending cells replaced via
    handle_numeric_overflows and is converted again. A column which fails for
    any other reason is dropped from the dataframe and reported.

    :param: frame - Pandas Dataframe (modified in place)
    """
    for col in frame.columns:
        if col in black_list:
            continue
        try:
            frame[col] = frame[col].astype('int64')
        except OverflowError:
            #
            # Handles numeric overflow conversions by replacing such values with the int64 max value.
            frame[col] = frame[col].apply(handle_numeric_overflows)
            frame[col] = frame[col].astype('int64')
        except Exception:
            frame.drop(columns=col, inplace=True)
            print('Dropped column [' + col + ']')
#
# Same conversion for each outlier category, with a separator after each one
for outlier_frame in (df_hints_outliers, df_predicate_outliers, df_rownum_outliers):
    _coerce_outlier_columns(outlier_frame)
    print('-------------------------------------------------------------')
#
# Surviving columns of all four dataframes
print(df.columns)
print(df_hints_outliers.columns)
print(df_predicate_outliers.columns)
print(df_rownum_outliers.columns)

-------------------------------------------------------------
Dropped column [OBJECT_ALIAS]
Dropped column [OBJECT_TYPE]
Dropped column [OPTIMIZER]
Dropped column [OTHER_XML]
Dropped column [ACCESS_PREDICATES]
Dropped column [FILTER_PREDICATES]
Dropped column [PROJECTION]
Dropped column [QBLOCK_NAME]
-------------------------------------------------------------
Dropped column [OBJECT_ALIAS]
Dropped column [OBJECT_TYPE]
Dropped column [OPTIMIZER]
Dropped column [OTHER_XML]
Dropped column [ACCESS_PREDICATES]
Dropped column [FILTER_PREDICATES]
Dropped column [PROJECTION]
Dropped column [QBLOCK_NAME]
-------------------------------------------------------------
Dropped column [OBJECT_ALIAS]
Dropped column [OBJECT_TYPE]
Dropped column [OPTIMIZER]
Dropped column [OTHER_XML]
Dropped column [ACCESS_PREDICATES]
Dropped column [FILTER_PREDICATES]
Dropped column [PROJECTION]
Dropped column [QBLOCK_NAME]
-------------------------------------------------------------
Index(['DBID', 'SQL_ID', 'PLAN_H

### Feature Elimination

In this step, redundant features are dropped. Features are considered redundant if they exhibit a standard deviation of 0 (meaning no change in value).

In [16]:
def drop_flatline_columns(df):
    """
    Drops every column whose standard deviation is exactly 0.

    Columns named in the module-level black_list are never examined. A column
    whose standard deviation cannot be computed or compared is kept.

    :param: df - Pandas Dataframe

    :return: Pandas Dataframe without the flatline columns (the input
             dataframe itself is not modified)
    """
    flatline_features = []
    for column in df.columns:
        if column in black_list:
            continue
        try:
            if df[column].std() == 0:
                flatline_features.append(column)
        except Exception:
            # Best effort: std() is not defined for every column type, so such
            # columns are kept. Unlike a bare except, this lets
            # KeyboardInterrupt / SystemExit through.
            continue
    #
    print('\nShape before changes: [' + str(df.shape) + ']')
    df = df.drop(columns=flatline_features)
    print('Shape after changes: [' + str(df.shape) + ']')
    print('Dropped a total [' + str(len(flatline_features)) + ']')
    return df
#
# Remove flatline columns from the baseline and from each outlier category
df = drop_flatline_columns(df=df)
df_hints_outliers = drop_flatline_columns(df=df_hints_outliers)
df_predicate_outliers = drop_flatline_columns(df=df_predicate_outliers)
df_rownum_outliers = drop_flatline_columns(df=df_rownum_outliers)
#
# Report shape and remaining columns per dataframe, separated by a dashed line
report = [('\nAfter flatline column drop:', df),
          ('\nAfter outlier flatline column drop [df_hints_outliers]:', df_hints_outliers),
          ('\nAfter outlier flatline column drop [df_predicate_outliers]:', df_predicate_outliers),
          ('\nAfter outlier flatline column drop [df_rownum_outliers]:', df_rownum_outliers)]
for position, (caption, frame) in enumerate(report):
    if position:
        print('--------------------------------------------------------')
    print(caption)
    print(frame.shape)
    print(frame.columns)


Shape before changes: [(10000, 39)]
Shape after changes: [(10000, 30)]
Dropped a total [9]

Shape before changes: [(465, 27)]
Shape after changes: [(465, 21)]
Dropped a total [6]

Shape before changes: [(491, 27)]
Shape after changes: [(491, 21)]
Dropped a total [6]

Shape before changes: [(500, 27)]
Shape after changes: [(500, 21)]
Dropped a total [6]

After flatline column drop:
(10000, 30)
Index(['SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'DEPTH', 'POSITION',
       'SEARCH_COLUMNS', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG',
       'PARTITION_START', 'PARTITION_STOP', 'DISTRIBUTION', 'CPU_COST',
       'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME', 'TIMESTAMP',
       'OTHER_XML'],
      dtype='object')
--------------------------------------------------------

After outlier flatline column drop [df_hints_outliers]:
(465, 21)
Index(['PLA

### Scaling columns

This section passes a number of data columns through a MinMax scaler. This is done to normalize the data onto a common scale, particularly before comparing column measurements using a Euclidean-based measure. The following columns are targeted:

* CARDINALITY
* BYTES
* PARTITION_START
* PARTITION_STOP
* CPU_COST
* IO_COST
* TEMP_SPACE
* TIME

In [17]:
# The scaler is fitted on, and applied to, the baseline dataframe `df` only.
scaler = preprocessing.MinMaxScaler()
scaled_columns = ['CARDINALITY',
                'BYTES',
                'PARTITION_START',
                'PARTITION_STOP',
                'CPU_COST',
                'IO_COST',
                'TEMP_SPACE',
                'TIME']
# Print one sample value before and after scaling as a sanity check
print(df['PARTITION_START'].iloc[0])
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])
print(df['PARTITION_START'].iloc[0])
# Per-column minimum / maximum seen during fitting, in scaled_columns order
print("Minimal Vector Points: " + str(scaler.data_min_))
print("Maximal Vector Points: " + str(scaler.data_max_))
#
print('\nAfter scaled column transformation:')
print(df.shape)
print(df.columns)
#
# NOTE(review): the three outlier dataframes are only printed below; no scaler
# call touches them in this cell, although the messages mention a
# transformation - confirm whether they are scaled elsewhere.
print('--------------------------------------------------------')
print('\nAfter outlier scaled column transformation [df_hints_outliers]:')
print(df_hints_outliers.shape)
print(df_hints_outliers.columns)
#
print('--------------------------------------------------------')
print('\nAfter outlier scaled column transformation [df_predicate_outliers]:')
print(df_predicate_outliers.shape)
print(df_predicate_outliers.columns)
#
print('--------------------------------------------------------')
print('\nAfter outlier scaled column transformation [df_rownum_outliers]:')
print(df_rownum_outliers.shape)
print(df_rownum_outliers.columns)

0.0
0.0
Minimal Vector Points: [0.000e+00 2.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.156e+06
 0.000e+00]
Maximal Vector Points: [2.85716704e+16 9.22337204e+18 0.00000000e+00 0.00000000e+00
 9.22337204e+18 7.78425143e+09 9.22337204e+18 1.58287675e+08]

After scaled column transformation:
(10000, 30)
Index(['SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'DEPTH', 'POSITION',
       'SEARCH_COLUMNS', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG',
       'PARTITION_START', 'PARTITION_STOP', 'DISTRIBUTION', 'CPU_COST',
       'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME', 'TIMESTAMP',
       'OTHER_XML'],
      dtype='object')
--------------------------------------------------------

After outlier scaled column transformation [df_hints_outliers]:
(465, 21)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME'

### Adding Grouping Column

An extra column is added to allow access plans to be isolated per instance

In [18]:
#
# Adds a columns per SQL_ID, PLAN_HASH_VALUE grouping, which can be used to group instances together
def add_grouping_column(df, column_identifier):
    """
    Receives a pandas dataframe, and adds a new 'PLAN_INSTANCE' column which
    numbers consecutive runs of equal values in the identifier column, so that
    the dataframe can be aggregated per plan instance.

    Rows are labelled in their existing order. The first row receives 1, and
    the label is incremented every time the identifier differs from that of
    the previous row. An identifier which reappears after a different one
    therefore receives a new label.

    :param: df                - Pandas Dataframe (the column is added in place)
    :param: column_identifier - String denoting matrix column to group by
                                (e.g. 'SQL_ID' or 'PLAN_ID'; any existing column works)

    :return: Pandas Dataframe, with added column

    :raises: KeyError if column_identifier is not a column of df
    """
    print('Shape before transformation: ' + str(df.shape))
    identifiers = df[column_identifier]
    #
    # A row opens a new instance when its identifier differs from the row above.
    # The first row is compared against the NaN shifted in above it, so it always
    # opens instance 1 (the same labels the previous row-by-row loop produced).
    opens_new_instance = identifiers.ne(identifiers.shift())
    #
    # Running count of opened instances = instance label of each row
    df['PLAN_INSTANCE'] = opens_new_instance.cumsum().astype('int64').values
    print('Shape after transformation: ' + str(df.shape))
    return df
#
# The baseline frame is grouped on SQL_ID, the outlier frames on PLAN_ID
df = add_grouping_column(df=df,column_identifier='SQL_ID')
df_hints_outliers = add_grouping_column(df=df_hints_outliers,column_identifier='PLAN_ID')
df_predicate_outliers = add_grouping_column(df=df_predicate_outliers,column_identifier='PLAN_ID')
df_rownum_outliers = add_grouping_column(df=df_rownum_outliers,column_identifier='PLAN_ID')

Shape before transformation: (10000, 30)
Shape after transformation: (10000, 31)
Shape before transformation: (465, 21)
Shape after transformation: (465, 22)
Shape before transformation: (491, 21)
Shape after transformation: (491, 22)
Shape before transformation: (500, 21)
Shape after transformation: (500, 22)


### Tree Formatting

Constructs the tree plan structure

In [21]:
class PlanTreeModeller:
    """
    This class simulates an access plan in the form of a tree structure.
    
    All members are static: a plan dataframe is turned into linked nodes with 'build_tree', printed with
    'render_tree', and two plans are compared node by node with 'tree_compare'.
    """
    
    @staticmethod
    def __create_node(node_name, parent=None):
        """
        Builds a node which will be added to the tree. If the parent is 'None', the node is created
        without a parent, i.e. it will be used as the root/parent Node.
        
        :param: node_name - Node name (build_tree passes the integer plan row ID).
        :param: parent    - Parent node to attach the new node to, or None.
        
        :return: anytree object
        :raises: ValueError if node_name is None
        """
        if node_name is None:
            raise ValueError('Node name was not specified!')
        
        # anytree's Node defaults to parent=None, so one call covers both the root and child case
        return Node(node_name, parent=parent)
    
    @staticmethod
    def build_tree(df):
        """
        This method receives a pandas dataframe, and converts it into a searchable python tree.
        
        Rows are processed in dataframe order, and every non-root row looks up its parent among the 
        nodes built so far. A parent row must therefore appear before its children, otherwise a 
        KeyError is raised. The row with ID 0 is treated as the root.
        
        :param: df - Pandas Dataframe, pertaining to input access plan ('ID' and 'PARENT_ID' columns)
        
        :return: Dictionary object, keyed by row ID, consisting of node objects (which are linked in a 
                 tree fashion)
        """
        node_dict = {}
        for _, row in df.iterrows():
            
            row_id = int(row['ID'])
            
            if row_id == 0:
                # The root has no parent; its PARENT_ID is deliberately left unread, so that a
                # missing (NaN) value on the root row can not break the int conversion
                node = PlanTreeModeller.__create_node(node_name=row_id)
            else:
                parent_node = node_dict[int(row['PARENT_ID'])]
                node = PlanTreeModeller.__create_node(node_name=row_id, parent=parent_node)
            node_dict[row_id] = node
        
        return node_dict # Dictionary consisting of tree nodes
    
    @staticmethod
    def __retrieve_plan_details(df, node_name):
        """
        Accepts a dataframe, and the node_name. Retrieves features pertaining to the row id in the access plan.
        If several rows share the same ID, the first one is used. An IndexError is raised if no row matches.
        
        :param: df        - Dataframe consisting of access plan features
        :param: node_name - Row ID denoting which row to retrieve from the parameter dataframe
        
        :return: Dictionary consisting of access plan attributes
        """
        # Filter once, and read every attribute from the same (small) selection
        node_rows = df[df['ID'] == node_name]
        
        def first_value(column):
            # Per column access keeps the column's own dtype intact
            return node_rows[column].iloc[0]
        
        operation = str(first_value('OPERATION'))
        options = str(first_value('OPTIONS'))
        object_name = str(first_value('OBJECT_NAME'))
        cardinality = int(first_value('CARDINALITY'))
        bytess = int(first_value('BYTES'))
        partition_delta = int(first_value('PARTITION_STOP')) - int(first_value('PARTITION_START'))
        cpu_cost = int(first_value('CPU_COST'))
        io_cost = int(first_value('IO_COST'))
        temp_space = int(first_value('TEMP_SPACE'))
        time = int(first_value('TIME'))
        
        return {'OPERATION':operation,
                'OPTIONS':options,
                'OBJECT_NAME':object_name,
                'CARDINALITY':cardinality,
                'BYTES':bytess,
                'PARTITION_DELTA':partition_delta,
                'CPU_COST':cpu_cost,
                'IO_COST':io_cost,
                'TEMP_SPACE':temp_space,
                'TIME':time}
    
    @staticmethod
    def __tree_node_euclidean(tree_dict1, tree_dict2):
        """
        This method calculates the euclidean distance between the numeric features of two plan nodes.
        
        :param: tree_dict1 - Dictionary denoting a single node within plan / tree 1
        :param: tree_dict2 - Dictionary denoting a single node within plan / tree 2
        
        :return: Scalar denoting the euclidean distance between both nodes
        """
        metric_keys = ('CARDINALITY',
                       'BYTES',
                       'PARTITION_DELTA',
                       'CPU_COST',
                       'IO_COST',
                       'TEMP_SPACE',
                       'TIME')
        
        tree_vector_1 = [tree_dict1[key] for key in metric_keys]
        tree_vector_2 = [tree_dict2[key] for key in metric_keys]
        
        # NOTE(review): features are compared unscaled, and sklearn documents that euclidean_distances
        # favours speed over precision - confirm this is acceptable for very large CPU_COST values.
        euc_distance = euclidean_distances([tree_vector_1],[tree_vector_2])
        return euc_distance[0][0] # Single pair of vectors, hence a 1x1 matrix
    
    @staticmethod
    def render_tree(tree, df):
        """
        Renders Tree by printing to screen. Each line shows the node id and its operation, followed by
        the options and object name when these are set.
        
        :param: tree - AnyTree object, representing tree modelled access plan
        :param: df   - Pandas dataframe representative of the access plan about to be rendered
        
        :return: None
        """
        for pre, fill, node in RenderTree(tree):
            
            access_plan_dict = PlanTreeModeller.__retrieve_plan_details(df=df,
                                                                        node_name=node.name)
            
            # '0' is treated as "not set" (presumably missing values were filled with 0 upstream)
            if access_plan_dict['OBJECT_NAME'] == '0':
                print("%s%s > %s" % (pre, node.name, access_plan_dict['OPERATION']))
            elif access_plan_dict['OPTIONS'] == '0':
                print("%s%s > %s (%s)" % (pre, node.name, access_plan_dict['OPERATION'], access_plan_dict['OBJECT_NAME']))
            else:
                print("%s%s > %s | %s (%s)" % (pre, node.name, access_plan_dict['OPERATION'], access_plan_dict['OPTIONS'], access_plan_dict['OBJECT_NAME']))
    
    @staticmethod
    def __postorder(tree):
        """
        Accepts a tree, and iterates in post order fashion (children first, root last), starting from
        the node stored under key 0.
        
        :param: tree - Dictionary consisting of AnyTree Nodes
        
        :return: List consisting of tree traversal order (node names)
        """
        post_order_traversal = [node.name for node in PostOrderIter(tree[0])]
        return post_order_traversal
    # 
    @staticmethod
    def tree_compare(tree1, tree2, df1, df2):
        """
        Accepts two trees of type 'AnyTree', along with respective dataframe denoting each respective access
        path. Both trees are walked in post order, and nodes found at the same traversal position are 
        compared. The first structural difference (operation, options or object name) is printed, and the
        euclidean distance of every compared node pair is summed into a total delta score, which is printed.
        
        Only positions present in both traversals are compared: if one plan has more nodes than the 
        other, the surplus nodes of the larger plan are ignored.
        
        :param: tree1 - Dictionary consisting of 'AnyTree' nodes, belonging to tree 1
        :param: tree2 - Dictionary consisting of 'AnyTree' nodes, belonging to tree 2
        :param: df1   - Pandas dataframe consisting of access plan instructions opted for by tree 1
        :param: df2   - Pandas dataframe consisting of access plan instructions opted for by tree 2
        
        :return: None
        """
        
        # Retrieves traversal order for both trees
        post_order_traversal1 = PlanTreeModeller.__postorder(tree1)
        post_order_traversal2 = PlanTreeModeller.__postorder(tree2)
        
        delta_flag = True # Remains True until the first structural difference is reported
        euclidean_measure = []
        
        # zip stops at the shorter traversal, which avoids indexing past the end of the smaller plan
        for id_1, id_2 in zip(post_order_traversal1, post_order_traversal2):
            
            pd_tree1 = PlanTreeModeller.__retrieve_plan_details(df=df1, node_name=id_1)
            pd_tree2 = PlanTreeModeller.__retrieve_plan_details(df=df2, node_name=id_2)
            
            nodes_differ = (pd_tree1['OPERATION'] != pd_tree2['OPERATION']
                            or pd_tree1['OBJECT_NAME'] != pd_tree2['OBJECT_NAME']
                            or pd_tree1['OPTIONS'] != pd_tree2['OPTIONS'])
            
            # Only the first difference is reported, the remaining nodes solely contribute to the score
            if delta_flag and nodes_differ:
                print('Access Predicate Difference detected!')
                print('Tree 1 difference at node [' + str(id_1) + '] operator > ' + pd_tree1['OPERATION'] + '(' + pd_tree1['OPTIONS'] + ') on object [' + pd_tree1['OBJECT_NAME'] + ']')
                print('Tree 2 difference at node [' + str(id_2) + '] operator > ' + pd_tree2['OPERATION'] + '(' + pd_tree2['OPTIONS'] + ') on object [' + pd_tree2['OBJECT_NAME'] + ']')
                delta_flag = False
            
            # Calculate Node Euclidean Measure
            euclidean_vector = PlanTreeModeller.__tree_node_euclidean(tree_dict1=pd_tree1,
                                                                      tree_dict2=pd_tree2)
            euclidean_measure.append(euclidean_vector)
        
        if delta_flag:
            print('No plan differences detected.')
        
        print('Total computed delta score [' + str(sum(euclidean_measure)) + ']')

## Stream Comparison with Hint Based Outliers

Compares the expected stream with the variation stream. The variations in this stream are produced by injecting SQL optimizer hints, which purposely skew the resulting plan.

In [20]:
# Collect the distinct SQL_ID, PLAN_HASH_VALUE and PLAN_INSTANCE values found in the expected stream,
# and display each set together with its container type
np_sql_id = pd.unique(df['SQL_ID'])
np_plan_hash_value = pd.unique(df['PLAN_HASH_VALUE'])
np_plan_instance = pd.unique(df['PLAN_INSTANCE'])
#
print(np_sql_id)
print(type(np_sql_id))
#
print(np_plan_hash_value)
print(type(np_plan_hash_value))
#
print(np_plan_instance)
print(type(np_plan_instance))
#
print('-' * 100)

['dxv968j0352kb' '0aq14dznn91rg' '4q1rzhn63sgpt' 'g0b4snpj74cv5'
 '03ggjrmy0wa1w' '93n8wp5a8xyxn' 'gdh0vpjkmbtjw' '1r7b985mxqj71'
 '1jhyrdp21f2q6' 'bwsf4tnh0gcgv' '29mjaymwt5p6d' 'aggcw7yk1a7s6'
 '8t26unxsrxj72' '71uursqtj1j2m' '59zh1b9759nf4' '0f60bzgt9127c'
 '9vmcsc3prvxpa' '76ds5wxsv7f5t' '20bqsr6btd9x9' '84ntdbh48ctu9'
 'fx7sjdj48pn6z' '2wuhkcaz4uhs5' '2pz0tqbv91m11' '1u97hwfu7dcmz'
 '39nyc1pykjg41' '9ffht8tuysgx9' '7fbzhzg6ysu25' '14f5ngrj3cc5h'
 '4u268zn6r57tm' '6zcux9jb78w36' '2hnpu9m861609' '33vw5865cwyyn'
 'gvcadr1hm4arv' '1pv23p59mjs0v' '7vtvbg7s3zcyp' '6fvfqaw68q59b'
 '06dymzb481vnd' '6u001adh62r0f' 'cqrk7q7f6nk3d' 'fc0va0vju750z'
 'a6g97rawd3ggv' 'd6vqfmt62hypx' 'c1y1djkqjcd88' 'd7w1dugmzb9n9'
 '87gtj5jaq4a3t' '85cmvvurya34f' 'gh5w0gcyfaujs' '0ga8vk4nftz45'
 '13a9r2xkx1bxb' 'cjq93m442uprp' '7w116jy6ysqpm' 'au8ztarrm6vvs'
 '5g88vmdgd99f7' 'gkjkxbzzptg00' '4cgbvpjc134nu' 'a6fy23us0jz84'
 '64wct1dn3b771' 'fs2q92rb37cb0' 'cw6vxf0kbz3v1' 'dyk4dprp70d74'
 'd6vwqbw6r2ffk' '7hys3h7