# Schedule Access Plan Recommendation

This notebook is dedicated to model fitting in terms of database access plans.

In [1]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# sklearn
import sklearn as sk
from sklearn import preprocessing
print('sklearn: %s' % sk.__version__)

pandas: 0.23.4
numpy: 1.15.2
sklearn: 0.19.0


In [2]:
#
# Experiment Config
# Schema upon which the test operates; the name is spliced into the input file paths.
tpcds = 'TPCDS1'
# Plan columns treated as numeric targets further down the notebook.
y_labels = [
    'COST', 'CARDINALITY', 'BYTES',
    'CPU_COST', 'IO_COST',
    'TEMP_SPACE', 'TIME',
]

### Read data from file into pandas dataframes

In [3]:
# Location of the REP_VSQL_PLAN extract for the configured schema.
# Alternative location: 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/v2/rep_vsql_plan.csv'
rep_vsql_plan_path = (
    'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/'
    + tpcds
    + '/v2/rep_vsql_plan.csv'
)
# Every column is read as text (dtype=str); nothing is parsed as a number here.
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path, dtype=str)
print(rep_vsql_plan_df.head())
#
def prettify_header(headers):
    """
    Cleans header list from unwanted character strings.

    Every "(", ")", "'" and "," character is removed from each header, so a
    raw header such as "('SQL_ID',)" becomes "SQL_ID".

    :param headers: iterable of header strings
    :return: new list holding the cleaned headers, in the same order
    """
    # One translation table deletes all four characters in a single pass per header.
    strip_table = str.maketrans("", "", "()',")
    return [header.translate(strip_table) for header in headers]
#
# Swap the raw headers for their cleaned form, then show the resulting column index.
rep_vsql_plan_df.columns = prettify_header(rep_vsql_plan_df.columns.values)
print('------------------------------------------', rep_vsql_plan_df.columns, sep='\n')

     ('SQL_ID',) ('PLAN_HASH_VALUE',) ('ID',)    ('OPERATION',) ('OPTIONS',)  \
0  dxv968j0352kb            103598129       0  SELECT STATEMENT          NaN   
1  dxv968j0352kb            103598129       1              SORT     GROUP BY   
2  dxv968j0352kb            103598129       2    PX COORDINATOR          NaN   
3  dxv968j0352kb            103598129       3           PX SEND  QC (RANDOM)   
4  dxv968j0352kb            103598129       4              SORT     GROUP BY   

  ('OBJECT_NODE',) ('OBJECT_OWNER',) ('OBJECT_NAME',) ('OBJECT_ALIAS',)  \
0              NaN               NaN              NaN               NaN   
1              NaN               NaN              NaN               NaN   
2              NaN               NaN              NaN               NaN   
3           :Q1001               SYS         :TQ10001               NaN   
4           :Q1001               NaN              NaN               NaN   

  ('OBJECT_TYPE',)         ...          ('SEARCH_COLUMNS',) ('COST',

### Read outlier data from file into pandas dataframes and concatenate

In [4]:
#
# CSV Outlier Paths
# One CSV exists per (variant, query number). The lists below fix the order in
# which the files are read and stacked: all 'hints' files, then 'predicates',
# then 'rownum', each in ascending query order.
# NOTE(review): the root is an absolute, machine-specific path.
outlier_root = 'C:/Users/gabriel.sammut/University/ICS5200/src/sql/Runtime/TPC-DS/' + tpcds + '/Variants/'
outlier_variants = ['hints', 'predicates', 'rownum']
outlier_queries = [5, 10, 14, 18, 22, 27, 35, 36, 51, 67, 70, 77, 80, 86]
outlier_keys = [(variant, query) for variant in outlier_variants for query in outlier_queries]
#
# CSV Outlier Paths, keyed by (variant, query number)
outlier_paths = {
    (variant, query): outlier_root + variant + '/output/query_' + str(query) + '.csv'
    for (variant, query) in outlier_keys
}
#
# Read CSV Paths (every column as text); a single frame is reachable as
# outlier_dfs[('hints', 5)], outlier_dfs[('rownum', 86)], ...
outlier_dfs = {key: pd.read_csv(outlier_paths[key], dtype=str) for key in outlier_keys}
#
# Merge dataframes into a single pandas matrix with one concat call, so the
# accumulated frame is not re-copied once per file.
df_outliers = pd.concat([outlier_dfs[key] for key in outlier_keys], sort=False)
#
print(df_outliers.shape)
print(df_outliers.head())
print('------------------------------------------')
print(df_outliers.columns)

(1456, 35)
  PLAN_ID            TIMESTAMP REMARKS         OPERATION          OPTIONS  \
0   12354  11/20/2018 08:23:55     NaN  SELECT STATEMENT              NaN   
1   12354  11/20/2018 08:23:55     NaN             COUNT          STOPKEY   
2   12354  11/20/2018 08:23:55     NaN              VIEW              NaN   
3   12354  11/20/2018 08:23:55     NaN              SORT  GROUP BY ROLLUP   
4   12354  11/20/2018 08:23:55     NaN              VIEW              NaN   

  OBJECT_NODE OBJECT_OWNER OBJECT_NAME                OBJECT_ALIAS  \
0         NaN          NaN         NaN                         NaN   
1         NaN          NaN         NaN                         NaN   
2         NaN       TPCDS1         NaN  from$_subquery$_018@SEL$11   
3         NaN          NaN         NaN                         NaN   
4         NaN       TPCDS1         NaN                    X@SEL$12   

  OBJECT_INSTANCE     ...      \
0             NaN     ...       
1             NaN     ...       
2     

### Dealing with empty values

In [5]:
def get_na_columns(df, headers):
    """
    Lists the columns of df that contain at least one NaN/null value.

    :param df: pandas DataFrame to inspect
    :param headers: column names to check, in reporting order
    :return: list of the names from headers whose column holds a null
    """
    return [name for name in headers if df[name].isnull().values.any()]
#
# For each dataset: number of features, followed by the columns that contain nulls.
print('N/A Columns\n')
for report_label, report_frame in (('\nREP_VSQL_PLAN Features ', rep_vsql_plan_df),
                                   ('\nDF_OUTLIERS Features ', df_outliers)):
    na_columns = get_na_columns(df=report_frame, headers=report_frame.columns)
    print(report_label + str(len(report_frame.columns)) + ': ' + str(na_columns) + "\n")
#
def fill_na(df):
    """
    Returns a copy of df in which every NaN/null cell holds 0.

    :param df: pandas DataFrame (left unmodified)
    :return: new DataFrame with nulls replaced by the integer 0
    """
    zero_filled = df.fillna(value=0)
    return zero_filled
#
# Populating NaN values with amount '0'
# The plan data continues under the name `df`; the outlier frame keeps its name.
df_outliers = fill_na(df_outliers)
df = fill_na(rep_vsql_plan_df)

N/A Columns


REP_VSQL_PLAN Features 22: ['OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'COST', 'CARDINALITY', 'BYTES', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME']


DF_OUTLIERS Features 35: ['REMARKS', 'OPTIONS', 'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE', 'OPTIMIZER', 'SEARCH_COLUMNS', 'PARENT_ID', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'OTHER_XML', 'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME']



### Feature Selection

In this step, redundant features are dropped. A feature is considered redundant if it exhibits a standard deviation of 0 (meaning its value never changes).

In [6]:
def drop_flatline_columns(df):
    """
    Drops every column whose standard deviation is exactly 0 (a constant column).

    Columns whose standard deviation cannot be computed (for example columns
    holding non-numeric text) are kept. The shape before and after the drop and
    the number of dropped columns are printed.

    :param df: pandas DataFrame to filter (not modified)
    :return: new DataFrame without the flatline columns
    """
    flatline_features = []
    for column in df.columns:
        try:
            if df[column].std() == 0:
                flatline_features.append(column)
        except (TypeError, ValueError):
            # std() is undefined for this column (e.g. text values), so it is
            # kept. Only these expected errors are skipped; interrupts and
            # unrelated failures still surface.
            continue
    #
    print('\nShape before changes: [' + str(df.shape) + ']')
    df = df.drop(columns=flatline_features)
    print('Shape after changes: [' + str(df.shape) + ']')
    print('Dropped a total [' + str(len(flatline_features)) + ']')
    return df
#
# Remove constant columns from both datasets; each call prints its own shape summary.
df = drop_flatline_columns(df)
df_outliers = drop_flatline_columns(df_outliers)
#
# Resulting shape and column index of the plan data ...
print('\nAfter flatline column drop:', df.shape, df.columns, sep='\n')
#
# ... and of the outlier data.
print('--------------------------------------------------------',
      '\nAfter outlier flatline column drop:',
      df_outliers.shape,
      df_outliers.columns,
      sep='\n')


Shape before changes: [(98794, 22)]
Shape after changes: [(98794, 22)]
Dropped a total [0]

Shape before changes: [(1456, 35)]
Shape after changes: [(1456, 27)]
Dropped a total [8]

After flatline column drop:
(98794, 22)
Index(['SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'DEPTH', 'SEARCH_COLUMNS', 'COST',
       'CARDINALITY', 'BYTES', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME',
       'QBLOCK_NAME', 'TIMESTAMP'],
      dtype='object')
--------------------------------------------------------

After outlier flatline column drop:
(1456, 27)
Index(['PLAN_ID', 'TIMESTAMP', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_INSTANCE', 'OBJECT_TYPE',
       'OPTIMIZER', 'SEARCH_COLUMNS', 'ID', 'PARENT_ID', 'DEPTH', 'POSITION',
       'COST', 'CARDINALITY', 'BYTES', 'OTHER_XML', 'CPU_COST', 'IO_COST',
       'TEMP_SPACE', 'ACCESS_PREDICAT

### Label Encoding

Converting labels/features into numerical representations

In [7]:
def encode(df, encoded_labels):
    """
    Label-encodes the selected columns of df in place.

    Every column of df named in encoded_labels is cast to str and overwritten
    with the integer codes of a LabelEncoder fitted on that column alone.

    NOTE(review): a fresh encoder is fitted on every call, so the same text
    value may receive different codes in different frames - confirm intended.

    :param df: pandas DataFrame; its matching columns are overwritten
    :param encoded_labels: collection of column names to encode
    :return: the same df object, carrying the encoded columns
    """
    targets = [name for name in df.columns if name in encoded_labels]
    for name in targets:
        as_text = df[name].astype(str)
        df[name] = preprocessing.LabelEncoder().fit_transform(as_text)
    return df
#
# Determine labels used for encoding
encoded_labels = [
    'OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME',
    'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'QBLOCK_NAME',
]
# The same banner is printed ahead of each encoded frame.
encoding_banner = 'Encoded labels:\n' + str(encoded_labels) + "\n\n----------------------------------------------\n\n"
#
df = encode(df, encoded_labels)
print(encoding_banner)
print(df.head())
#
df_outliers = encode(df_outliers, encoded_labels)
print(encoding_banner)
print(df_outliers.head())

Encoded labels:
['OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'QBLOCK_NAME']

----------------------------------------------


          SQL_ID PLAN_HASH_VALUE ID  OPERATION  OPTIONS OBJECT_NODE  \
0  dxv968j0352kb       103598129  0         28        0           0   
1  dxv968j0352kb       103598129  1         30       14           0   
2  dxv968j0352kb       103598129  2         24        0           0   
3  dxv968j0352kb       103598129  3         26       26      :Q1001   
4  dxv968j0352kb       103598129  4         30       14      :Q1001   

   OBJECT_OWNER  OBJECT_NAME  OBJECT_ALIAS  OBJECT_TYPE         ...           \
0             0            0             0            0         ...            
1             0            0             0            0         ...            
2             0            0             0            0         ...            
3             1           11             0            0         ...     

### Floating point precision conversion

Each column listed in `y_labels` is converted to floating point and rounded to three decimal places.

In [8]:
# Cast the y_labels columns to float and keep 3 decimal places, for the plan data ...
df[y_labels] = np.round(df[y_labels].astype(float), 3)
print(type(df), df.shape, sep='\n')
#
# ... and for the outlier data.
df_outliers[y_labels] = np.round(df_outliers[y_labels].astype(float), 3)
print(type(df_outliers), df_outliers.shape, sep='\n')

<class 'pandas.core.frame.DataFrame'>
(98794, 22)
<class 'pandas.core.frame.DataFrame'>
(1456, 27)


### Data Ordering

Sorting of datasets in order of 
* TIMESTAMP
* PLAN_HASH_VALUE
* ID

In [9]:
# NOTE(review): the sort keys were loaded with dtype=str and have not been cast
# since, so the ordering is presumably lexicographic (e.g. ID '10' before '2') -
# confirm this is acceptable.
plan_sort_keys = ['TIMESTAMP', 'PLAN_HASH_VALUE', 'ID']
outlier_sort_keys = ['TIMESTAMP', 'PLAN_ID', 'ID']
#
df = df.sort_values(by=plan_sort_keys)
print(df.head())
print('-------------------------------')
df_outliers = df_outliers.sort_values(by=outlier_sort_keys)
print(df_outliers.head())

              SQL_ID PLAN_HASH_VALUE ID  OPERATION  OPTIONS OBJECT_NODE  \
0      dxv968j0352kb       103598129  0         28        0           0   
50644  dxv968j0352kb       103598129  0         28        0           0   
1      dxv968j0352kb       103598129  1         30       14           0   
50645  dxv968j0352kb       103598129  1         30       14           0   
2      dxv968j0352kb       103598129  2         24        0           0   

       OBJECT_OWNER  OBJECT_NAME  OBJECT_ALIAS  OBJECT_TYPE  \
0                 0            0             0            0   
50644             0            0             0            0   
1                 0            0             0            0   
50645             0            0             0            0   
2                 0            0             0            0   

              ...           SEARCH_COLUMNS   COST CARDINALITY  BYTES  \
0             ...                        0  880.0         0.0    0.0   
50644         ...         

### Access Plan Resource Aggregation

This method attempts to tackle the problem of access plan anomalies by aggregating resources per explain plan. Notable resources which are being considered are as follows:

* COST
* CARDINALITY
* BYTES
* PARTITION_DELTA (Partition End - Partition Start)
* CPU_COST
* IO_COST
* TEMP_SPACE
* TIME

The reasoning behind these fields in particular is mainly because these columns can be aggregated together.

In [10]:
# Collapse the per-step rows into one row per plan by summing the numeric columns.
# numeric_only=True states explicitly that text columns are left out of the sums,
# instead of relying on pandas dropping them implicitly (recent pandas versions
# no longer do that).
print(df.shape)
df = df.groupby(['SQL_ID','PLAN_HASH_VALUE']).sum(numeric_only=True).reset_index()
print(df.columns)
print(df.shape)
#
# Outlier plans are identified by PLAN_ID alone.
print(df_outliers.shape)
df_outliers = df_outliers.groupby(['PLAN_ID']).sum(numeric_only=True).reset_index()
print(df_outliers.columns)
print(df_outliers.shape)

(98794, 22)
Index(['SQL_ID', 'PLAN_HASH_VALUE', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER',
       'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'COST',
       'CARDINALITY', 'BYTES', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME',
       'QBLOCK_NAME'],
      dtype='object')
(270, 17)
(1456, 27)
Index(['PLAN_ID', 'OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME',
       'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'COST', 'CARDINALITY',
       'BYTES', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME'],
      dtype='object')
(42, 16)


### Column Reduction

Strips further columns unnecessary to the experiment, so that the training data set and the outlier set have the same columns.

In [11]:
# Keep only the columns that both frames share: first trim the outlier frame
# against the plan frame, then the plan frame against the trimmed outlier frame.
outlier_only = [name for name in df_outliers.columns if name not in df.columns]
df_outliers.drop(columns=outlier_only, inplace=True)
plan_only = [name for name in df.columns if name not in df_outliers.columns]
df.drop(columns=plan_only, inplace=True)
#
print(df.columns,
      df.shape,
      '------------------------------------------',
      df_outliers.columns,
      df_outliers.shape,
      sep='\n')

Index(['OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'COST', 'CARDINALITY', 'BYTES', 'CPU_COST',
       'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME'],
      dtype='object')
(270, 15)
------------------------------------------
Index(['OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'COST', 'CARDINALITY', 'BYTES', 'CPU_COST',
       'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME'],
      dtype='object')
(42, 15)
