# Schedule Access Plan Recommendation

This notebook is dedicated to fitting models on database access plans.

In [1]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# sklearn
import sklearn as sk
from sklearn import preprocessing
print('sklearn: %s' % sk.__version__)

pandas: 0.23.4
numpy: 1.15.2
sklearn: 0.18.1


In [2]:
#
# Experiment configuration
# `tpcds` names the schema under test; it is interpolated into the input CSV path when the data is loaded.
tpcds='TPCDS1' # Schema upon which to operate test

### Read data from file into pandas dataframes

In [3]:
# Location of the rep_vsql_plan.csv extract for the configured schema (`tpcds`).
# NOTE(review): this is an absolute, machine-specific path; it has to be edited before the notebook can run elsewhere.
rep_vsql_plan_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/v2/rep_vsql_plan.csv'
# Alternative location, kept for reference:
#rep_vsql_plan_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/v2/rep_vsql_plan.csv'
#
# Load the CSV into a dataframe and preview its first rows.
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path)
print(rep_vsql_plan_df.head())
#
def prettify_header(headers):
    """
    Cleans a list of headers from unwanted characters.

    Every occurrence of '(', ')', "'" and ',' is removed from each header;
    all other characters (whitespace included) are left untouched.

    :param headers - Iterable of header strings (e.g. df.columns.values)
    :return - New list holding the cleaned headers, in their original order
    """
    # A single translation table deletes all four characters in one pass.
    strip_table = str.maketrans("", "", "()',")
    return [header.translate(strip_table) for header in headers]
#
# Swap the raw headers (e.g. "('SQL_TEXT',)") for their cleaned names, then print the resulting column index.
rep_vsql_plan_df.columns = prettify_header(rep_vsql_plan_df.columns.values)
print('------------------------------------------')
print(rep_vsql_plan_df.columns)

                                       ('SQL_TEXT',)   ('DBID',)  \
0  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  2634225673   
1  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  2634225673   
2  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  2634225673   
3  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  2634225673   
4  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  2634225673   

     ('SQL_ID',)  ('PLAN_HASH_VALUE',)  ('ID',)    ('OPERATION',)  \
0  dxv968j0352kb             103598129        0  SELECT STATEMENT   
1  dxv968j0352kb             103598129        1              SORT   
2  dxv968j0352kb             103598129        2    PX COORDINATOR   
3  dxv968j0352kb             103598129        3           PX SEND   
4  dxv968j0352kb             103598129        4              SORT   

  ('OPTIONS',) ('OBJECT_NODE',)  ('OBJECT#',) ('OBJECT_OWNER',)     ...       \
0          NaN              NaN           NaN               NaN     ...        
1     GROUP BY  

  interactivity=interactivity, compiler=compiler, result=result)


### Dealing with empty values

In [4]:
def get_na_columns(df, headers):
    """
    Lists the headers whose column contains at least one NaN value.

    :param df - Pandas Dataframe to inspect
    :param headers - Iterable of column names to check
    :return - List of the matching names, in the order they appear in headers
    """
    return [name for name in headers if df[name].isnull().values.any()]
#
# Report the total number of columns, followed by the names of the columns that contain NaN values.
print('N/A Columns\n')
print('\nREP_VSQL_PLAN Features ' + str(len(rep_vsql_plan_df.columns)) + ': ' + str(get_na_columns(df=rep_vsql_plan_df,headers=rep_vsql_plan_df.columns)) + "\n")
#
def fill_na(df):
    """
    Replaces every NA value in the dataframe with 0.

    :param df - Pandas Dataframe
    :return - Result of df.fillna with a fill value of 0
    """
    zero_filled = df.fillna(value=0)
    return zero_filled
#
# Replace every NaN with 0; the result is bound to `df`, while `rep_vsql_plan_df` is not reassigned.
df = fill_na(df=rep_vsql_plan_df)

N/A Columns


REP_VSQL_PLAN Features 40: ['OPTIONS', 'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'COST', 'CARDINALITY', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'DISTRIBUTION', 'IO_COST', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME', 'REMARKS', 'OTHER_XML']



### Feature Selection

In this step, redundant features are dropped. A feature is considered redundant if it exhibits a standard deviation of 0 (meaning its value never changes).

In [5]:
def drop_flatline_columns(df):
    """
    Drops features whose value never changes (standard deviation of exactly 0).

    Columns for which no standard deviation can be computed (e.g. text
    columns) are kept. The shapes before/after and the number of dropped
    columns are printed. The input dataframe itself is not modified.

    :param df - Pandas Dataframe
    :return - New Pandas Dataframe without the flatline columns
    """
    flatline_features = []
    for column in df.columns:
        try:
            if df[column].std() == 0:
                flatline_features.append(column)
        except (TypeError, ValueError):
            # The column has no usable numeric standard deviation (non-numeric
            # values, or an ambiguous result); treat it as "not flatline" and
            # keep it. Any other error is left to surface instead of being hidden.
            continue
    #
    print('\nShape before changes: [' + str(df.shape) + ']')
    df = df.drop(columns=flatline_features)
    print('Shape after changes: [' + str(df.shape) + ']')
    print('Dropped a total [' + str(len(flatline_features)) + ']')
    return df
#
# Remove the zero-variance columns from `df`, then show the remaining shape and column names.
df = drop_flatline_columns(df=df)
print('\nAfter flatline column drop:')
print(df.shape)
print(df.columns)


Shape before changes: [(26704, 40)]
Shape after changes: [(26704, 32)]
Dropped a total [8]

After flatline column drop:
(26704, 32)
Index(['SQL_TEXT', 'SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'DEPTH', 'POSITION',
       'SEARCH_COLUMNS', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG',
       'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'DISTRIBUTION',
       'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'TIME', 'QBLOCK_NAME', 'TIMESTAMP',
       'OTHER_XML'],
      dtype='object')


### Data Ordering

Sorting of the dataset in order of:
* TIMESTAMP
* PLAN_HASH_VALUE
* ID

In [6]:
# Order the rows in place by TIMESTAMP, then PLAN_HASH_VALUE, then ID (all ascending), and preview the result.
df.sort_values(by=['TIMESTAMP','PLAN_HASH_VALUE','ID'], ascending=True, inplace=True)
print(df.head())

                                            SQL_TEXT         SQL_ID  \
0  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
1  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
2  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
3  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
4  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   

   PLAN_HASH_VALUE  ID         OPERATION      OPTIONS OBJECT_NODE  OBJECT#  \
0        103598129   0  SELECT STATEMENT            0           0      0.0   
1        103598129   1              SORT     GROUP BY           0      0.0   
2        103598129   2    PX COORDINATOR            0           0      0.0   
3        103598129   3           PX SEND  QC (RANDOM)      :Q1001      0.0   
4        103598129   4              SORT     GROUP BY      :Q1001      0.0   

  OBJECT_OWNER OBJECT_NAME                        ...                          \
0            0         

### Label Encoding

Converting labels/features into numerical representations

In [7]:
def encode(df, encoded_labels):
    """
    Label-encodes the requested columns of a dataframe.

    Each column of df whose name appears in encoded_labels is cast to str
    and replaced by the output of a freshly created LabelEncoder. The
    dataframe is modified in place and also returned.

    :param df - Pandas Dataframe
    :param encoded_labels - Collection of column names to encode
    :return - The same Pandas Dataframe, with the listed columns encoded
    """
    targets = [name for name in df.columns if name in encoded_labels]
    for name in targets:
        encoder = preprocessing.LabelEncoder()
        as_text = df[name].astype(str)
        df[name] = encoder.fit_transform(as_text)
    return df
#
#print(rep_vsql_plan_df.head())
# Columns to be replaced by integer label codes.
encoded_labels = ['OPERATION','OPTIONS','OBJECT_OWNER','OBJECT_NAME','OBJECT_ALIAS','OBJECT_TYPE','OPTIMIZER','QBLOCK_NAME']
# Encode those columns, then print the list of encoded names and a preview of the frame.
df = encode(df=df, encoded_labels=encoded_labels)
print('Encoded labels:\n' + str(encoded_labels) + "\n\n----------------------------------------------\n\n")
print(df.head())

Encoded labels:
['OPERATION', 'OPTIONS', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'QBLOCK_NAME']

----------------------------------------------


                                            SQL_TEXT         SQL_ID  \
0  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
1  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
2  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
3  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   
4  select /*+  parallel_index(t, "CS_SOLD_TIME_SK...  dxv968j0352kb   

   PLAN_HASH_VALUE  ID  OPERATION  OPTIONS OBJECT_NODE  OBJECT#  OBJECT_OWNER  \
0        103598129   0         23        0           0      0.0             0   
1        103598129   1         25       16           0      0.0             0   
2        103598129   2         20        0           0      0.0             0   
3        103598129   3         22       28      :Q1001      0.0     

### Access Plan Resource Aggregation

This method attempts to tackle the problem of access plan anomalies by aggregating resources per explain plan. Notable resources which are being considered are as follows:

* COST
* CARDINALITY
* BYTES
* PARTITION_DELTA (Partition End - Partition Start)
* CPU_COST
* IO_COST
* TEMP_SPACE
* TIME

These fields in particular were chosen mainly because they can be aggregated together.

In [8]:
# Collapse the per-step plan rows into one row per (SQL_ID, PLAN_HASH_VALUE) pair by summing the grouped columns.
# NOTE(review): the sum is applied to all columns, so label-encoded fields (OPERATION, OPTIONS, ...) and
# positional fields (ID, PARENT_ID, DEPTH, POSITION) are summed as well - confirm this is intended.
# NOTE(review): the recorded output has no BYTES, CPU_COST or TEMP_SPACE column; presumably they are not
# numeric at this point - verify their dtypes.
df_aggregate = df.groupby(['SQL_ID','PLAN_HASH_VALUE']).sum()
# Turn the group keys back into ordinary columns.
df_aggregate.reset_index(inplace=True)
print(df_aggregate.columns)
print(df_aggregate.shape)

Index(['SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS', 'OBJECT#',
       'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE',
       'OPTIMIZER', 'PARENT_ID', 'DEPTH', 'POSITION', 'SEARCH_COLUMNS', 'COST',
       'CARDINALITY', 'PARTITION_ID', 'IO_COST', 'TIME', 'QBLOCK_NAME'],
      dtype='object')
(190, 21)


### Isolation Forest Outlier Detection

Outlier detection for the plans opted for by SQL statements. A separate dataset consisting of explain plans generated with optimizer hint inclusion will be used to evaluate the achieved model score (this dataset will be considered outliers).

In [None]:
class IsolationForestWrapper:
    #
    def __init__(self, X, contamination=.1, parallel_degree=1):
        """
        Constructor Method
        :param X - Pandas Dataframe
        :param contamination - Real value
        """
        self.X = X.values
        self.model = IsolationForest(n_estimators=100, max_samples=256, contamination=contamination, random_state=0, n_jobs=parallel_degree)
        self.model.fit(self.X)
        self.scorings = []
        print(self.model)
    #
    def predict_labels(self):
        return self.model.predict(self.X)        