# Schedule Access Plan Recommendation

This notebook is dedicated to fitting models on database access plans.

In [8]:
# pandas is the only library imported in this cell; it is used below to load and manipulate the data
import pandas as pd
# Print the installed pandas version so the recorded outputs can be tied to it
print('pandas: %s' % pd.__version__)

pandas: 0.23.4


In [9]:
#
# Experiment Config
tpcds='TPCDS1' # Schema upon which to operate the test; also selects the data sub-folder read below

### Read data from file into pandas dataframes

In [10]:
# Location of the rep_vsql_plan.csv file; the schema name held in `tpcds` selects the sub-folder.
# NOTE(review): both paths below are absolute, machine-specific locations - adjust them for your environment.
# rep_vsql_plan_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/rep_vsql_plan.csv'
rep_vsql_plan_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/rep_vsql_plan.csv'
#
# Load the CSV into a dataframe using pandas' default parsing options
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path)
#
def prettify_header(headers):
    """
    Cleans header list from unwanted character strings.

    Removes the characters ( ) ' and , from every header,
    e.g. "('DBID',)" becomes "DBID".

    :param headers: iterable of column headers; entries which are not
                    strings are converted with str() before cleaning
    :return: new list of cleaned header strings, in the original order
    """
    # Translation table which deletes all unwanted characters in a single pass
    unwanted = str.maketrans('', '', "()',")
    return [str(header).translate(unwanted) for header in headers]
#
# Replace the dataframe's column labels with their cleaned versions (see prettify_header)
rep_vsql_plan_df.columns = prettify_header(rep_vsql_plan_df.columns.values)
print('------------------------------------------')
# Show the resulting column labels
print(rep_vsql_plan_df.columns)

------------------------------------------
Index(['DBID', 'SQL_ID', 'PLAN_HASH_VALUE', 'ID', 'OPERATION', 'OPTIONS',
       'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS',
       'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'DEPTH', 'POSITION',
       'SEARCH_COLUMNS', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG',
       'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER',
       'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE',
       'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME',
       'QBLOCK_NAME', 'REMARKS', 'TIMESTAMP', 'OTHER_XML', 'CON_DBID',
       'CON_ID'],
      dtype='object')


### Dealing with empty values

In [11]:
def get_na_columns(df, headers):
    """
    Lists the headers whose column holds at least one missing (null) value.

    :param df: dataframe to inspect
    :param headers: iterable of column labels to check
    :return: list of the labels, in the order given, whose column contains a null
    """
    return [label for label in headers if df[label].isnull().values.any()]
#
# Report which columns contain missing values.
print('N/A Columns\n')
# The number printed after "Features" is the total column count of the dataframe;
# the list which follows holds only the columns returned by get_na_columns.
print('\nREP_VSQL_PLAN Features ' + str(len(rep_vsql_plan_df.columns)) + ': ' + str(get_na_columns(df=rep_vsql_plan_df,headers=rep_vsql_plan_df.columns)) + "\n")
#
def fill_na(df):
    """
    Returns a new dataframe in which every missing value is replaced by 0.
    The dataframe passed in is left untouched.
    """
    zero_filled = df.fillna(value=0)
    return zero_filled
#
# Populating NaN values with amount '0': the name is rebound to the filled dataframe returned by fill_na
rep_vsql_plan_df = fill_na(df=rep_vsql_plan_df)

N/A Columns


REP_VSQL_PLAN Features 39: ['OPTIONS', 'OBJECT_NODE', 'OBJECT#', 'OBJECT_OWNER', 'OBJECT_NAME', 'OBJECT_ALIAS', 'OBJECT_TYPE', 'OPTIMIZER', 'PARENT_ID', 'POSITION', 'COST', 'CARDINALITY', 'BYTES', 'OTHER_TAG', 'PARTITION_START', 'PARTITION_STOP', 'PARTITION_ID', 'OTHER', 'DISTRIBUTION', 'CPU_COST', 'IO_COST', 'TEMP_SPACE', 'ACCESS_PREDICATES', 'FILTER_PREDICATES', 'PROJECTION', 'TIME', 'QBLOCK_NAME', 'REMARKS', 'OTHER_XML']



### Data Ordering

Each dataset is sorted by the following columns, in order:

* REP_VSQL_PLAN - TIMESTAMP, SQL_ID, ID

In [12]:
# Order the rows by TIMESTAMP, then SQL_ID, then ID. The sorted result is reassigned
# rather than sorted in place, so the cell does not mutate the existing dataframe object.
# NOTE(review): the CSV is read without date parsing, so TIMESTAMP is presumably compared
# as text - confirm that its format sorts chronologically.
rep_vsql_plan_df = rep_vsql_plan_df.sort_values(by=['TIMESTAMP','SQL_ID','ID'], ascending=True)

### Label Encoding

Encoding of access plan columns which are not numerical

In [None]:
# def encode(df, features):
#     encoder_dict={} # Used to keep track of respective encoders, in case it is required to decoded labels further down the line
#     for f in features:
#         for col in df.columns:
#             col = str(col)
#             if col.lower() == f.lower()
#                 le = preprocessing.LabelEncoder()
#                 df[col].values = le.fit_transform(df[col].values)
#                 encoder_dict[col] = le
#     return df, le
# #
# encoded_labels_hist_snapshot = []
# encoded_labels_vsql_plan = ['OPERATION',
#                             'OPTIONS']