# Schedule Access Plan Recommendation

This notebook is dedicated to model fitting in terms of database access plans.

In [1]:
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# sklearn
import sklearn as sk
from sklearn import preprocessing
print('sklearn: %s' % sk.__version__)

pandas: 0.23.4
numpy: 1.15.2
sklearn: 0.20.0


In [2]:
#
# Experiment Config
tpcds='TPCDS1' # Schema upon which to operate test

### Read data from file into pandas dataframes

In [3]:
#rep_hist_snapshot_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/rep_hist_snapshot.csv'
# rep_vsql_plan_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/rep_vsql_plan.csv'
rep_hist_snapshot_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/rep_hist_snapshot.csv'
rep_vsql_plan_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/rep_vsql_plan.csv'
#
rep_hist_snapshot_df = pd.read_csv(rep_hist_snapshot_path)
rep_vsql_plan_df = pd.read_csv(rep_vsql_plan_path)
#
def prettify_header(headers):
    """
    Cleans header list from unwated character strings
    """
    header_list = []
    [header_list.append(header.replace("(","").replace(")","").replace("'","").replace(",","")) for header in headers]
    return header_list
#
rep_hist_snapshot_df.columns = prettify_header(rep_hist_snapshot_df.columns.values)
rep_vsql_plan_df.columns = prettify_header(rep_vsql_plan_df.columns.values)
print('------------------------------------------')
print(rep_hist_snapshot_df.columns)
print(rep_vsql_plan_df.columns)

  interactivity=interactivity, compiler=compiler, result=result)


------------------------------------------
Index(['SNAP_ID', 'DBID', 'INSTANCE_NUMBER', 'SQL_ID', 'PLAN_HASH_VALUE',
       'OPTIMIZER_COST', 'OPTIMIZER_MODE', 'OPTIMIZER_ENV_HASH_VALUE',
       'SHARABLE_MEM', 'LOADED_VERSIONS', 'VERSION_COUNT', 'MODULE', 'ACTION',
       'SQL_PROFILE', 'FORCE_MATCHING_SIGNATURE', 'PARSING_SCHEMA_ID',
       'PARSING_SCHEMA_NAME', 'PARSING_USER_ID', 'FETCHES_TOTAL',
       'FETCHES_DELTA', 'END_OF_FETCH_COUNT_TOTAL', 'END_OF_FETCH_COUNT_DELTA',
       'SORTS_TOTAL', 'SORTS_DELTA', 'EXECUTIONS_TOTAL', 'EXECUTIONS_DELTA',
       'PX_SERVERS_EXECS_TOTAL', 'PX_SERVERS_EXECS_DELTA', 'LOADS_TOTAL',
       'LOADS_DELTA', 'INVALIDATIONS_TOTAL', 'INVALIDATIONS_DELTA',
       'PARSE_CALLS_TOTAL', 'PARSE_CALLS_DELTA', 'DISK_READS_TOTAL',
       'DISK_READS_DELTA', 'BUFFER_GETS_TOTAL', 'BUFFER_GETS_DELTA',
       'ROWS_PROCESSED_TOTAL', 'ROWS_PROCESSED_DELTA', 'CPU_TIME_TOTAL',
       'CPU_TIME_DELTA', 'ELAPSED_TIME_TOTAL', 'ELAPSED_TIME_DELTA',
       'IOWAIT_TO

  interactivity=interactivity, compiler=compiler, result=result)


### Dealing with empty values

In [4]:
def get_na_columns(df, headers):
    """
    Return columns which consist of NAN values
    """
    na_list = []
    for head in headers:
        if df[head].isnull().values.any():
            na_list.append(head)
    return na_list
#
print('N/A Columns\n')
print('\nREP_HIST_SNAPSHOT Features ' + str(len(rep_hist_snapshot_df.columns)) + ': ' + str(get_na_columns(df=rep_hist_snapshot_df,headers=rep_hist_snapshot_df.columns)) + "\n")
print('\nREP_VSQL_PLAN Features ' + str(len(rep_vsql_plan_df.columns)) + ': ' + str(get_na_columns(df=rep_vsql_plan_df,headers=rep_vsql_plan_df.columns)) + "\n")
#
def fill_na(df):
    """
    Replaces NA columns with 0s
    """
    return df.fillna(0)
#
# Populating NaN values with amount '0'
rep_hist_snapshot_df = fill_na(df=rep_hist_snapshot_df)
rep_vsql_plan_df = fill_na(df=rep_vsql_plan_df)

N/A Columns


REP_HIST_SNAPSHOT Features 90: ['OPTIMIZER_COST', 'OPTIMIZER_MODE', 'OPTIMIZER_ENV_HASH_VALUE', 'LOADED_VERSIONS', 'MODULE', 'ACTION', 'SQL_PROFILE', 'PARSING_SCHEMA_ID', 'PARSING_SCHEMA_NAME', 'PARSING_USER_ID', 'FETCHES_TOTAL', 'FETCHES_DELTA', 'END_OF_FETCH_COUNT_TOTAL', 'END_OF_FETCH_COUNT_DELTA', 'SORTS_TOTAL', 'SORTS_DELTA', 'EXECUTIONS_TOTAL', 'EXECUTIONS_DELTA', 'PX_SERVERS_EXECS_TOTAL', 'PX_SERVERS_EXECS_DELTA', 'LOADS_TOTAL', 'LOADS_DELTA', 'INVALIDATIONS_TOTAL', 'INVALIDATIONS_DELTA', 'PARSE_CALLS_TOTAL', 'DISK_READS_TOTAL', 'DISK_READS_DELTA', 'BUFFER_GETS_TOTAL', 'BUFFER_GETS_DELTA', 'ROWS_PROCESSED_TOTAL', 'ROWS_PROCESSED_DELTA', 'CPU_TIME_TOTAL', 'ELAPSED_TIME_TOTAL', 'IOWAIT_TOTAL', 'IOWAIT_DELTA', 'CLWAIT_TOTAL', 'CLWAIT_DELTA', 'APWAIT_TOTAL', 'APWAIT_DELTA', 'CCWAIT_TOTAL', 'CCWAIT_DELTA', 'DIRECT_WRITES_TOTAL', 'DIRECT_WRITES_DELTA', 'PLSEXEC_TIME_TOTAL', 'PLSEXEC_TIME_DELTA', 'JAVEXEC_TIME_TOTAL', 'JAVEXEC_TIME_DELTA', 'IO_OFFLOAD_ELIG_BYTES_TOTAL', '

### Data Ordering

Sorting of datasets in order of:

* REP_VSQL_PLAN - TIMESTAMP, SQL_ID, ID

In [5]:
rep_hist_snapshot_df.sort_values(by=['SNAP_ID'], ascending=True, inplace=True)
rep_vsql_plan_df.sort_values(by=['TIMESTAMP','SQL_ID','ID'], ascending=True, inplace=True)

### TPC-DS Filtering

Exclude all queries but those pertaining to TPC-DS.
This part also merges the field 'SQL_TEXT' from data matrix REP_HIST_SNAPSHOT with data matrix 'REP_VSQL_PLAN'.

In [11]:
#
# Segregating those SQL_IDs which utilize TPC-DS owned objects
rep_vsql_plan_series = rep_vsql_plan_df['SQL_ID'].loc[rep_vsql_plan_df['OBJECT_OWNER'] == tpcds]
tpc_sql_ids = rep_vsql_plan_series.unique()
print('Unique ' + tpcds + ' IDs: ' + str(tpc_sql_ids))
#
# Retrieves data matrix rows pertaining to SQL_IDs using TPC-DS owned objects
print("REP_VSQL_SHAPE before: " + str(rep_vsql_plan_df.shape))
rep_vsql_plan_df = rep_vsql_plan_df.loc[rep_vsql_plan_df['SQL_ID'].isin(tpc_sql_ids)]
print("REP_VSQL_SHAPE after: " + str(rep_vsql_plan_df.shape))
#
# Merging data matrixes
df = pd.merge(rep_hist_snapshot_df, rep_vsql_plan_df, how='right', on=['SQL_ID','PLAN_HASH_VALUE'])
print("Merged Dataframe: " + str(df.shape))
# print(type(rep_vsql_plan_series))
# print(rep_vsql_plan_series)

Unique TPCDS1 IDs: ['0ga8vk4nftz45' '54qdvyrqsg8m6' '8skndm0ag8dwu' '9nzkptun0hjwu'
 '8rv3y7yy1zny3' 'gu5x4z494njku' '785wb90xs3r0t' '2j5bk3tn2zt0g'
 'cfsnf5tz2q74a' 'cjq93m442uprp' 'au8ztarrm6vvs' 'gkjkxbzzptg00'
 'gh5w0gcyfaujs' '7m8xtjmn5zv0g' 'bcbpkhm3cq424' 'd134mqkq6kgbu'
 '3419gsthd5szh' '9ua42c6f2qs7s' '9x8gaksqvta15' '4g1u6kabran4u'
 'd7w1dugmzb9n9' '4cgbvpjc134nu' 'd5wzutxy2w8np' '8k0qd372mh9td'
 'bj5v9w48937nu' 'c277ysg385fby' 'gn7c56v4qdv0a' 'gw5vg9fmj44kf'
 '9kum9s57rptk4' '0cs8gqsjwxxkq' '36zntmzb9nzbx' 'b8cjbq1au6kz8'
 '53w22dn4kt7us' 'dqmnrkfw3n0hc' 'ct08q649zt7zq' '3uqub29v7bm7k'
 '731j23kzchb46' '18kgfax58817q' '3z4t7h53vpq5j' 'g2kvb6h17ds7m'
 '34727gtyphqg0' '0axwf3djk6qhw' '8hb1p1z9z4wfb' '3uvpfa36gkwa2'
 '6fn9107s176xp' 'fx86bvbgy3k69' '9tasj8t3m8c9h' '5d1xymjjs040y'
 '8pyv1un0240hy' '1z9kdprccj203' 'g04ur4j6jn9av' '0a08ug2qc1j82'
 '5q9kqvuctxkvq' '0vx9985zy097p' 'dswqy60bkb6x5' '1aqmps4rb3nsn'
 'c71j8ycjcmgvk' '267gcwda3u6qf' '40tcb45hnv1wk' '2wtt22p5d7h8d'
 '846j

MemoryError: 

### Label Encoding

Converting labels/features into numerical representations

In [None]:
def encode(df, encoded_labels):
    for col in df.columns:
        if col in encoded_labels:
            le = preprocessing.LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
    return df
#
#print(rep_vsql_plan_df.head())
encoded_labels = ['OPERATION','OPTIONS','OBJECT_OWNER','OBJECT_NAME','OBJECT_ALIAS','OBJECT_TYPE','OPTIMIZER','OTHER_TAG','DISTRIBUTION','QBLOCK_NAME']
rep_vsql_plan_df = encode(df=rep_vsql_plan_df, encoded_labels=encoded_labels)
print('Encoded labels:\n' + str(encoded_labels) + "\n\n----------------------------------------------\n\n")
print(rep_vsql_plan_df.head())

In [None]:
# print('Before Matrix Merge:')
# print("REP_HIST_SNAPSHOT: " + str(rep_hist_snapshot_df.shape))
# print("REP_VSQL_PLAN: " + str(rep_vsql_plan_df.shape))
# df = pd.merge(rep_hist_snapshot_df, rep_vsql_plan_df, how='right', on=['SQL_ID','PLAN_HASH_VALUE'])
# print('------------------------------------------')
# print('After Matrix Merge:')
# print("Joined Matrix: " + str(df.shape))
# print(df.head())
#
#
#
#print(rep_hist_snapshot_df[['ACTION','MODULE']].head(100))
#print(rep_hist_snapshot_df.loc[rep_hist_snapshot_df['PLAN_HASH_VALUE'] == 839894449])
# print(rep_vsql_plan_df['TIMESTAMP'].min())
# print(rep_vsql_plan_df['TIMESTAMP'].max())