# Query Sequence Analysis

This notebook focuses on sequence analysis, when presented with a workload schedule / sequence of queries. In an average day to day work activity, particular query patterns can be discerned. This pattern distinction allows us to discern which queries will be susceptible to execution over time, allowing us to know ahead of time which queries will be executed against the database.

### Module Installation and Importing Libraries

In [78]:
# scipy
import scipy as sc
print('scipy: %s' % sc.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# scikit-learn
from sklearn.preprocessing import MinMaxScaler
import sklearn as sk
print('sklearn: %s' % sk.__version__)

scipy: 0.19.1
numpy: 1.15.2
pandas: 0.23.4
sklearn: 0.18.1


### Configuration Cell

Tweak parametric changes from this cell to influence outcome of experiment. 
NB: This experiment demonstrates at time  step = 1 (1 minute in advance). Further down in experiment, other timestep results are also featured and evaluated.

In [79]:
#
# Experiment Config
tpcds='TPCDS1' # Schema upon which to operate test
lag=3 # Time Series shift / Lag Step. Each lag value equates to 1 minute. Cannot be less than 1
test_split=.2 # Denotes which Data Split to operate under when it comes to training / validation
batch_size=10
y_label = ['SQL_ID'] # Denotes which label to use for time series experiments
#
# Forest Config
parallel_degree = 2
n_estimators = 100
#
if lag < 1:
    raise ValueError('Lag value must be greater than 1!')

### Read data from file into Pandas Dataframes

In [80]:
#
# Open Data
rep_hist_snapshot_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/v2/rep_hist_snapshot.csv'
# rep_hist_snapshot_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/v2/rep_hist_snapshot.csv'
#
rep_hist_snapshot_df = pd.read_csv(rep_hist_snapshot_path)
#
def prettify_header(headers):
    """
    Cleans header list from unwated character strings
    """
    header_list = []
    [header_list.append(header.replace("(","").replace(")","").replace("'","").replace(",","")) for header in headers]
    return header_list
#
rep_hist_snapshot_df.columns = prettify_header(rep_hist_snapshot_df.columns.values)
#
print(rep_hist_snapshot_df.columns.values)

['SNAP_ID' 'DBID' 'INSTANCE_NUMBER' 'SQL_ID' 'PLAN_HASH_VALUE'
 'OPTIMIZER_COST' 'OPTIMIZER_MODE' 'OPTIMIZER_ENV_HASH_VALUE'
 'SHARABLE_MEM' 'LOADED_VERSIONS' 'VERSION_COUNT' 'MODULE' 'ACTION'
 'SQL_PROFILE' 'FORCE_MATCHING_SIGNATURE' 'PARSING_SCHEMA_ID'
 'PARSING_SCHEMA_NAME' 'PARSING_USER_ID' 'FETCHES_TOTAL' 'FETCHES_DELTA'
 'END_OF_FETCH_COUNT_TOTAL' 'END_OF_FETCH_COUNT_DELTA' 'SORTS_TOTAL'
 'SORTS_DELTA' 'EXECUTIONS_TOTAL' 'EXECUTIONS_DELTA'
 'PX_SERVERS_EXECS_TOTAL' 'PX_SERVERS_EXECS_DELTA' 'LOADS_TOTAL'
 'LOADS_DELTA' 'INVALIDATIONS_TOTAL' 'INVALIDATIONS_DELTA'
 'PARSE_CALLS_TOTAL' 'PARSE_CALLS_DELTA' 'DISK_READS_TOTAL'
 'DISK_READS_DELTA' 'BUFFER_GETS_TOTAL' 'BUFFER_GETS_DELTA'
 'ROWS_PROCESSED_TOTAL' 'ROWS_PROCESSED_DELTA' 'CPU_TIME_TOTAL'
 'CPU_TIME_DELTA' 'ELAPSED_TIME_TOTAL' 'ELAPSED_TIME_DELTA' 'IOWAIT_TOTAL'
 'IOWAIT_DELTA' 'CLWAIT_TOTAL' 'CLWAIT_DELTA' 'APWAIT_TOTAL'
 'APWAIT_DELTA' 'CCWAIT_TOTAL' 'CCWAIT_DELTA' 'DIRECT_WRITES_TOTAL'
 'DIRECT_WRITES_DELTA' 'PLSEXEC_TIME_T

### Changing Matrix Shapes

Changes dataframe shape, in an attempt to drop all numeric data. Below's aggregated data is done so on:
* SNAP_ID
* INSTANCE_NUMBER
* DBID
* SQL_ID

In [81]:
print("Shape Before Aggregation: " + str(rep_hist_snapshot_df.shape))
#
# Group By Values by SNAP_ID , sum all metrics (for table REP_HIST_SNAPSHOT) and drop all numeric
rep_hist_snapshot_df = rep_hist_snapshot_df.groupby(['SNAP_ID','DBID','INSTANCE_NUMBER','SQL_ID']).sum()
rep_hist_snapshot_df.reset_index(inplace=True)
#
print("Shape After Aggregation: " + str(rep_hist_snapshot_df.shape))

Shape Before Aggregation: (115178, 90)
Shape After Aggregation: (94023, 78)


### Dealing with Empty Values

In [82]:
def get_na_columns(df, headers):
    """
    Return columns which consist of NAN values
    """
    na_list = []
    for head in headers:
        if df[head].isnull().values.any():
            na_list.append(head)
    return na_list
#
print('N/A Columns\n')
print('\n REP_HIST_SNAPSHOT Features ' + str(len(rep_hist_snapshot_df.columns)) + ': ' + str(get_na_columns(df=rep_hist_snapshot_df,headers=rep_hist_snapshot_df.columns)) + "\n")
#
def fill_na(df):
    """
    Replaces NA columns with 0s
    """
    return df.fillna(0)
#
# Populating NaN values with amount '0'
df = fill_na(df=rep_hist_snapshot_df)

N/A Columns


 REP_HIST_SNAPSHOT Features 78: []



### Data Ordering

Sorting of datasets in order of SNAP_ID.

In [83]:
df.sort_values(by=['SNAP_ID'], ascending=True, inplace=True)
print(df.shape)

(94023, 78)


### Floating point precision conversion

Each column is converted into a column of type values which are floating point for higher precision.

In [84]:
for column in df.columns:
    try:
        df[column].astype('float32', inplace=True)
        df[column] = np.round(df[column], 3) # rounds to 3 dp
    except:
        print('Couldnt convert column [' + column + ']')
print(df.shape)

Couldnt convert column [SQL_ID]
(94023, 78)


### Feature Selection

In this step, redundant features are dropped. Features are considered redundant if exhibit a standard devaition of 0 (meaning no change in value).

In [85]:
def drop_flatline_columns(df):
    columns = df.columns
    flatline_features = []
    for i in range(len(columns)):
        try:
            std = df[columns[i]].std()
            if std == 0:
                flatline_features.append(columns[i])
        except:
            pass
    #
    #print('Features which are considered flatline:\n')
    #for col in flatline_features:
    #    print(col)
    print('\nShape before changes: [' + str(df.shape) + ']')
    df = df.drop(columns=flatline_features)
    print('Shape after changes: [' + str(df.shape) + ']')
    print('Dropped a total [' + str(len(flatline_features)) + ']')
    return df
#
print('Before column drop:')
print(df.shape)
df = drop_flatline_columns(df=df)
print('\nAfter flatline column drop:')
print(df.shape)
dropped_columns_df = [ 'PLAN_HASH_VALUE',
                       'OPTIMIZER_ENV_HASH_VALUE',
                       'LOADED_VERSIONS',
                       'VERSION_COUNT',
                       'PARSING_SCHEMA_ID',
                       'PARSING_USER_ID',
                       'CON_DBID',
                       'SNAP_LEVEL',
                       'SNAP_FLAG',
                       'COMMAND_TYPE']
df.drop(columns=dropped_columns_df, inplace=True)
print('\nAfter additional column drop:')
print(df.shape)

Before column drop:
(94023, 78)

Shape before changes: [(94023, 78)]
Shape after changes: [(94023, 63)]
Dropped a total [15]

After flatline column drop:
(94023, 63)

After additional column drop:
(94023, 53)


### Data Normalization

Under the assumption that outliers have been capped/transformed, data is now passed through a min-max transformer.

In [86]:
#
# Keep reference of label before normalizaing
y_df = df[y_label]
df.drop(columns=y_label, inplace=True)
#
# Normalize values
scaler = MinMaxScaler(feature_range=(0, 1))
df_normalized_values = scaler.fit_transform(df.values)
#
# Carry normalized values (numpy array) to pandas df
df = pd.DataFrame(data=df_normalized_values, columns=df.columns)
del df_normalized_values
#
# Combine back labels to normalized values
df = pd.concat([y_df, df], axis=1, join_axes=[y_df.index])
#
print(str(df.shape))
print(df.head())

(94023, 53)
           SQL_ID  SNAP_ID  OPTIMIZER_COST  SHARABLE_MEM  FETCHES_TOTAL  \
0   03ggjrmy0wa1w      0.0    1.984567e-10      0.000168   3.691454e-08   
59  bwsf4tnh0gcgv      0.0    6.245616e-11      0.000338   8.613393e-08   
58  bkq9pjcfvm9vn      0.0    4.870902e-09      0.001418   4.484870e-05   
56  aggcw7yk1a7s6      0.0    6.652939e-11      0.000175   2.460969e-08   
55  ac717udu18a35      0.0    7.331811e-11      0.000243   6.890714e-08   

    FETCHES_DELTA  END_OF_FETCH_COUNT_TOTAL  END_OF_FETCH_COUNT_DELTA  \
0        0.000021              6.271117e-07                  0.000317   
59       0.000021              1.463261e-06                  0.000317   
58       0.000363              4.481758e-05                  0.000317   
56       0.000021              4.180745e-07                  0.000317   
55       0.000021              1.170609e-06                  0.000317   

    SORTS_TOTAL  SORTS_DELTA  ...   PHYSICAL_READ_REQUESTS_DELTA  \
0      0.000114     0.041255  

### Rearranging Labels

Removes the label column, and adds it at the beginning of the matrix for later usage

In [87]:
print('Before Column Switch: ' + str(df.shape))
y_df = df[y_label]
df.drop(columns=y_label, inplace=True)
print("Label " + str(y_label) + " shape: " + str(y_df.shape))
print("Feature matrix shape: " + str(df.shape))
#
# Merging labels and features in respective order
df = pd.concat([y_df, df], axis=1, sort=False)
print('After Column Switch: ' + str(df.shape))
print(df.head())

Before Column Switch: (94023, 53)
Label ['SQL_ID'] shape: (94023, 1)
Feature matrix shape: (94023, 52)
After Column Switch: (94023, 53)
           SQL_ID  SNAP_ID  OPTIMIZER_COST  SHARABLE_MEM  FETCHES_TOTAL  \
0   03ggjrmy0wa1w      0.0    1.984567e-10      0.000168   3.691454e-08   
59  bwsf4tnh0gcgv      0.0    6.245616e-11      0.000338   8.613393e-08   
58  bkq9pjcfvm9vn      0.0    4.870902e-09      0.001418   4.484870e-05   
56  aggcw7yk1a7s6      0.0    6.652939e-11      0.000175   2.460969e-08   
55  ac717udu18a35      0.0    7.331811e-11      0.000243   6.890714e-08   

    FETCHES_DELTA  END_OF_FETCH_COUNT_TOTAL  END_OF_FETCH_COUNT_DELTA  \
0        0.000021              6.271117e-07                  0.000317   
59       0.000021              1.463261e-06                  0.000317   
58       0.000363              4.481758e-05                  0.000317   
56       0.000021              4.180745e-07                  0.000317   
55       0.000021              1.170609e-06     

### Time Series Shifting

Shifting the datasets N lag minutes, in order to transform the problem into a supervised dataset. Each Lag Shift equates to 60 seconds (due to the way design of the data capturing tool). For each denoted lag amount, the same number of feature vectors will be stripped away at the beginning.

Features and Labels are separated into seperate dataframes at this point.

https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

In [88]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = data
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    if n_in != 0:
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    n_out += 1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
#
def remove_n_time_steps(data, n=1):
    if n == 0:
        return data
    df = data
    headers = df.columns
    dropped_headers = []
    #     for header in headers:
    #         if "(t)" in header:
    #             dropped_headers.append(header)
    #
    for i in range(1,n+1):
        for header in headers:
            if "(t+"+str(i)+")" in header:
                dropped_headers.append(str(header))
    #
    return df.drop(dropped_headers, axis=1) 
#
# Frame as supervised learning set
shifted_df = series_to_supervised(df, lag, lag)
#
# Seperate labels from features
y_row = []
for i in range(lag+1,(lag*2)+2):
    y_df_column_names = shifted_df.columns[len(df.columns)*i:len(df.columns)*i + len(y_label)]
    y_row.append(y_df_column_names)
    print(y_df_column_names)
    print(type(y_df_column_names))
y_df_column_names = []   
for row in y_row:
    for val in row:
        y_df_column_names.append(val)
#
# y_df_column_names = shifted_df.columns[len(df.columns)*lag:len(df.columns)*lag + len(y_label)]
y_df = shifted_df[y_df_column_names]
X_df = shifted_df.drop(columns=y_df_column_names)
print('\n-------------\nFeatures')
print(X_df.columns)
print(X_df.shape)
print('\n-------------\nLabels')
print(y_df.columns)
print(y_df.shape)
#
# Delete middle timesteps
X_df = remove_n_time_steps(data=X_df, n=lag)
print('\n-------------\nFeatures After Time Shift')
print(X_df.columns)
print(X_df.shape)
# y_df = remove_n_time_steps(data=y_df, n=lag)
print('\n-------------\nLabels After Time Shift')
print(y_df.columns)
print(y_df.shape)

Index(['var1(t+1)'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
Index(['var1(t+2)'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
Index(['var1(t+3)'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
Index([], dtype='object')
<class 'pandas.core.indexes.base.Index'>

-------------
Features
Index(['var1(t-3)', 'var2(t-3)', 'var3(t-3)', 'var4(t-3)', 'var5(t-3)',
       'var6(t-3)', 'var7(t-3)', 'var8(t-3)', 'var9(t-3)', 'var10(t-3)',
       ...
       'var44(t+3)', 'var45(t+3)', 'var46(t+3)', 'var47(t+3)', 'var48(t+3)',
       'var49(t+3)', 'var50(t+3)', 'var51(t+3)', 'var52(t+3)', 'var53(t+3)'],
      dtype='object', length=368)
(94017, 368)

-------------
Labels
Index(['var1(t+1)', 'var1(t+2)', 'var1(t+3)'], dtype='object')
(94017, 3)

-------------
Features After Time Shift
Index(['var1(t-3)', 'var2(t-3)', 'var3(t-3)', 'var4(t-3)', 'var5(t-3)',
       'var6(t-3)', 'var7(t-3)', 'var8(t-3)', 'var9(t-3)', 'var10(t-3)',
       ...
       'var44(t)', 'var4