# Query Sequence Analysis

This notebook focuses on sequence analysis, when presented with a workload schedule / sequence of queries. In an average day to day work activity, particular query patterns can be discerned. This pattern distinction allows us to discern which queries will be susceptible to execution over time, allowing us to know ahead of time which queries will be executed against the database.

### Module Installation and Importing Libraries

In [1]:
# scipy
import scipy as sc
print('scipy: %s' % sc.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
# pandas
import pandas as pd
print('pandas: %s' % pd.__version__)
# scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
import sklearn as sk
print('sklearn: %s' % sk.__version__)
import math

scipy: 1.1.0
numpy: 1.15.2
pandas: 0.23.4
sklearn: 0.20.0


### Configuration Cell

Tweak parametric changes from this cell to influence outcome of experiment. 
NB: This experiment demonstrates at time  step = 1 (1 minute in advance). Further down in experiment, other timestep results are also featured and evaluated.

In [2]:
#
# Experiment Config
tpcds='TPCDS1' # Schema upon which to operate test
lag=3 # Time Series shift / Lag Step. Each lag value equates to 1 minute. Cannot be less than 1
if lag < 1:
    raise ValueError('Lag value must be greater than 1!')
#
test_split=.2 # Denotes which Data Split to operate under when it comes to training / validation
#
# Forest Config
parallel_degree = 1
n_estimators = 10
#
# Net Config
batch_size=10
epochs=10

### Read data from file into Pandas Dataframes

In [3]:
#
# Open Data
#rep_hist_snapshot_path = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds + '/v2/rep_hist_snapshot.csv'
rep_hist_snapshot_path = 'D:/Projects/Datagenerated_ICS5200/Schedule/' + tpcds + '/v2/rep_hist_snapshot.csv'
#
rep_hist_snapshot_df = pd.read_csv(rep_hist_snapshot_path)
#
def prettify_header(headers):
    """
    Cleans header list from unwated character strings
    """
    header_list = []
    [header_list.append(header.replace("(","").replace(")","").replace("'","").replace(",","")) for header in headers]
    return header_list
#
rep_hist_snapshot_df.columns = prettify_header(rep_hist_snapshot_df.columns.values)
#
print(rep_hist_snapshot_df.columns.values)

['SNAP_ID' 'DBID' 'INSTANCE_NUMBER' 'SQL_ID' 'PLAN_HASH_VALUE'
 'OPTIMIZER_COST' 'OPTIMIZER_MODE' 'OPTIMIZER_ENV_HASH_VALUE'
 'SHARABLE_MEM' 'LOADED_VERSIONS' 'VERSION_COUNT' 'MODULE' 'ACTION'
 'SQL_PROFILE' 'FORCE_MATCHING_SIGNATURE' 'PARSING_SCHEMA_ID'
 'PARSING_SCHEMA_NAME' 'PARSING_USER_ID' 'FETCHES_TOTAL' 'FETCHES_DELTA'
 'END_OF_FETCH_COUNT_TOTAL' 'END_OF_FETCH_COUNT_DELTA' 'SORTS_TOTAL'
 'SORTS_DELTA' 'EXECUTIONS_TOTAL' 'EXECUTIONS_DELTA'
 'PX_SERVERS_EXECS_TOTAL' 'PX_SERVERS_EXECS_DELTA' 'LOADS_TOTAL'
 'LOADS_DELTA' 'INVALIDATIONS_TOTAL' 'INVALIDATIONS_DELTA'
 'PARSE_CALLS_TOTAL' 'PARSE_CALLS_DELTA' 'DISK_READS_TOTAL'
 'DISK_READS_DELTA' 'BUFFER_GETS_TOTAL' 'BUFFER_GETS_DELTA'
 'ROWS_PROCESSED_TOTAL' 'ROWS_PROCESSED_DELTA' 'CPU_TIME_TOTAL'
 'CPU_TIME_DELTA' 'ELAPSED_TIME_TOTAL' 'ELAPSED_TIME_DELTA' 'IOWAIT_TOTAL'
 'IOWAIT_DELTA' 'CLWAIT_TOTAL' 'CLWAIT_DELTA' 'APWAIT_TOTAL'
 'APWAIT_DELTA' 'CCWAIT_TOTAL' 'CCWAIT_DELTA' 'DIRECT_WRITES_TOTAL'
 'DIRECT_WRITES_DELTA' 'PLSEXEC_TIME_T

  interactivity=interactivity, compiler=compiler, result=result)


### Changing Matrix Shapes

Changes dataframe shape, in an attempt to drop all numeric data. Below's aggregated data is done so on:
* SNAP_ID
* INSTANCE_NUMBER
* DBID
* SQL_ID

In [4]:
print("Shape Before Aggregation: " + str(rep_hist_snapshot_df.shape))
#
# Group By Values by SNAP_ID , sum all metrics (for table REP_HIST_SNAPSHOT) and drop all numeric
df = rep_hist_snapshot_df.groupby(['SNAP_ID'])['SQL_ID'].apply(list).reset_index()
#
print("Shape After Aggregation: " + str(df.shape))
print(type(df))
print(df.head(100))

Shape Before Aggregation: (64912, 90)
Shape After Aggregation: (820, 2)
<class 'pandas.core.frame.DataFrame'>
    SNAP_ID                                             SQL_ID
0     28190  [03ggjrmy0wa1w, 06dymzb481vnd, 0aq14dznn91rg, ...
1     28191  [04kug40zbu4dm, 0a08ug2qc1j82, 0a08ug2qc1j82, ...
2     28192  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
3     28193  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
4     28194  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
5     28195  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
6     28196  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
7     28197  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
8     28198  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
9     28199  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
10    28200  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
11    28201  [06g9mhm5ba7tt, 09vrdx888wvvb, 0kcbwucxmazcp, ...
12    28202  [03ggjrmy0wa1w, 06dymzb481vnd, 0aq14dznn91rg, ...
13    28

### Data Ordering

Sorting of datasets in order of SNAP_ID.

In [5]:
df.sort_index(ascending=True,inplace=True)
print(df.shape)
print(df.head(100))

(820, 2)
    SNAP_ID                                             SQL_ID
0     28190  [03ggjrmy0wa1w, 06dymzb481vnd, 0aq14dznn91rg, ...
1     28191  [04kug40zbu4dm, 0a08ug2qc1j82, 0a08ug2qc1j82, ...
2     28192  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
3     28193  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
4     28194  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
5     28195  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
6     28196  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
7     28197  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
8     28198  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
9     28199  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
10    28200  [06g9mhm5ba7tt, 0kcbwucxmazcp, 0kkhhb2w93cx0, ...
11    28201  [06g9mhm5ba7tt, 09vrdx888wvvb, 0kcbwucxmazcp, ...
12    28202  [03ggjrmy0wa1w, 06dymzb481vnd, 0aq14dznn91rg, ...
13    28203  [01tp87bk1t2zv, 01tp87bk1t2zv, 01tp87bk1t2zv, ...
14    28204  [06dymzb481vnd, 0y080mnfaqk3u, 0y

### Univariate Selection

This sextion treats the dataset as a univariate dataset. Therefore the SNAP_ID pertaining to each set of SQL_IDs is removed, with the intent of future classifiers training solely on past SQL executions

In [6]:
print(df.shape)
del df['SNAP_ID']
print(df.shape)

(820, 2)
(820, 1)


### Label Encoding

Since this experiment deals with prediction of upcoming SQL_IDs, respectice SQL_ID strings need to labelled as a numeric representation. Label Encoder will be used here to convert SQL_ID's into a numeric format, which are in turn used for training. Evaluation (achieved predictions) is done so also in numeric format, at which point the label encoder is eventually used to decode back the labels into the original, respetive SQL_ID representation.

This section of the experiment additionally converts the targetted label into a binarized version of the previous achieved categorical numeric values.

* https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html

In [7]:
print(df.shape)
temp_df = df
le = preprocessing.LabelEncoder()
df = df.applymap(lambda x: le.fit_transform(x))
print(df.shape)
print("\n----------------------------------\n\nAvailable Classes:")
print('Total SQL_ID Classes' + str(len(le.classes_)))
print(le.classes_)
print(temp_df.head())
del temp_df
print(df.head())

(820, 1)
(820, 1)

----------------------------------

Available Classes:
Total SQL_ID Classes55
['06dymzb481vnd' '0ga8vk4nftz45' '13a9r2xkx1bxb' '14f5ngrj3cc5h'
 '1fn8v91f0arf0' '1p5grz1gs7fjq' '1r4kjczvthaqy' '1u97hwfu7dcmz'
 '1v2b661suttyp' '20bqsr6btd9x9' '26jdypa362wv9' '2j25hzq35w45h'
 '2pz0tqbv91m11' '2wuhkcaz4uhs5' '327nhwy0369kr' '38243c4tqrkxm'
 '39nyc1pykjg41' '3rd3sp2ak89rc' '4u268zn6r57tm' '4vym5a5jcq88d'
 '54qdvyrqsg8m6' '5g88vmdgd99f7' '5r6n3gv8bjpf9' '616m6uhpa2usu'
 '6fvfqaw68q59b' '6zcux9jb78w36' '6zs29hb3gpcf5' '76ds5wxsv7f5t'
 '7fbzhzg6ysu25' '7vtvbg7s3zcyp' '84aqqjbf6dkt3' '84ntdbh48ctu9'
 '85cmvvurya34f' '87gtj5jaq4a3t' '8cxgpdw3qqxqg' '9dnnpagwcg2cu'
 '9ffht8tuysgx9' '9rmg1ukgcyzpv' 'a6fy23us0jz84' 'a6g97rawd3ggv'
 'b4j3z1g2nwfys' 'bkq9pjcfvm9vn' 'c1x7a33hxwfyy' 'c2bxq4kd3uj1t'
 'chbj5w1vwums1' 'ckfvfhzy0qrws' 'ct4bbu3duky6v' 'cws42h8dn8wta'
 'd2tvgg49y2ap6' 'd2zx61xsr2xfn' 'd6vqfmt62hypx' 'd7w1dugmzb9n9'
 'dh7u76hwz04bq' 'f9hk3q1y2b8nt' 'fc0va0vju750z']
        

### Data Normalization

A note regarding normalization. Normalization for this experiment was purposely skipped, since value dimensionality & size is not as important for RandomForest based models. The purity split  does not benefit greatly from such a process:

* https://stats.stackexchange.com/questions/57010/is-it-essential-to-do-normalization-for-svm-and-random-forest
* https://stackoverflow.com/questions/8961586/do-i-need-to-normalize-or-scale-data-for-randomforest-r-package
* https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/1471-2105-8-25

### Feature Padding

Since there isn't a fixed number of SQL_ID's per SNAP_ID, each set of SQL_IDs need to be padded so as to assume an equal number if SQL_IDs for the purpose of model fitting.

In [8]:
print("Length at index 0: " + str(df['SQL_ID'].iloc[0].size))
print(df['SQL_ID'].iloc[0])
print("Length at index 1: " + str(df['SQL_ID'].iloc[1].size))
print(df['SQL_ID'].iloc[1])
print("Length at index 2: " + str(df['SQL_ID'].iloc[2].size))
print(df['SQL_ID'].iloc[2])
#
# Retrieve largest length
def pad_datamatrix(df):
    """
    Iterates over dataframe and pads SQL_ID lists accordingly with -1 values
    """
    row_sizes = []
    for index, row in df.iterrows():
        row_sizes.append(len(row['SQL_ID']))
    max_row_size = max(row_sizes)
    #
    # Pad Dataframe Values
    i = 0
    for index, row in df.iterrows():
        length = len(row['SQL_ID'])
        diff = max_row_size - length
        if diff != 0:
            for j in range(length, max_row_size):
                df['SQL_ID'].iloc[i] = np.append(df['SQL_ID'].iloc[i], -1) # Appends -1 to padded values
        # print("Length at index " + str(i) + ": " + str(df['SQL_ID'].iloc[i].size))
        i += 1
    return df
#
df = pad_datamatrix(df)
#
print('\n\n------------------------------------------\n\n')
print("Length at index 0: " + str(df['SQL_ID'].iloc[0].size))
print(df['SQL_ID'].iloc[0])
print("Length at index 1: " + str(df['SQL_ID'].iloc[1].size))
print(df['SQL_ID'].iloc[1])
print("Length at index 2: " + str(df['SQL_ID'].iloc[2].size))
print(df['SQL_ID'].iloc[2])

Length at index 0: 80
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 52 52 52 52 53 54 55 55 55 56 57 58 59 59 59 59 60 61 62
 63 64 65 66 67 68 69 70]
Length at index 1: 81
[ 0  1  1  2  2  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 19
 20 21 22 23 24 24 25 26 26 27 28 29 29 30 31 32 33 34 35 36 37 38 39 39
 40 41 42 43 44 45 46 47 47 48 49 50 51 51 51 51 52 52 52 53 54 55 56 57
 58 59 60 61 62 63 64 64 65]
Length at index 2: 91
[ 0  0  0  1  2  2  2  3  4  5  6  7  8  9  9 10 11 12 13 13 14 15 16 17
 17 18 19 19 20 21 22 23 24 25 26 26 26 27 28 29 30 31 32 33 34 35 36 37
 37 38 39 40 40 40 40 40 41 42 43 44 44 44 45 45 46 47 48 49 49 49 49 50
 51 51 51 51 52 52 52 53 54 55 56 56 57 58 59 60 61 61 62]


------------------------------------------


Length at index 0: 109
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 3

### Time Series Shifting

Shifting the datasets N lag minutes, in order to transform the problem into a supervised dataset. Each Lag Shift equates to 60 seconds (due to the way design of the data capturing tool). For each denoted lag amount, the same number of feature vectors will be stripped away at the beginning.

Features and Labels are separated into seperate dataframes at this point.

https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

In [9]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = data
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    if n_in != 0:
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    n_out += 1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
#
def remove_n_time_steps(data, n=1):
    if n == 0:
        return data
    df = data
    headers = df.columns
    dropped_headers = []
    #
    for i in range(1,n+1):
        for header in headers:
            if "(t+"+str(i)+")" in header:
                dropped_headers.append(str(header))
    #
    return df.drop(dropped_headers, axis=1) 
#
# Frame as supervised learning set
shifted_df = series_to_supervised(df, lag, lag)
#
# Seperate labels from features
y_row = []
for i in range(lag+1,(lag*2)+2):
    y_df_column_names = shifted_df.columns[len(df.columns)*i:len(df.columns)*i + 1]
    y_row.append(y_df_column_names)
y_df_column_names = []   
for row in y_row:
    for val in row:
        y_df_column_names.append(val)
#
# y_df_column_names = shifted_df.columns[len(df.columns)*lag:len(df.columns)*lag + len(y_label)]
y_df = shifted_df[y_df_column_names]
X_df = shifted_df.drop(columns=y_df_column_names)
print('\n-------------\nFeatures')
print(X_df.columns)
print(X_df.shape)
print('\n-------------\nLabels')
print(y_df.columns)
print(y_df.shape)
#
# Delete middle timesteps
X_df = remove_n_time_steps(data=X_df, n=lag)
print('\n-------------\nFeatures After Time Shift')
print(X_df.columns)
print(X_df.shape)
# y_df = remove_n_time_steps(data=y_df, n=lag)
print('\n-------------\nLabels After Time Shift')
print(y_df.columns)
print(y_df.shape)


-------------
Features
Index(['var1(t-3)', 'var1(t-2)', 'var1(t-1)', 'var1(t)'], dtype='object')
(814, 4)

-------------
Labels
Index(['var1(t+1)', 'var1(t+2)', 'var1(t+3)'], dtype='object')
(814, 3)

-------------
Features After Time Shift
Index(['var1(t-3)', 'var1(t-2)', 'var1(t-1)', 'var1(t)'], dtype='object')
(814, 4)

-------------
Labels After Time Shift
Index(['var1(t+1)', 'var1(t+2)', 'var1(t+3)'], dtype='object')
(814, 3)


### Expand Feature Lists

Expand Feature Lists, where in each list element is represented as it's own features. Total feature count here equates as follows:

Features = (lag * SQL_ID per SNAP_ID count) + SQL_ID per SNAP_ID count
Labels = lag * SQL_ID per SNAP_ID count

In [17]:
feature_count = len(X_df['var1(t)'].iloc[0])
print(feature_count)

[ 0  0  0  1  2  2  3  3  3  4  5  6  7  8  9 10 11 11 12 13 14 15 16 16
 17 18 19 20 21 21 22 23 23 24 25 26 26 27 28 29 30 31 31 31 32 33 34 35
 35 36 37 38 39 40 41 41 42 43 43 43 43 43 44 45 46 47 47 47 48 49 49 50
 51 52 53 54 54 54 54 55 56 56 56 57 58 59 60 61 62 63 64 64 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
109


### RandomForest Classification (Many To Many)

Classification attemps using RFC

In [10]:
#
# Random Forest
class RandomForest:
    """
    Random Forest Class (Regression + Classification)
    """
    #
    def __init__(self, n_estimators, max_depth=None,parallel_degree=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.parallel_degree=parallel_degree
        self.model = RandomForestClassifier(max_depth=self.max_depth,
                                            n_estimators=self.n_estimators,
                                            n_jobs=self.parallel_degree)
    #
    def fit_model(self, X_train, y_train):
        """
        Fits training data to target labels
        """
        self.model.fit(X_train,y_train)
        print(self.model)
    #
    def predict(self, X):
        yhat = self.model.predict(X)
        return yhat
    #
    def predict_and_evaluate(self, X, y, y_labels, plot=False):
        """
        Runs test data through previously trained model, and evaluate differently depending if a regression of classification model
        """
        yhat = self.predict(X)
        #
        # F1-Score Evaluation
        for i in range(y.shape[1]):
            f1 = f1_score(y[:,i], yhat[:,i], average='micro') # Calculate metrics globally by counting the total true positives, false negatives and false positives.
            print('Test FScore ' + y_labels[0] + ' with LAG value [' + str(i) + ']: ' +  str(f1))
        #
        if plot:
            for i in range(0, len(y[0])):
                plt.rcParams['figure.figsize'] = [20, 15]
                plt.plot(y[:,i], label='actual')
                plt.plot(yhat[:,i], label='predicted')
                plt.legend(['actual', 'predicted'], loc='upper left')
                plt.title(y_labels[i%len(y_labels)] + " +" + str(math.ceil((i+1)/len(y_label))))
                plt.show()
    #
    @staticmethod
    def write_results_to_disk(path, iteration, lag, test_split, estimator, score, time_train):
        file_exists = os.path.isfile(path)
        with open(path, 'a') as csvfile:
            headers = ['iteration', 'lag', 'test_split', 'estimator', 'score', 'time_train']
            writer = csv.DictWriter(csvfile, delimiter=',', lineterminator='\n', fieldnames=headers)
            if not file_exists:
                writer.writeheader()  # file doesn't exist yet, write a header
            writer.writerow({'iteration': iteration,
                             'lag': lag,
                             'test_split': test_split,
                             'estimator': estimator,
                             'score': score,
                             'time_train': time_train})

In [11]:
X_train, X_validate, y_train, y_validate = train_test_split(X_df, y_df, test_size=test_split)
# X_train = X_train.values
# y_train = y_train.values
#
print("X_train shape [" + str(X_train.shape) + "] Type - " + str(type(X_train)))
print("y_train shape [" + str(y_train.shape) + "] Type - " + str(type(y_train)))
#
X_validate, X_test, y_validate, y_test = train_test_split(X_validate, y_validate, test_size=.5)
#
# X_validate = X_validate.values
# y_validate = y_validate.values
print("X_validate shape [" + str(X_validate.shape) + "] Type - " + str(type(X_validate)))
print("y_validate shape [" + str(y_validate.shape) + "] Type - " + str(type(y_validate)))
#
# X_test = X_test.values
# y_test = y_test.values
print("X_test shape [" + str(X_test.shape) + "] Type - " + str(type(X_test)))
print("y_test shape [" + str(y_test.shape) + "] Type - " + str(type(y_test)) + "\n------------------------------")
#
print(X_train[0:1])
print(y_train[0:1])
print('------------------------------------------------------------')
print(X_validate[0:1])
print(y_validate[0:1])
print('------------------------------------------------------------')
print(X_test[0:1])
print(y_test[0:1])
#
# Train on discrete data (Train > Validation)
print('Training + Validation')
model = RandomForest(n_estimators=n_estimators,
                     parallel_degree=parallel_degree)
model.fit_model(X_train=X_train,
                y_train=y_train)
model.predict_and_evaluate(X=X_validate,
                           y=y_validate,
                           y_labels=y_label,
                           plot=True)
#
# Train on discrete data (Train + Validation > Test)
print('\n\nTraining + Testing')
model.fit_model(X_train=X_validate,
                y_train=y_validate)
model.predict_and_evaluate(X=X_test,
                           y=y_test,
                           y_labels=y_label,
                           plot=True)

X_train shape [(651, 4)] Type - <class 'pandas.core.frame.DataFrame'>
y_train shape [(651, 3)] Type - <class 'pandas.core.frame.DataFrame'>
X_validate shape [(81, 4)] Type - <class 'pandas.core.frame.DataFrame'>
y_validate shape [(81, 3)] Type - <class 'pandas.core.frame.DataFrame'>
X_test shape [(82, 4)] Type - <class 'pandas.core.frame.DataFrame'>
y_test shape [(82, 3)] Type - <class 'pandas.core.frame.DataFrame'>
------------------------------
                                             var1(t-3)  \
462  [24, 43, 29, 41, 44, 3, 15, 20, 25, 12, 30, 49...   

                                             var1(t-2)  \
462  [26, 42, 16, 29, 5, 28, 22, 35, 38, 3, 10, 9, ...   

                                             var1(t-1)  \
462  [28, 46, 7, 53, 6, 18, 4, 20, 60, 49, 0, 22, 4...   

                                               var1(t)  
462  [57, 42, 4, 15, 9, 35, 49, 53, 0, 12, 37, 52, ...  
                                             var1(t+1)  \
462  [47, 7, 38, 57, 5, 11

ValueError: setting an array element with a sequence.