# Schedule TPC-DS 1 Descriptor

This notebook contains work pertaining to pattern learning / identification for a database workload schedule. It contains descriptors of the available data through plot visualizations, so as to better understand which resources play a part in reflecting the underlying workloads.

## Data Preprocessing

### Module Installation and Importing Libraries

In [7]:
# Module Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Configuration Cell

Adjust the parameters in this cell to influence the outcome of the experiment.
NB: This experiment is demonstrated at time step = 1 (1 minute in advance). Further down in the experiment, results for other time steps are also featured and evaluated.

In [8]:
# Workload identifier; it is appended to the root data directory to locate the schedule files.
tpcds = 'TPCDS1'

In [9]:
# Root directory of the schedule data for the selected workload.
# Alternative root path, kept for reference:
#root_dir = 'C:/Users/gabriel.sammut/University/Data_ICS5200/Schedule/' + tpcds
root_dir = ''.join(('D:/Projects/Datagenerated_ICS5200/Schedule/', tpcds))

# Scheduler message log. The handle is left open here on purpose: it is
# consumed by the parser and closed once parsing has finished.
scheduler_log_path = ''.join((root_dir, '/msg_log_tpcds1scheduler_20181116'))
scheduler_log_file = open(scheduler_log_path, mode='r')

def prettify_header(headers):
    """
    Cleans header list from unwanted characters.

    Every '(', ')', "'" and ',' is removed from each header string.

    :param headers: Iterable of header strings.
    :return: New list with the cleaned headers, in the same order as the input.
    """
    # One translation table deletes all four characters in a single pass,
    # instead of chaining four .replace() calls per header.
    unwanted = str.maketrans('', '', "()',")
    return [header.translate(unwanted) for header in headers]

In [10]:
class ParseLogs:
    """
    Parses the scheduler log file.

    All methods are static; the class is only used as a namespace.
    """
    @staticmethod
    def parse_log_file(file):
        """
        Parses the scheduling log and retrieves the relevant info.

        :param file: Iterable yielding the log one line at a time, e.g. an open
                     text-mode file handle (not a Pandas dataframe).
        :return: List of records of the form [timestamp, task, snap_id, seconds].
                 'seconds' is read from the first non-skipped line that follows
                 the task line, so a task found on the very last line yields a
                 3-element record without it.
        :raises ValueError: If a task line carries no digits in its last 6
                            characters, or the following line carries no digits
                            at the expected position.
        """
        task_list = ('THROUGHPUT_TEST_1',
                     'THROUGHPUT_TEST_2',
                     'DATA_MAINTENANCE_1',
                     'DATA_MAINTENANCE_2',
                     'GATHER_STATS',
                     'POWER_TEST')
        lines = []
        # True while the latest record still needs its elapsed seconds,
        # which are reported on the line after the task line.
        awaiting_duration = False
        for line in file:
            line = str(line)
            # NOTE(review): as rendered, both arguments are a plain space, which
            # makes this a no-op; presumably it was meant to normalise some other
            # whitespace character - confirm against the raw log file.
            line = line.replace(' ', ' ')
            #
            if 'Metrics successfully written to file' in line:  # Skip this line
                continue
            #
            if awaiting_duration:
                lines[-1].append(ParseLogs.__parse_time(line))
                awaiting_duration = False
                continue
            #
            # The first matching task name wins; list order is significant.
            for task in task_list:
                if task in line:
                    timestamp = ParseLogs.__parse_timestamp(line)
                    snap_id = ParseLogs.__parse_snap_id(line)
                    lines.append([timestamp, task, snap_id])
                    awaiting_duration = True
                    break
        return lines

    @staticmethod
    def __parse_timestamp(data_line):
        """
        Parses timestamp from passed data_line (its first 19 characters).
        """
        return data_line[0:19]

    @staticmethod
    def __parse_snap_id(data_line):
        """
        Parses log line and retrieves SNAP_ID.

        Only the last 6 characters of the line are inspected (a trailing newline
        counts as one of them); the decimal digits found there, in their original
        order, form the id.

        :raises ValueError: If none of those characters is a digit.
        """
        digits = ''.join(char for char in data_line[-6:] if char.isdecimal())
        if not digits:
            raise ValueError('No SNAP_ID digits found at end of line: %r' % (data_line,))
        return int(digits)

    @staticmethod
    def __parse_time(data_line):
        """
        Parses time in whole seconds from the line following a task line.

        Scanning starts at character index 35 (position determined by the log
        file structure) and stops at the first '.', so any fractional part is
        dropped. Characters that are neither digits nor '.' are ignored.

        :raises ValueError: If no digit is found before the first '.'.
        """
        digits = []
        for char in str(data_line)[35:]:
            if char == '.':
                break
            if char.isdecimal():
                digits.append(char)
        if not digits:
            raise ValueError('No elapsed-time digits found in line: %r' % (data_line,))
        return int(''.join(digits))

# Parse the scheduler log. The handle opened earlier is closed even when
# parsing raises, so a malformed log does not leak the open file.
try:
    parsed_log_file = ParseLogs.parse_log_file(scheduler_log_file)
finally:
    scheduler_log_file.close()
print(parsed_log_file)

[['2018-11-16 17:34:06', 'GATHER_STATS', 3413, 32], ['2018-11-16 17:35:56', 'POWER_TEST', 3415, 110], ['2018-11-16 17:38:30', 'THROUGHPUT_TEST_1', 3417, 154], ['2018-11-16 17:40:57', 'DATA_MAINTENANCE_1', 3420, 147], ['2018-11-16 17:43:37', 'THROUGHPUT_TEST_2', 3422, 159], ['2018-11-16 17:46:04', 'DATA_MAINTENANCE_2', 3425, 147], ['2018-11-16 17:46:37', 'GATHER_STATS', 3425, 32], ['2018-11-16 17:48:27', 'POWER_TEST', 3427, 110], ['2018-11-16 17:51:00', 'THROUGHPUT_TEST_1', 3430, 152], ['2018-11-16 17:53:27', 'DATA_MAINTENANCE_1', 3432, 147], ['2018-11-16 17:56:06', 'THROUGHPUT_TEST_2', 3435, 158], ['2018-11-16 17:58:32', 'DATA_MAINTENANCE_2', 3437, 146], ['2018-11-16 17:59:07', 'GATHER_STATS', 3438, 34], ['2018-11-16 18:00:58', 'POWER_TEST', 3439, 110], ['2018-11-16 18:03:31', 'THROUGHPUT_TEST_1', 3442, 152], ['2018-11-16 18:05:58', 'DATA_MAINTENANCE_1', 3444, 147], ['2018-11-16 18:08:37', 'THROUGHPUT_TEST_2', 3447, 158], ['2018-11-16 18:11:04', 'DATA_MAINTENANCE_2', 3449, 146], ['2018