# Guidepost Tutorial Notebook

## Provided Tutorial Datasets
test_data_med.parquet


### 1. Import and Initialize Guidepost Along With Pandas

In [1]:
# import pandas as pd
# from guidepost import Guidepost
# %env ANYWIDGET_HMR=1

# gp = Guidepost()

env: ANYWIDGET_HMR=1


In [67]:
import anywidget
import traitlets
import pandas as pd
import json
import os

class Guidepost(anywidget.AnyWidget):
    """Notebook widget holding the data and encoding rules for a Guidepost view.

    The traits tagged ``sync=True`` are the values exchanged with the
    JavaScript module referenced by ``_esm``.
    """

    # Relative path to the front-end JavaScript module.
    _esm = os.path.join("../guidepost/guidepost.js")
    # Dataset to visualize, as a plain dict (e.g. the result of DataFrame.to_dict()).
    vis_data = traitlets.Dict({}).tag(sync=True)
    # Encoding rules: column names for the x/y axes, the color column and its
    # aggregation, and the categorical column.
    vis_configs = traitlets.Dict({
        'x': 'submit_time',
        'y': 'queue_wait',
        'color': 'nodes_req',
        'color_agg': 'avg',
        'categorical': 'user'}).tag(sync=True)
    # JSON text describing the current selection; decoded by
    # retrieve_selected_data(). Presumably written by the front end -- TODO confirm.
    selected_records = traitlets.Unicode("[]").tag(sync=True)
    # Result of the most recent retrieve_selected_data() call.
    records_df = pd.DataFrame()

    def retrieve_selected_data(self):
        """Return the records held in ``selected_records`` as one DataFrame.

        ``selected_records`` is decoded as JSON. Every key of the decoded
        object maps to a collection of records; each collection is turned
        into a DataFrame and the frames are stacked in key order, keeping
        their original index values. The result is also stored on
        ``self.records_df``.

        Returns:
            pandas.DataFrame: the combined records, or an empty DataFrame
            when there is no selection (``""``, ``"[]"`` or ``"{}"``).

        Raises:
            json.JSONDecodeError: if ``selected_records`` is non-empty but
                is not valid JSON.
        """
        raw = self.selected_records
        # A blank trait value means "nothing selected"; json.loads("") would raise.
        selection = json.loads(raw) if raw.strip() else {}

        # Build every per-key frame first and concatenate once, instead of
        # re-copying the accumulated frame on each iteration.
        frames = [pd.DataFrame.from_records(selection[key]) for key in selection]
        # pd.concat rejects an empty list, so fall back to an empty frame.
        self.records_df = pd.concat(frames) if frames else pd.DataFrame()

        return self.records_df
    
# Create the widget instance that the following cells configure and display.
gp = Guidepost()

### 2. Load, Validate and Specify the Columns You are Interested In

Note the following:
- Guidepost does not allow JSON in a data column, so be sure to remove any columns that store complex JSON strings.
- Guidepost expects that there will not be any NaNs in the data, so please run `.dropna()` on your dataset.
- `.vis_data` expects a Python dictionary, so be sure to call `.to_dict()` on any pandas dataframe you pass into Guidepost.
- Guidepost is not yet optimized for very large datasets, so try to minimize the number of columns present in the data passed to the visualization.

In [112]:
# Columns handed to the visualization.
guidepost_columns = [
    'account',
    'accounting_qos',
    'day_of_week',
    'day_of_week_num',
    'hour_of_day',
    'hours_used',
    'cpu_eff',
    'job_array_id',
    'nodes_req',
    'partition',
    'processors_req',
    'processors_used',
    'qcd',
    'qos',
    'queue_avg_mem',
    'queue_depth_log',
    'queue_avg_size',
    'queue_wait',
    'queue_wait_pred',
    'start_time',
    'submit_time',
    'user',
    'wallclock_req',
]

# Read the tutorial dataset.
jobs_data_df = pd.read_parquet("data/test_data_med.parquet")

# Drop rows containing NaNs, keep only the listed columns, and pass the
# result to the widget as a plain dict.
gp.vis_data = jobs_data_df.dropna()[guidepost_columns].to_dict()

### 3. Configure the encoding rules for the visualization

Load configurations into the `gp.vis_configs` attribute.

Vis configurations must be specified as a Python dictionary with the following fields:

- 'x': The column name from the pandas dataframe which will be shown on the x axis. The data in the column can be integers, floats or datetimes.
- 'y': The column name from the pandas dataframe which will be shown on the y axis of this visualization. The data in the column can be integers or floats.
- 'color': The column name from the pandas dataframe which will determine the color of squares in the main summary view. The data in the column can be integers or floats.
- 'color_agg': This is a specification for what aggregation is used for the color variable. It can be: 'avg', 'variance', 'std', 'sum', or 'median'
- 'categorical': A categorical variable from the dataset. The data in the column must be a string. The visualization will show the top 7 instances of this variable.

In [113]:
# Encoding rules: which dataframe column drives each visual channel.
gp.vis_configs = dict(
    x='queue_wait_pred',
    y='queue_wait',
    color='nodes_req',
    color_agg='avg',
    categorical='user',
    # facet_by='partition',
)

### Run the Visualization by calling the guidepost object in its own cell: `gp`




In [106]:
# Leaving the widget as the cell's only expression displays the visualization.
gp

Guidepost(selected_records='', vis_configs={'x': 'queue_wait_pred', 'y': 'queue_wait', 'color': 'nodes_req', '…

In [53]:
# Convert the widget's current `selected_records` into a pandas DataFrame.
gp.retrieve_selected_data()