# Data Profiler

## Quick Start
1. Run the cells in this notebook (Run > Run All Cells)
1. Add your login info and dataset ID, and click **Login**
1. Select the features you wish to run, and a page name for the output, and click **Run Scaffolding**

Checkout the **README.md** in the `docs/` for a detailed description of the profiler features

## Jupyter Notebook Tips
* Jupyter Notebooks are made up of "cells"
    * These cells can be Markdown (like this one),
    * or they can contain Python code
* Cells must be *run* for the code to be executed
    * This can be done with `Ctrl+Enter` or `Shift+Enter`
    * You can also run cells from the "Run" and "Kernel" menus
* Cells can be *hidden*, which makes a Notebook easier to view
    * Hidden cells are displayed as 3 dots
    * Expand hidden cells by clicking the dots
    * Hide cells by clicking the blue bar to the left of the cell

<br>

---

In [1]:
## Just hit cmd enter to run this cell, then click on the blue bar to the left to hide this cell again - Thanks##

try:
    __import__('ipywidgets')
except ImportError:
    print('ipywidgets not found, installing now')
    !pip install ipywidgets==8.0.4

import ipywidgets as widgets
import time
import json
import requests
import copy
from getpass import getpass

from IPython.display import display, HTML as ipyHTML
from ipywidgets import (
    Label, Button, HBox, VBox,
    Layout, Checkbox, Text, Output,
    Password, Dropdown, HTML, RadioButtons
)

URL = {
    'login':                        '{}/api/content/v2/authentication',
    'userInfo':                     '{}/api/content/v2/users/me',
    'card':                         '{}/api/content/v1/cards/kpi',
    'cardV2':                       '{}/api/content/v2/cards/kpi',
    'getSchemaFromDataSource':      '{}/api/data/v2/datasources/{}/schemas/indexed',
    'getDataSourceDetails':         '{}/api/data/v3/datasources/{}',
    'createDataSource':             '{}/api/data/v2/datasources',
    'dataFlow':                     '{}/api/dataprocessing/v1/dataflows',
    'dataflowRun':                  '{}/api/dataprocessing/v1/dataflows/{}/executions', 
    'webforms':                     '{}/api/data/v2/webforms',
    'createPage':                   '{}/api/content/v1/pages?layout=false',
    'getPageDetails':               '{}/api/content/v3/stacks/{}/cards?includePageLayouts=true&parts=metadata,datasources,drillPathURNs',
    'addTagToDataSource':           '{}/api/data/ui/v3/datasources/{}/tags',
    'addTagToDataFlow':             '{}/api/dataprocessing/v1/dataflows/{}/tags',
}

jsons = {
    ## Cards for Profile
    'templateHist': {'conditionalFormats': {'card': [], 'datasource': []}, 'formulas': {'card': [{'id': 'calculation_1', 'name': 'Summary Number col_name', 'formula': "summary", 'status': 'VALID', 'persistedOnDataSource': False, 'dataType': 'numeric', 'isAggregatable': False, 'bignumber': False}], 'dsUpdated': [], 'dsDeleted': []}, 'subscriptions': {'big_number': {'name': 'big_number', 'dataSourceId': 'dsid', 'columns': [{'column': None, 'aggregation': None, 'formulaId': 'calculation_1', 'dataSourceId': 'dsid', 'calendar': None, 'mapping': None, 'alias': None, 'format': {'type': 'abbreviated', 'format': '0', 'default': False}}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': None, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}, 'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'ITEM', 'alias': None, 'format': None}, {'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'VALUE', 'alias': None, 'format': None}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': {'column': 'Event_Time', 'dateTimeElement': None}, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}}, 'slicers': [], 'title': 'col_name Histogram', 'description': '', 'chartType': 'badge_vert_histogram', 'chartVersion': '6', 'goal': None, 'noSummary': False, 'allowTableDrill': True, 'locked': None, 'hideColumns': None, 'metadataOverrides': {}, 'preferredFullSize': None, 'cardId': None, 'urn': None}
    ,'templateDate': {'conditionalFormats': {'card': [], 'datasource': []}, 'formulas': {'card': [{'id': 'calculation_1', 'name': 'NULL Count col_name', 'formula': "CONCAT(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END),' Nulls | ',ROUND(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END)/(COUNT(`col_name`)+SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END))*100,0),'% of all rows')", 'status': 'VALID', 'persistedOnDataSource': False, 'dataType': 'numeric', 'isAggregatable': False, 'bignumber': False}], 'dsUpdated': [], 'dsDeleted': []}, 'subscriptions': {'big_number': {'name': 'big_number', 'dataSourceId': 'dsid', 'columns': [{'column': None, 'aggregation': None, 'formulaId': 'calculation_1', 'dataSourceId': 'dsid', 'calendar': None, 'mapping': None, 'alias': 'NULL Count col_name', 'format': {'type': 'abbreviated', 'format': '0', 'default': False}}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': None, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}, 'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'ITEM', 'alias': None, 'format': None}, {'column': 'col_name', 'aggregation': 'COUNT', 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'VALUE', 'alias': None, 'format': None}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': {'column': 'col_name', 'dateTimeElement': None}, 'orderBy': [], 'groupBy': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None}], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}}, 'slicers': [], 'title': 'col_name', 'description': '', 'chartType': 'badge_vert_bar', 'chartVersion': '6', 'goal': None, 'noSummary': False, 'allowTableDrill': True, 'locked': None, 'hideColumns': None, 'metadataOverrides': {}, 'preferredFullSize': None, 'cardId': None, 'urn': 'null'}
    ,'templateCat': {'conditionalFormats': {'card': [], 'datasource': []}, 'formulas': {'card': [{'id': 'calculation_1', 'name': 'NULL Count col_name', 'formula': "CONCAT(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END),' Nulls | ',ROUND(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END)/(COUNT(`col_name`)+SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END))*100,0),'% of all rows')", 'status': 'VALID', 'persistedOnDataSource': False, 'dataType': 'numeric', 'isAggregatable': False, 'bignumber': False}], 'dsUpdated': [], 'dsDeleted': []}, 'subscriptions': {'big_number': {'name': 'big_number', 'dataSourceId': 'dsid', 'columns': [{'column': None, 'aggregation': None, 'formulaId': 'calculation_1', 'dataSourceId': 'dsid', 'calendar': None, 'mapping': None, 'alias': 'NULL Count col_name', 'format': {'type': 'abbreviated', 'format': '0', 'default': False}}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': None, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}, 'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'ITEM', 'alias': None, 'format': None}, {'column': 'col_name', 'aggregation': 'COUNT', 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'VALUE', 'alias': None, 'format': None}, {'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'SERIES', 'alias': None, 'format': None}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': {'column': 'Order Date', 'dateTimeElement': None}, 'orderBy': [], 'groupBy': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None}], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}}, 'slicers': [], 'title': 'col_name', 'description': '', 'chartType': 'badge_horiz_bar', 'chartVersion': '6', 'goal': None, 'noSummary': False, 'allowTableDrill': True, 'locked': None, 'hideColumns': None, 'metadataOverrides': {'range_filter_y': 'none', 'range_filter_time': 'none', 'title_x': 'Categories in col_name', 'title_y': 'Rows in col_name', 'range_filter_x': 'none', 'row_filter': 'none', 'series_filter': 'none', 'hide_series': 'none', 'datalabel_text': '%_PERCENT_OF_TOTAL','datalabel_fill': True,'hover_text': '%_CATEGORY_NAME \n%_VALUE  (%_PERCENT_OF_TOTAL)', 'total_sort': 'Descending'}, 'preferredFullSize': None, 'cardId': None, 'urn': None}
    ,'templateScat': {'conditionalFormats': {'card': [], 'datasource': []}, 'formulas': {'card': [{'id': 'calculation_1', 'name': 'Correlation', 'formula': 'CASE WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.7000 THEN CONCAT(\'<b><font color="#31689B">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.5000 THEN CONCAT(\'<b><font color="#72B0D7">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.3000 THEN CONCAT(\'<b><font color="#B7DAF5">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') ELSE CONCAT(\'<b><font color="#D4D4D4">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') END', 'status': 'VALID', 'persistedOnDataSource': False, 'dataType': 'numeric', 'isAggregatable': False, 'bignumber': False}], 'dsUpdated': [], 'dsDeleted': []}, 'subscriptions': {'big_number': {'name': 'big_number', 'dataSourceId': 'dsid', 'columns': [{'column': None, 'aggregation': None, 'formulaId': 'calculation_1', 'dataSourceId': 'dsid', 'calendar': None, 'mapping': None, 'alias': 'Correlation', 'format': {'type': 'abbreviated', 'format': '0.0000', 'default': False}}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': None, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}, 'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'XTIME', 'alias': 'col_name', 'format': None}, {'column': 'dvar', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'VALUE', 'alias': 'dvar', 'format': None}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': {'column': 'Event_Time', 'dateTimeElement': None}, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}}, 'slicers': [], 'title': 'col_name vs. dvar Scatterplot', 'description': '', 'chartType': 'badge_xyscatterplot_basic', 'chartVersion': '6', 'goal': None, 'noSummary': False, 'allowTableDrill': True, 'locked': None, 'hideColumns': None, 'metadataOverrides': {'show_regression_lines': 'true', 'title_x': 'col_name', 'title_y': 'dvar', 'regression_line_style': 'Default', 'regression_line_color': '#FB8D34'}, 'preferredFullSize': None, 'cardId': None, 'urn': None}
    ,'templateBox': {'conditionalFormats': {'card': [], 'datasource': []}, 'formulas': {'card': [], 'dsUpdated': [], 'dsDeleted': []}, 'subscriptions': {'big_number': {'name': 'big_number', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': 'AVG', 'formulaId': 'calculation_1', 'dataSourceId': 'dsid', 'calendar': None, 'mapping': None, 'alias': 'Mean of col_name', 'format': {'type': 'number', 'format': '###,##0', 'default': False}}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': None, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}, 'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'col_name', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'VALUE', 'alias': 'col_name', 'format': None}, {'column': 'dvar', 'aggregation': None, 'formulaId': None, 'dataSourceId': 'dsid', 'calendar': None, 'mapping': 'ITEM', 'alias': 'dvar', 'format': None}], 'filters': [], 'dateRangeFilter': None, 'dateGrain': {'column': 'DATE', 'dateTimeElement': None}, 'orderBy': [], 'groupBy': [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': None, 'offset': None}}, 'slicers': [], 'title': 'col_name vs. dvar Boxplot', 'description': '', 'chartType': 'badge_horiz_boxplot', 'chartVersion': '6', 'goal': None, 'noSummary': False, 'allowTableDrill': True, 'locked': None, 'hideColumns': None, 'metadataOverrides': {'title_x': 'dvar', 'title_y': 'col_name', 'datalabel_text': '%_VALUE'}, 'preferredFullSize': None, 'cardId': None, 'urn': None}
    ,'genericCardV2': {"subscriptions": {"big_number": {"name": "big_number","dataSourceId": "dsid","columns": [{"column": "values_column","aggregation": "aggType","dataSourceId": "dsid","alias": "aggType of values_column","format": {"type": "abbreviated","format": "#","default": False}}],"filters": [],"orderBy": [],"groupBy": [],"fiscal": False,"projection": False,"distinct": False,"limit": 1},"main": {"name": "main","dataSourceId": "dsid","columns": [{"column": "cat_column","dataSourceId": "dsid","mapping": "ITEM"},{"column": "values_column","aggregation": "aggType","dataSourceId": "dsid","mapping": "VALUE"},{"column": "series_column","dataSourceId": "dsid","mapping": "SERIES"}],"filters": [],"orderBy": [],"groupBy": [{"column": "cat_column","dataSourceId": "dsid"},{"column": "series_column","dataSourceId": "dsid"}],"fiscal": False,"projection": False,"distinct": False,"dateGrain": {}}},"formulas": {"dsUpdated": [],"dsDeleted": [],"card": []},"annotations": {"new": [],"modified": [],"deleted": []},"conditionalFormats": {"card": [],"datasource": []},"slicers": [],"charts": {"main": {"component": "main","chartType": "badgeorient_cType","overrides": {},"goal": None}},"title": "generatedTitle","description": "","chartVersion": "6","allowTableDrill": True}
    ,'corMat_pivotTable': {'subscriptions': {'main': {'name': 'main', 'dataSourceId': 'dsid', 'columns': [{'column': 'Var1', 'dataSourceId': 'dsid', 'mapping': 'ROW'}, {'column': 'Var2', 'dataSourceId': 'dsid', 'mapping': 'COLUMN'}, {'column': 'Correlation', 'aggregation': 'AVG', 'dataSourceId': 'dsid', 'mapping': 'VALUE'}], 'filters': [], 'orderBy': [], 'groupBy': [{'column': 'Var2', 'dataSourceId': 'dsid'}, {'column': 'Var1', 'dataSourceId': 'dsid'}], 'fiscal': False, 'projection': False, 'distinct': False}}, 'formulas': {'dsUpdated': [], 'dsDeleted': [], 'card': []}, 'annotations': {'new': [], 'modified': [], 'deleted': []}, 'conditionalFormats': {'card': [{"condition":{"column":"Correlation","values":["1","0.9"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#31689B","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":[".9",".7"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#4E8CBA","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["0.7","0.5"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#73B0D7FF","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":[".5",".3"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#90C4E4FF","textColor":"#345263","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":[".3",".1"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#B7DAF5FF","textColor":"#3c525b","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":[".1","-.1"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#FFFFFF7D","textColor":"#504e50","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["-.1","-.3"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#B7DAF5FF","textColor":"#3c525b","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["-.3","-.5"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#90C4E4FF","textColor":"#345263","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["-.5","-.7"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#73B0D7FF","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["-.7","-.9"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#4E8CBAFF","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"condition":{"dataSourceId":"773278e2-0a73-4a7c-9242-08c9e5ab2c1a","column":"Correlation","values":["-.9","-1"],"operand":"BETWEEN", 'dataSourceId': 'dsid'},"format":{"color":"#31689BFF","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}}], 'datasource': []}, 'slicers': [{'type': 'string', 'displayType': 'multiple_select', 'dataSourceId': 'dsid', 'name': 'Var1', 'column': 'Var1', 'columnDisplayName': 'Var1', 'operator': 'IN', 'values': [], 'collapsed': True}, {'type': 'string', 'displayType': 'multiple_select', 'dataSourceId': 'dsid', 'name': 'Var2', 'column': 'Var2', 'columnDisplayName': 'Var2', 'operator': 'IN', 'values': [], 'collapsed': True}], 'charts': {'main': {'component': 'main', 'chartType': 'badge_pivot_table', 'overrides': {'subtotal_rows': 'false', 'total_col': 'false', 'subtotal_columns': 'false', 'total_row': 'false'}, 'goal': None}}, 'title': 'Correlation Matrix', 'description': '', 'chartVersion': '6', 'allowTableDrill': True, 'noDateRange': True}
    ,'validFace': {"subscriptions":{"main":{"name":"main","dataSourceId":"dsid","columns":[{"column":"valid","mapping":"VALUE","aggregation":"SUM","dataSourceId":"dsid"},{"column":"valid","dataSourceId":"dsid","aggregation":"SUM"}],"filters":[],"orderBy":[],"groupBy":[],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[],"dsDeleted":[],"card":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[],"datasource":[]},"slicers":[],"charts":{"main":{"component":"main","chartType":"badge_facegauge","overrides":{"green_range_min":"0","green_range_max":"0","red_range_min":"1","red_range_max":"999"},"goal":None}},"title":"Production Data Validated","description":"","chartVersion":"6","noDateRange":True}
    ,'summaryTable':  {'subscriptions':  {'big_number':  {'name': 'big_number', 'dataSourceId': 'dsid', 'columns':      [{'column': 'count_model', 'aggregation': 'MAX', 'dataSourceId': 'dsid', 'alias': 'Total Rows', 'format':  {'type': 'number', 'format': '###,##0', 'default': False}}], 'filters':      [], 'orderBy':      [], 'groupBy':      [], 'fiscal': False, 'projection': False, 'distinct': False, 'limit': 1}, 'main':  {'name': 'main', 'dataSourceId': 'dsid', 'columns':      [{'column': 'column_name_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE', 'alias': 'Column Name', 'format':  {'alignment': 'LEFT', 'style': 'BOLD', 'default': False}},      {'column': 'min_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': '25%_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': 'mean_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': '50%_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': '75%_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': 'max_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'},      {'column': 'count_model', 'dataSourceId': 'dsid', 'mapping': 'VALUE'}], 'filters':      [], 'orderBy':      [], 'groupBy':      [], 'fiscal': False, 'projection': False, 'distinct': False}}, 'formulas':  {'dsUpdated':      [], 'dsDeleted':      [], 'card':      []}, 'annotations':  {'new':      [], 'modified':      [], 'deleted':      []}, 'conditionalFormats':  {'card':      [], 'datasource':      []}, 'slicers':      [], 'charts':  {'main':  {'component': 'main', 'chartType': 'badge_table', 'overrides':  {'header_row_fill_color': '#90C4E4FF', 'header_row': 'Center'}, 'goal': None}}, 'title': 'Summary Statistics', 'description': '', 'chartVersion': '6', 'allowTableDrill': True, 'noDateRange': True}
    ,'validErrorsTable': {"subscriptions":{"big_number":{"name":"big_number","dataSourceId":"dsid","columns":[{"aggregation":"SUM","alias":"Invalid Columns","column":"valid","dataSourceId":"dsid","format":{"format":"0","type":"abbreviated"}}],"filters":[],"groupBy":[],"orderBy":[]},"main":{"name":"main","dataSourceId":"dsid","columns":[{"column":"column name","mapping":"VALUE","alias":"Column Name","dataSourceId":"dsid"},{"column":"reason","mapping":"VALUE","alias":"Reason","dataSourceId":"dsid"}],"filters":[{"column":"valid","dataType":"numeric","dataSourceId":"dsid","slicerEnabled":False,"operand":"GREAT_THAN_EQUALS_TO","values":[1]}],"orderBy":[],"groupBy":[],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[],"dsDeleted":[],"card":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[],"datasource":[]},"slicers":[],"charts":{"main":{"component":"main","chartType":"badge_basic_table","overrides":{"header_row_fill_color":"#4E8CBAFF"},"goal":None}},"title":"Data Validation Errors","description":"","chartVersion":"6","noDateRange":True}
    ,'varImp': {"subscriptions":{"main":{"name":"main","dataSourceId":"dsid","columns":[{"column":"term","dataSourceId":"dsid","mapping":"ITEM"},{"column":"estimate","aggregation":"SUM","dataSourceId":"dsid","mapping":"VALUE"},{"formulaId":"calculation_b36abc0c-9dbd-47ce-8fbe-74428c3f063c","mapping":"SERIES","dataSourceId":"dsid"}],"filters":[],"orderBy":[],"groupBy":[{"formulaId":"calculation_b36abc0c-9dbd-47ce-8fbe-74428c3f063c","dataSourceId":"dsid"},{"column":"term","dataSourceId":"dsid"}],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[{"id":"calculation_b36abc0c-9dbd-47ce-8fbe-74428c3f063c","name":"Statistical Significance","formula":"CASE \n  WHEN `p.value` <= .05\n    THEN 'Significant'\n  ELSE 'Not Significant'\nEND ","status":"VALID","dataType":"string","persistedOnDataSource":True,"isAggregatable":True,"bignumber":False}],"dsDeleted":[],"card":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[{"dataSourceId":None,"condition":{"dataSourceId":"dsid","column":"calculation_b36abc0c-9dbd-47ce-8fbe-74428c3f063c","values":["Significant"],"operand":"IN"},"format":{"color":"#4E8CBAFF","textColor":"#ffffff","textStyle":"PLAIN","applyToRow":True}},{"dataSourceId":None,"condition":{"dataSourceId":"dsid","column":"calculation_b36abc0c-9dbd-47ce-8fbe-74428c3f063c","values":["Not Significant"],"operand":"IN"},"format":{"color":"#A0D7718C","textColor":"#000000","textStyle":"PLAIN","applyToRow":True}}],"datasource":[]},"slicers":[],"charts":{"main":{"component":"main","chartType":"badge_horiz_multi_dotplot","overrides":{"lrg_legend_position":"Bottom","details_legend_position":"Bottom","datalabel_text":"%_CATEGORY_TOTAL","total_sort":"Descending","hide_series":"none","series_filter":"none","row_filter":"none","range_filter_y":"none","range_filter_x":"none","range_filter_cat_x":"none","range_filter_time":"none"},"goal":None}},"title":"Variable Importance","description":"","chartVersion":"6","noDateRange":True}
    ,'varImp2': {"subscriptions":{"main":{"name":"main","dataSourceId":"dsid","columns":[{"column":"term","dataSourceId":"dsid","mapping":"ITEM"},{"column":"statistic","mapping":"VALUE","aggregation":"SUM","dataSourceId":"dsid"},{"column":"estimate","aggregation":"SUM","dataSourceId":"dsid"}],"filters":[],"orderBy":[],"groupBy":[{"column":"term","dataSourceId":"dsid"}],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[],"dsDeleted":[],"card":[{"id":"calculation_aac917da-db1e-4787-a41c-8b2ba3055938","name":"Odds","formula":"-- Only use if using a logistic regression model\nPOWER(2.71828, `estimate`)-1","status":"VALID","dataType":"numeric","persistedOnDataSource":False,"isAggregatable":True,"bignumber":False},{"id":"calculation_901e9206-96f3-41be-9f6b-b7b39dc71ba8","name":"Statistical Significance","formula":"CASE \n\tWHEN `p.value` <= .05\n    THEN 'Significant'\n    ELSE 'Not Significant'\nEND ","status":"VALID","dataType":"string","persistedOnDataSource":False,"isAggregatable":True,"bignumber":False}]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[],"datasource":[]},"slicers":[],"charts":{"main":{"component":"main","chartType":"badge_horiz_stackedbar","overrides":{"title_x":"Independent Variables","title_y":"Relative Impact","total_sort":"Descending","hide_series":"none","series_filter":"none","row_filter":"none","range_filter_y":"none","range_filter_x":"none","range_filter_cat_x":"none","range_filter_time":"none"},"goal":None}},"title":"Variable Importance","description":"","chartVersion":"6","noDateRange":True}
    ,'accuracy': {"subscriptions":{"main":{"name":"main","dataSourceId":"dsid","columns":[{"formulaId":"calculation_2047e14f-1a96-4e79-b76a-cf1be9717a89","mapping":"VALUE","dataSourceId":"dsid"},{"formulaId":"calculation_a40e58a6-0639-408e-b2b2-df21e60523a5","aggregation":"AVG","dataSourceId":"dsid"},{"column":"Actual","dataSourceId":"dsid","aggregation":"SUM"},{"column":"Error","aggregation":"SUM","formulaId":"calculation_a40e58a6-0639-408e-b2b2-df21e60523a5","dataSourceId":"dsid"},{"column":"RMSE","formulaId":"calculation_71993c4e-51f1-4027-a45c-8f88b02d7a0b","dataSourceId":"dsid"}],"filters":[],"orderBy":[],"groupBy":[],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[{"id":"calculation_15ca5591-5e92-42bc-8820-e486a6ae7fe0","name":"Classification Accuracy","formula":"SUM(CASE WHEN `Actual` = `Predicted` THEN 1 ELSE 0 END) / COUNT(`Predicted`)","status":"VALID","dataType":"numeric","persistedOnDataSource":True,"isAggregatable":False,"bignumber":False},{"id":"calculation_a40e58a6-0639-408e-b2b2-df21e60523a5","name":"Error","formula":"`Predicted`-`Actual`","status":"VALID","dataType":"numeric","persistedOnDataSource":True,"isAggregatable":True,"bignumber":False},{"id":"calculation_71993c4e-51f1-4027-a45c-8f88b02d7a0b","name":"RMSE","formula":"POWER(AVG(POWER(`Predicted`-`Actual`,2)),.5)","status":"VALID","dataType":"numeric","persistedOnDataSource":True,"isAggregatable":False,"bignumber":False},{"id":"calculation_2047e14f-1a96-4e79-b76a-cf1be9717a89","name":"MAE","formula":"AVG(ABS(`Predicted`-`Actual`))","status":"VALID","dataType":"numeric","persistedOnDataSource":True,"isAggregatable":False,"bignumber":False}],"dsDeleted":[],"card":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[],"datasource":[]},"slicers":[],"charts":{"main":{"component":"main","chartType":"badge_singlevalue","overrides":{},"goal":None}},"title":"Accuracy","description":"","chartVersion":"6","noDateRange":True}

    ## Dataflows / Datasets
    ,'overrideWebform': {"rows":[["example","0","1000"]],"columns":[{"name":"field_name","valid":True,"type":"STRING","uniqueCount":1},{"name":"override_min","valid":True,"type":"LONG","uniqueCount":1},{"name":"override_max","valid":True,"type":"LONG","uniqueCount":1}],"name":"Validation Override"}
    ,"fe_base": {"documentVersion": 1, "databaseType": "ETL", "name": "Feature Engineering", "scheduleInfo": {}, "description": "", "responsibleUserId": 'user_id', "runState": "ENABLED", "actions": [{"name": None, "id": "ececef22-1901-437c-b05a-4773e391baeb", "type": "LoadFromVault", "gui": {"x": 36, "y": 180, "color": 3238043}, "dependsOn": [], "removeByDefault": False, "dataSourceId": "ds_id", "executeFlowWhenUpdated": False, "onlyLoadNewVersions": False, "recentVersionCutoffMs": 0, "columnSettings": {}}, {"name": "Feature Engineering", "id": "ed79ffeb-56de-4e7e-af95-e82acc7df728", "type": "PublishToVault", "gui": {"x": 360, "y": 180}, "dependsOn": ["ececef22-1901-437c-b05a-4773e391baeb"], "removeByDefault": False, "dataSource": {"name": "Feature Engineering"}}], "onboardFlowVersion": {}, "engineProperties": {}}
    ,"correlationFlow": {'documentVersion': 1, 'databaseType': 'ETL', 'name': 'Correlation', 'scheduleInfo': {}, 'description': '', 'responsibleUserId': 'user_id', 'runState': 'ENABLED', 'actions': [{'name': 'Data Input', 'id': 'c00ac026-f0bf-488c-bc9d-c1dc17cfc719', 'type': 'LoadFromVault', 'gui': {'x': 36, 'y': 180, 'color': 3238043}, 'dependsOn': [], 'removeByDefault': False, 'dataSourceId': 'ds_id', 'executeFlowWhenUpdated': False, 'onlyLoadNewVersions': False, 'recentVersionCutoffMs': 0, 'columnSettings': {}}, {'name': 'Correlations', 'id': 'fdd681f1-b53b-4d6d-a893-e51fce2aa686', 'type': 'PythonEngineAction', 'gui': {'x': 204, 'y': 300}, 'dependsOn': ['c00ac026-f0bf-488c-bc9d-c1dc17cfc719'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': "from domomagic import read_dataframe, write_dataframe\nfrom pandas import melt, get_dummies\n\ndf = read_dataframe('Data Input')\n\nto_dummy = []\nfor col in df.select_dtypes('O'):\n    if df[col].unique().shape[0] <= 10:\n        to_dummy.append(col)\ndf = get_dummies(df, columns=to_dummy)\n\ndf = df.corr(method='pearson')\ndf['Var1'] = df.index\ndf = df.melt(id_vars='Var1', var_name='Var2', value_name='Correlation')\n\nwrite_dataframe(df)",'additions': [{'name': 'Var1', 'dataType': 'STRING'}, {'name': 'Var2', 'dataType': 'STRING'}, {'name': 'Correlation', 'dataType': 'DOUBLE'}]},{'name': 'Correlation Matrix', 'id': '8715f441-996a-4b90-a61c-807b2a0f995f', 'type': 'PublishToVault', 'gui': {'x': 348, 'y': 300}, 'dependsOn': ['fdd681f1-b53b-4d6d-a893-e51fce2aa686'], 'removeByDefault': False, 'dataSource': {'name': 'Correlation Matrix'},},],'onboardFlowVersion': {},'engineProperties': {}}
    ,"validationCriteria": {'documentVersion': 1,    'databaseType': 'ETL',    'name': 'Validation Criteria',    'scheduleInfo': {},    'description': '',    'responsibleUserId': 'user_id',    'runState': 'ENABLED',    'actions': [{'name': 'Feature Engineering',      'id': 'c00ac026-f0bf-488c-bc9d-c1dc17cfc719',      'type': 'LoadFromVault',      'gui': {'x': 36, 'y': 180, 'color': 3238043},      'dependsOn': [],      'removeByDefault': False,      'dataSourceId': 'ds_id',      'executeFlowWhenUpdated': False,      'onlyLoadNewVersions': False,      'recentVersionCutoffMs': 0,      'columnSettings': {}},     {'name': 'Summary Stats',      'id': 'f0fa2d14-8521-4f53-ac07-cad27c9c8a09',      'type': 'PythonEngineAction',      'gui': {'x': 204, 'y': 180, 'color': None},      'dependsOn': ['c00ac026-f0bf-488c-bc9d-c1dc17cfc719',       '2976ebff-be2a-47a5-a546-9365aa4edf46'],      'removeByDefault': True,      'fillMissingWithNull': True,      'script': "from domomagic import *\nimport numpy as np\nimport pandas as pd\nfrom array import *\n\n# read data from inputs into a data frame\ndata = read_dataframe('Feature Engineering')\noverride = read_dataframe('Validation Override')\n\n#Get Datatypes of all columns\ndfDataTypes = data.dtypes.to_frame()\ndfDataTypes.columns = ['data type']\n\n#Get Summary Stats\ndfStats = data.describe(include = 'all')\ndfStats['stat'] = dfStats.index\ndfStats.index = np.arange(0, len(dfStats))\ndfStatsPivoted = pd.melt(dfStats, id_vars=['stat'])\ndfStatsPivoted = dfStatsPivoted.pivot(index='variable', columns='stat', values='value')\n\n#Join df of Summary Stats to df of Columns datatypes\nresult = pd.concat([dfStatsPivoted, dfDataTypes], axis=1, join='inner')\nresult['column name'] = result.index\nresult.index = np.arange(0, len(result))\n\n#Get distinct values of categorical columns\nresult['unique values'] = ''\nfor row in result.iterrows():\n    col = row[1]['column name']\n    if row[1]['data type'] == 'object':\n        r = data[col].astype('U')\n        arr_unique = r.unique()\n        list_unique = arr_unique.tolist()\n        list_unique.sort()\n        str_unique = ':'.join(list_unique)\n        row[1]['unique values'] = str_unique\n\n# Override numeric values\nresult = result.merge(override,how = 'left', left_on='column name', right_on='field_name')\nresult['max'] = np.where(result['field_name'] == result['column name'],  result['override_max'], result['max'])\nresult['min'] = np.where(result['field_name'] == result['column name'],  result['override_min'], result['min'])\n\n# write a data frame so it's available to the next action\nwrite_dataframe(result)\nprint('process complete')",      'additions': [{'name': '25%', 'dataType': 'STRING'},       {'name': '50%', 'dataType': 'STRING'},       {'name': '75%', 'dataType': 'STRING'},       {'name': 'count', 'dataType': 'STRING'},       {'name': 'freq', 'dataType': 'STRING'},       {'name': 'max', 'dataType': 'STRING'},       {'name': 'mean', 'dataType': 'STRING'},       {'name': 'min', 'dataType': 'STRING'},       {'name': 'std', 'dataType': 'STRING'},       {'name': 'top', 'dataType': 'STRING'},       {'name': 'unique', 'dataType': 'STRING'},       {'name': 'column name', 'dataType': 'STRING'},       {'name': 'data type', 'dataType': 'STRING'}]},     {'name': 'Set Column Type',      'id': '5d82a35b-f3d2-4684-844c-8f1b4e2fc740',      'type': 'Metadata',      'gui': {'x': 348, 'y': 180, 'color': None},      'dependsOn': ['f0fa2d14-8521-4f53-ac07-cad27c9c8a09'],      'removeByDefault': False,      'fields': [{'name': '25%', 'type': 'STRING', 'dateFormat': None},       {'name': '50%', 'type': 'STRING', 'dateFormat': None},       {'name': '75%', 'type': 'STRING', 'dateFormat': None},       {'name': 'count', 'type': 'STRING', 'dateFormat': None},       {'name': 'freq', 'type': 'STRING', 'dateFormat': None},       {'name': 'max', 'type': 'LONG', 'dateFormat': None},       {'name': 'mean', 'type': 'LONG', 'dateFormat': None},       {'name': 'min', 'type': 'LONG', 'dateFormat': None},       {'name': 'std', 'type': 'STRING', 'dateFormat': None},       {'name': 'top', 'type': 'STRING', 'dateFormat': None},       {'name': 'unique', 'type': 'STRING', 'dateFormat': None},       {'name': 'column name', 'type': 'STRING', 'dateFormat': None},       {'name': 'data type', 'type': 'STRING', 'dateFormat': None}]},     {'name': 'Validation Override',      'id': '2976ebff-be2a-47a5-a546-9365aa4edf46',      'type': 'LoadFromVault',      'gui': {'x': 36, 'y': 48, 'color': 3701798},      'dependsOn': [],      'removeByDefault': False,      'dataSourceId': 'overrideID',      'executeFlowWhenUpdated': False,      'onlyLoadNewVersions': False,      'recentVersionCutoffMs': 0,      'columnSettings': {}},{'name': 'Select Columns',      'id': '359bd855-e617-4900-8439-d85d95f9612c',      'type': 'SelectValues',      'gui': {'x': 480, 'y': 180, 'color': None},      'dependsOn': ['5d82a35b-f3d2-4684-844c-8f1b4e2fc740'],      'removeByDefault': False,      'fields': [{'name': '25%', 'rename': '25%_model'},       {'name': '50%', 'rename': '50%_model'},       {'name': '75%', 'rename': '75%_model'},       {'name': 'count', 'rename': 'count_model'},       {'name': 'freq', 'rename': 'freq_model'},       {'name': 'max', 'rename': 'max_model'},       {'name': 'mean', 'rename': 'mean_model'},       {'name': 'min', 'rename': 'min_model'},       {'name': 'std', 'rename': 'std_model'},       {'name': 'top', 'rename': 'top_model'},       {'name': 'unique', 'rename': 'unique_model'},       {'name': 'data type', 'rename': 'data_type_model'},       {'name': 'column name', 'rename': 'column_name_model'}]},     {'name': 'Validation Criteria',      'id': '15704a83-6cee-4a95-b028-a3a445f7f864',      'type': 'PublishToVault',      'gui': {'x': 636, 'y': 180, 'color': None},      'dependsOn': ['359bd855-e617-4900-8439-d85d95f9612c'],      'removeByDefault': False,      'dataSource': {'name': 'Validation Criteria'}},],    'onboardFlowVersion': {},    'engineProperties': {}}
    #  OLD VERSION    ,"validationCriteria": {'documentVersion': 1,    'databaseType': 'ETL',    'name': 'Validation Criteria',    'scheduleInfo': {},    'description': '',    'responsibleUserId': 'user_id',    'runState': 'ENABLED',    'actions': [{'name': 'Feature Engineering',      'id': 'c00ac026-f0bf-488c-bc9d-c1dc17cfc719',      'type': 'LoadFromVault',      'gui': {'x': 36, 'y': 180, 'color': 3238043},      'dependsOn': [],      'removeByDefault': False,      'dataSourceId': 'ds_id',      'executeFlowWhenUpdated': False,      'onlyLoadNewVersions': False,      'recentVersionCutoffMs': 0,      'columnSettings': {}},     {'name': 'Summary Stats',      'id': 'f0fa2d14-8521-4f53-ac07-cad27c9c8a09',      'type': 'PythonEngineAction',      'gui': {'x': 204, 'y': 180, 'color': None},      'dependsOn': ['c00ac026-f0bf-488c-bc9d-c1dc17cfc719',       '2976ebff-be2a-47a5-a546-9365aa4edf46'],      'removeByDefault': True,      'fillMissingWithNull': True,      'script': 'from domomagic import *\nimport numpy as np\nimport pandas as pd\nfrom array import *\n\n# read data from inputs into a data frame\ndata = read_dataframe(\'Feature Engineering\')\noverride = read_dataframe(\'Validation Override\')\n\n\nfor col in data.columns: \n    print(col) \n\n#Get Datatypes of all columns\ndfDataTypes = data.dtypes.to_frame()\ndfDataTypes.columns = ["data type"]\nprint(\'step 1 complete\')\n\n#Get Summary Stats\ndfStats = data.describe(include = \'all\')\ndfStats[\'stat\'] = dfStats.index\ndfStats.index = np.arange(0, len(dfStats))\ndfStatsPivoted = pd.melt(dfStats, id_vars=[\'stat\'])\ndfStatsPivoted = dfStatsPivoted.pivot(index=\'variable\', columns=\'stat\', values=\'value\')\nprint(\'step 2 complete\')\n\n#Join df of Summary Stats to df of Columns datatypes\nresult = pd.concat([dfStatsPivoted, dfDataTypes], axis=1, join=\'inner\')\nresult[\'column name\'] = result.index\nresult.index = np.arange(0, len(result))\nprint(\'step 3 complete\')\n\n#Get distinct values of categorical columns\nresult["unique values"] = \'\'\nfor row in result.iterrows():\n    col = row[1]["column name"]\n    print(col)\n    if row[1]["data type"] == \'object\':\n        r = data[col].astype(\'U\')\n        arr_unique = r.unique()\n        list_unique = arr_unique.tolist()\n        list_unique.sort()\n        str_unique = \':\'.join(list_unique)\n        row[1]["unique values"] = str_unique\nprint(\'step 4 complete\')\n\n# Override numeric values\nresult = result.merge(override,how = \'left\', left_on=\'column name\', right_on=\'field_name\')\nresult[\'max\'] = np.where(result[\'field_name\'] == result[\'column name\'],  result[\'override_max\'], result[\'max\'])\nresult[\'min\'] = np.where(result[\'field_name\'] == result[\'column name\'],  result[\'override_min\'], result[\'min\'])\n\n# write a data frame so it\'s available to the next action\nwrite_dataframe(result)\nprint(\'process complete\')',      'additions': [{'name': '25%', 'dataType': 'STRING'},       {'name': '50%', 'dataType': 'STRING'},       {'name': '75%', 'dataType': 'STRING'},       {'name': 'count', 'dataType': 'STRING'},       {'name': 'freq', 'dataType': 'STRING'},       {'name': 'max', 'dataType': 'STRING'},       {'name': 'mean', 'dataType': 'STRING'},       {'name': 'min', 'dataType': 'STRING'},       {'name': 'std', 'dataType': 'STRING'},       {'name': 'top', 'dataType': 'STRING'},       {'name': 'unique', 'dataType': 'STRING'},       {'name': 'column name', 'dataType': 'STRING'},       {'name': 'data type', 'dataType': 'STRING'}]},     {'name': 'Set Column Type',      'id': '5d82a35b-f3d2-4684-844c-8f1b4e2fc740',      'type': 'Metadata',      'gui': {'x': 348, 'y': 180, 'color': None},      'dependsOn': ['f0fa2d14-8521-4f53-ac07-cad27c9c8a09'],      'removeByDefault': False,      'fields': [{'name': '25%', 'type': 'STRING', 'dateFormat': None},       {'name': '50%', 'type': 'STRING', 'dateFormat': None},       {'name': '75%', 'type': 'STRING', 'dateFormat': None},       {'name': 'count', 'type': 'STRING', 'dateFormat': None},       {'name': 'freq', 'type': 'STRING', 'dateFormat': None},       {'name': 'max', 'type': 'LONG', 'dateFormat': None},       {'name': 'mean', 'type': 'LONG', 'dateFormat': None},       {'name': 'min', 'type': 'LONG', 'dateFormat': None},       {'name': 'std', 'type': 'STRING', 'dateFormat': None},       {'name': 'top', 'type': 'STRING', 'dateFormat': None},       {'name': 'unique', 'type': 'STRING', 'dateFormat': None},       {'name': 'column name', 'type': 'STRING', 'dateFormat': None},       {'name': 'data type', 'type': 'STRING', 'dateFormat': None}]},     {'name': 'Validation Override',      'id': '2976ebff-be2a-47a5-a546-9365aa4edf46',      'type': 'LoadFromVault',      'gui': {'x': 36, 'y': 48, 'color': 3701798},      'dependsOn': [],      'removeByDefault': False,      'dataSourceId': 'overrideID',      'executeFlowWhenUpdated': False,      'onlyLoadNewVersions': False,      'recentVersionCutoffMs': 0,      'columnSettings': {}},     {'name': 'Correlations',      'id': 'fdd681f1-b53b-4d6d-a893-e51fce2aa686',      'type': 'PythonEngineAction',      'gui': {'x': 204, 'y': 300},      'dependsOn': ['c00ac026-f0bf-488c-bc9d-c1dc17cfc719'],      'removeByDefault': True,      'fillMissingWithNull': True,      'script': '# Import the domomagic package into the script \nfrom domomagic import *\nfrom pandas import melt\n\nprint(\'finished library import\')\n# read data from inputs into a data frame\ninput1 = read_dataframe(\'Feature Engineering\')\n\nprint(\'finished data import\')\n\ndf = (input1.corr(method=\'pearson\'))\ndf[\'Var1\'] = df.index\n\nprint(\'finished correlation matrix\')\n\ndf2 = df.melt(id_vars="Var1", \n        var_name="Var2", \n        value_name="Correlation")\n\nprint(df2)\n# write a data frame so it\'s available to the next action\nwrite_dataframe(df2)',      'additions': [{'name': 'Var1', 'dataType': 'STRING'},       {'name': 'Var2', 'dataType': 'STRING'},       {'name': 'Correlation', 'dataType': 'DOUBLE'}]},     {'name': 'Select Columns',      'id': '359bd855-e617-4900-8439-d85d95f9612c',      'type': 'SelectValues',      'gui': {'x': 480, 'y': 180, 'color': None},      'dependsOn': ['5d82a35b-f3d2-4684-844c-8f1b4e2fc740'],      'removeByDefault': False,      'fields': [{'name': '25%', 'rename': '25%_model'},       {'name': '50%', 'rename': '50%_model'},       {'name': '75%', 'rename': '75%_model'},       {'name': 'count', 'rename': 'count_model'},       {'name': 'freq', 'rename': 'freq_model'},       {'name': 'max', 'rename': 'max_model'},       {'name': 'mean', 'rename': 'mean_model'},       {'name': 'min', 'rename': 'min_model'},       {'name': 'std', 'rename': 'std_model'},       {'name': 'top', 'rename': 'top_model'},       {'name': 'unique', 'rename': 'unique_model'},       {'name': 'data type', 'rename': 'data_type_model'},       {'name': 'column name', 'rename': 'column_name_model'}]},     {'name': 'Validation Criteria',      'id': '15704a83-6cee-4a95-b028-a3a445f7f864',      'type': 'PublishToVault',      'gui': {'x': 636, 'y': 180, 'color': None},      'dependsOn': ['359bd855-e617-4900-8439-d85d95f9612c'],      'removeByDefault': False,      'dataSource': {'name': 'Validation Criteria'}},     {'name': 'Correlation Matrix',      'id': '8715f441-996a-4b90-a61c-807b2a0f995f',      'type': 'PublishToVault',      'gui': {'x': 348, 'y': 300},      'dependsOn': ['fdd681f1-b53b-4d6d-a893-e51fce2aa686'],      'removeByDefault': False,      'dataSource': {'name': 'Correlation Matrix'}}],    'onboardFlowVersion': {},    'engineProperties': {}}
    ,"validation": {'documentVersion': 1, 'databaseType': 'ETL', 'name': 'Data Valdation', 'scheduleInfo': {}, 'description': '', 'responsibleUserId': 'user_id', 'runState': 'ENABLED', 'actions': [{'name': 'Validation Criteria', 'id': '755b1996-0b13-4b15-b77d-2949240c8d6b', 'type': 'LoadFromVault', 'gui': {'x': 216, 'y': 413, 'color': 3238043}, 'dependsOn': [], 'removeByDefault': False, 'dataSourceId': 'vc_id', 'executeFlowWhenUpdated': False, 'onlyLoadNewVersions': False, 'recentVersionCutoffMs': 0, 'columnSettings': {}}, {'name': 'Feature Engineering', 'id': 'f76b4323-0e96-47ac-a7a6-32ae774f12c4', 'type': 'LoadFromVault', 'gui': {'x': 204, 'y': 185, 'color': 3238043}, 'dependsOn': [], 'removeByDefault': False, 'dataSourceId': 'fe_id', 'executeFlowWhenUpdated': True, 'onlyLoadNewVersions': False, 'recentVersionCutoffMs': 0, 'columnSettings': {}}, {'name': 'Summary stats and distinct values', 'id': 'ad64f153-cdb1-4297-9fcb-4f07d803d15f', 'type': 'PythonEngineAction', 'gui': {'x': 648, 'y': 221, 'color': None}, 'dependsOn': ['f76b4323-0e96-47ac-a7a6-32ae774f12c4'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': '# Import the domomagic package into the script \nfrom domomagic import *\nimport numpy as np\nimport pandas as pd\nfrom array import *\n\n# read data from inputs into a data frame\ndata = read_dataframe(\'Feature Engineering\')\n\n\n#---------------- write your script here\n#Get Datatypes of all columns\ndfDataTypes = data.dtypes.to_frame()\ndfDataTypes.columns = ["data type"]\n\n#Get Summary Stats\ndfStats = data.describe(include = \'all\')\ndfStats[\'stat\'] = dfStats.index\ndfStats.index = np.arange(0, len(dfStats))\ndfStatsPivoted = pd.melt(dfStats, id_vars=[\'stat\'])\ndfStatsPivoted = dfStatsPivoted.pivot(index=\'variable\', columns=\'stat\', values=\'value\')\n\n#Join df of Summary Stats to df of Columns datatypes\nresult = pd.concat([dfStatsPivoted, dfDataTypes], axis=1, join=\'inner\')\nresult[\'column name\'] = result.index\nresult.index = np.arange(0, len(result))\n\n#Get distinct values of categorical columns\nresult["unique values"] = \'\'\nfor row in result.iterrows():\n    col = row[1]["column name"]\n    if row[1]["data type"] == \'object\':\n        r = data[col].astype(\'U\')\n        arr_unique = r.unique()\n        list_unique = arr_unique.tolist()\n        list_unique.sort()\n        str_unique = \':\'.join(list_unique)\n        row[1]["unique values"] = str_unique\n\n\n\n# write a data frame so it\'s available to the next action\nwrite_dataframe(result)', 'additions': [{'name': '25%', 'dataType': 'STRING'}, {'name': '50%', 'dataType': 'STRING'}, {'name': '75%', 'dataType': 'STRING'}, {'name': 'count', 'dataType': 'STRING'}, {'name': 'freq', 'dataType': 'STRING'}, {'name': 'max', 'dataType': 'STRING'}, {'name': 'mean', 'dataType': 'STRING'}, {'name': 'min', 'dataType': 'STRING'}, {'name': 'std', 'dataType': 'STRING'}, {'name': 'top', 'dataType': 'STRING'}, {'name': 'unique', 'dataType': 'STRING'}, {'name': 'data type', 'dataType': 'STRING'}, {'name': 'column name', 'dataType': 'STRING'}, {'name': 'unique values', 'dataType': 'STRING'}]}, {'type': 'MergeJoin', 'name': 'Join Data', 'id': '6d2c9b5e-3192-4605-a06e-a449b84eaa1f', 'gui': {'x': 864, 'y': 413, 'color': None}, 'dependsOn': ['ad64f153-cdb1-4297-9fcb-4f07d803d15f', '755b1996-0b13-4b15-b77d-2949240c8d6b'], 'joinType': 'INNER', 'step1': '755b1996-0b13-4b15-b77d-2949240c8d6b', 'step2': 'ad64f153-cdb1-4297-9fcb-4f07d803d15f', 'keys1': ['column_name_model'], 'keys2': ['column name'], 'schemaModification1': [], 'schemaModification2': []}, {'name': 'Validate data', 'id': 'a272d06a-16d2-46e5-91ac-7053b1177568', 'type': 'PythonEngineAction', 'gui': {'x': 984, 'y': 413, 'color': None}, 'dependsOn': ['6d2c9b5e-3192-4605-a06e-a449b84eaa1f'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': '# Import the domomagic package into the script \nfrom domomagic import *\n\nprint(\'packages loaded\')\n# read data from inputs into a data frame\ndata = read_dataframe(\'Join Data\')\nprint(\'data loaded\')\n\n\n# write your script here\ndata["valid"] = 0\ndata["reason"] = \'\'\nfor index in range(len(data)):\n  print(data.loc[index, \'column_name_model\'])\n  if data.loc[index, "data type"] != data.loc[index, "data_type_model"]:\n    print("datatype mismatch")\n    data.at[index, \'valid\'] = 1\n    data.at[index, \'reason\'] = "data type does not match the model"\n    print(data)\n  elif data.loc[index, "data type"] == \'datetime64[ns]\':\n    print(\'you have a date, are you sure?\')\n  else:\n    if data.at[index, "data type"] == \'object\':\n      list_unique_values = data.loc[index, "unique values"].split(\':\')\n      list_unique_values_model = data.loc[index, "unique_model"].split(\':\')\n      for i in list_unique_values:\n        if i not in list_unique_values_model:\n          data.at[index, "valid"] = 1\n          data.at[index, "reason"] = "invalid value"\n    else:\n      if float(data.loc[index, "min"]) < float(data.loc[index, "min_model"]) or float(data.loc[index, "max"]) > float(data.loc[index, "max_model"]):\n        data.at[index, "valid"] = 1\n        data.at[index, "reason"] = "values out of range"\n\nprint(\'analysis complete\')\n# write a data frame so it\'s available to the next action\nwrite_dataframe(data)\nprint(\'output written\')', 'additions': [{'name': '25%_model', 'dataType': 'STRING'}, {'name': '50%_model', 'dataType': 'STRING'}, {'name': '75%_model', 'dataType': 'STRING'}, {'name': 'count_model', 'dataType': 'STRING'}, {'name': 'freq_model', 'dataType': 'STRING'}, {'name': 'max_model', 'dataType': 'LONG'}, {'name': 'mean_model', 'dataType': 'LONG'}, {'name': 'min_model', 'dataType': 'LONG'}, {'name': 'std_model', 'dataType': 'STRING'}, {'name': 'top_model', 'dataType': 'STRING'}, {'name': 'unique_model', 'dataType': 'STRING'}, {'name': 'data_type_model', 'dataType': 'STRING'}, {'name': 'column_name_model', 'dataType': 'STRING'}, {'name': '25%', 'dataType': 'STRING'}, {'name': '50%', 'dataType': 'STRING'}, {'name': '75%', 'dataType': 'STRING'}, {'name': 'count', 'dataType': 'STRING'}, {'name': 'freq', 'dataType': 'STRING'}, {'name': 'max', 'dataType': 'STRING'}, {'name': 'mean', 'dataType': 'STRING'}, {'name': 'min', 'dataType': 'STRING'}, {'name': 'std', 'dataType': 'STRING'}, {'name': 'top', 'dataType': 'STRING'}, {'name': 'unique', 'dataType': 'STRING'}, {'name': 'data type', 'dataType': 'STRING'}, {'name': 'column name', 'dataType': 'STRING'}, {'name': 'unique values', 'dataType': 'STRING'}, {'name': 'valid', 'dataType': 'LONG'}, {'name': 'reason', 'dataType': 'STRING'}]}, {'name': 'Data Validation Errors', 'id': '9f3680b3-2552-4208-b371-03099c0b3fcd', 'type': 'PublishToVault', 'gui': {'x': 1320, 'y': 413, 'color': None}, 'dependsOn': ['a272d06a-16d2-46e5-91ac-7053b1177568'], 'removeByDefault': False, 'dataSource': {'name': 'Data Validation Errors'}}, {'name': 'if valid?', 'id': 'e82d4408-c3fd-4f32-b53a-0eff534cda88', 'type': 'PythonEngineAction', 'gui': {'x': 1176, 'y': 185, 'color': None}, 'dependsOn': ['a272d06a-16d2-46e5-91ac-7053b1177568', 'f76b4323-0e96-47ac-a7a6-32ae774f12c4'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': "# Import the domomagic package into the script \nfrom domomagic import *\n\n# read data from inputs into a data frame\ndata = read_dataframe('Validate data')\nresult = read_dataframe('Feature Engineering')\n\n# write your script here\nif len(data[data.valid == 1]) > 0:\n  result.drop(result.index, inplace=True)\n\n# write a data frame so it's available to the next action\nwrite_dataframe(result)", 'additions': [{'name': 'Order ID', 'dataType': 'LONG'}, {'name': 'Order Date', 'dataType': 'DATETIME'}, {'name': 'Order Priority', 'dataType': 'STRING'}, {'name': 'Order Quantity', 'dataType': 'LONG'}, {'name': 'Sales', 'dataType': 'DOUBLE'}, {'name': 'Discount', 'dataType': 'DOUBLE'}, {'name': 'Ship Mode', 'dataType': 'STRING'}, {'name': 'Profit', 'dataType': 'DOUBLE'}, {'name': 'Unit Price', 'dataType': 'DOUBLE'}, {'name': 'Shipping Cost', 'dataType': 'DOUBLE'}, {'name': 'Customer Name', 'dataType': 'STRING'}, {'name': 'Customer State', 'dataType': 'STRING'}, {'name': 'Zip Code', 'dataType': 'LONG'}, {'name': 'Region', 'dataType': 'STRING'}, {'name': 'Customer Segment', 'dataType': 'STRING'}, {'name': 'Product Category', 'dataType': 'STRING'}, {'name': 'Product Sub-Category', 'dataType': 'STRING'}, {'name': 'Product Name', 'dataType': 'STRING'}, {'name': 'Product Container', 'dataType': 'STRING'}, {'name': 'Product Base Margin', 'dataType': 'DOUBLE'}, {'name': 'Ship Date', 'dataType': 'DATETIME'}, {'name': 'Date yyyy', 'dataType': 'LONG'}, {'name': 'Lead Source', 'dataType': 'STRING'}, {'name': 'Visits to Close', 'dataType': 'LONG'}, {'name': 'Date', 'dataType': 'DATETIME'}, {'name': 'Group', 'dataType': 'STRING'}, {'name': 'Subgroup', 'dataType': 'STRING'}, {'name': 'Category', 'dataType': 'STRING'}, {'name': 'Subcategory', 'dataType': 'STRING'}, {'name': 'Country', 'dataType': 'STRING'}, {'name': 'State', 'dataType': 'STRING'}, {'name': 'Province', 'dataType': 'STRING'}, {'name': 'Territory', 'dataType': 'STRING'}, {'name': 'Prefecture', 'dataType': 'STRING'}, {'name': 'Value1', 'dataType': 'LONG'}, {'name': 'Value2', 'dataType': 'LONG'}, {'name': 'Customer Stage', 'dataType': 'LONG'}]}, {'name': 'Validated Data', 'id': '43a1f03d-ef1d-436c-95f1-549caeb8e469', 'type': 'PublishToVault', 'gui': {'x': 1308, 'y': 185, 'color': None}, 'dependsOn': ['e82d4408-c3fd-4f32-b53a-0eff534cda88'], 'removeByDefault': False, 'dataSource': {'name': 'Validated Data'}}], 'onboardFlowVersion': {}, 'engineProperties': {}}
    ,"modelSelection": {'documentVersion': 1, 'databaseType': 'ETL', 'name': 'Model Selection', 'scheduleInfo': {}, 'description': '', 'responsibleUserId': 'user_id', 'runState': 'ENABLED', 'actions': [{'name': 'Feature Engineering', 'id': '6fcb5c3a-3d30-4d66-98a0-928d0bdd96cf', 'type': 'LoadFromVault', 'gui': {'x': 36, 'y': 180, 'color': 3238043}, 'dependsOn': [], 'removeByDefault': False, 'dataSourceId': 'dsid', 'executeFlowWhenUpdated': False, 'onlyLoadNewVersions': False, 'recentVersionCutoffMs': 0, 'columnSettings': {}}, {'name': 'model creation code', 'id': '9d59248e-e199-4dee-bf25-0ccff5454e57', 'type': 'REngineAction', 'gui': {'x': 204, 'y': 180}, 'dependsOn': ['6fcb5c3a-3d30-4d66-98a0-928d0bdd96cf'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': "# Import the domomagic library into the script. \nlibrary('domomagic')\nlibrary('dplyr')\n\nprint('libraries loaded')\n\n\n# read data from inputs into a data frame\ninput1 <- read.dataframe('Feature Engineering')%>%select_if(is.numeric)\n\nprint('Data Loaded')\n\nds <- na.omit(input1[,1:4])\ncolnames(ds) <- c('y','x1', 'x2', 'x3')\n\nprint('data prep completed')\n\nlm00 <- lm(y ~ ., data = ds)\n\nprint('model created')\n\nds$pred <- predict(lm00, ds)\noutput <- ds[,c('y','pred')]\n\ncolnames(output) <- c('Actual', 'Predicted')\n\nprint('output generated')\n\n# write a data frame so it's available to the next action\nwrite.dataframe(output)\nprint('output sent to next tile')", 'additions': [{'name': 'Actual', 'dataType': 'DOUBLE'}, {'name': 'Predicted', 'dataType': 'DOUBLE'}]}, {'name': 'variable importance code', 'id': '4803fc60-3401-4691-87e5-96d40e64436f', 'type': 'REngineAction', 'gui': {'x': 204, 'y': 300}, 'dependsOn': ['6fcb5c3a-3d30-4d66-98a0-928d0bdd96cf'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': "# Import the domomagic library into the script. \nlibrary('domomagic')\nlibrary('dplyr')\n\nprint('libraries loaded')\n\n\n# read data from inputs into a data frame\ninput1 <- read.dataframe('Feature Engineering')%>%select_if(is.numeric)\n\nprint('Data Loaded')\n\nds <- na.omit(input1[,1:4])\ncolnames(ds) <- c('y','x1', 'x2', 'x3')\n\nprint('data prep completed')\n\nlm00 <- lm(y ~ ., data = ds)\n\nprint('model created')\n\noutput <- broom::tidy(lm00)\n\nprint('output generated')\n\n# write a data frame so it's available to the next action\nwrite.dataframe(output)\nprint('output sent to next tile')", 'additions': [{'name': 'term', 'dataType': 'STRING'}, {'name': 'estimate', 'dataType': 'DOUBLE'}, {'name': 'std.error', 'dataType': 'DOUBLE'}, {'name': 'statistic', 'dataType': 'DOUBLE'}, {'name': 'p.value', 'dataType': 'DOUBLE'}]}, {'name': 'Model output', 'id': '9866d7cd-cd8d-4fa9-88f3-f6b61c112c83', 'type': 'PublishToVault', 'gui': {'x': 324, 'y': 180}, 'dependsOn': ['9d59248e-e199-4dee-bf25-0ccff5454e57'], 'removeByDefault': False, 'dataSource': {'name': 'Model output'}}, {'name': 'Variable Importance', 'id': '4e86d944-4d37-4281-9bb0-9d62e53373d1', 'type': 'PublishToVault', 'gui': {'x': 324, 'y': 300}, 'dependsOn': ['4803fc60-3401-4691-87e5-96d40e64436f'], 'removeByDefault': False, 'dataSource': {'name': 'Variable Importance'}}], 'onboardFlowVersion': {}, 'engineProperties': {}}
    ,"modelInference": {'documentVersion': 1, 'databaseType': 'ETL', 'name': 'Model Inference', 'scheduleInfo': {}, 'description': '', 'responsibleUserId': 'user_id', 'runState': 'ENABLED', 'actions': [{'name': 'Feature Engineering', 'id': '6fcb5c3a-3d30-4d66-98a0-928d0bdd96cf', 'type': 'LoadFromVault', 'gui': {'x': 36, 'y': 180, 'color': 3238043}, 'dependsOn': [], 'removeByDefault': False, 'dataSourceId': 'dsid', 'executeFlowWhenUpdated': False, 'onlyLoadNewVersions': False, 'recentVersionCutoffMs': 0, 'columnSettings': {}}, {'name': 'model inference', 'id': '9d59248e-e199-4dee-bf25-0ccff5454e57', 'type': 'REngineAction', 'gui': {'x': 204, 'y': 180}, 'dependsOn': ['6fcb5c3a-3d30-4d66-98a0-928d0bdd96cf'], 'removeByDefault': True, 'fillMissingWithNull': True, 'script': "# Import the domomagic library into the script. \nlibrary('domomagic')\nlibrary('dplyr')\n\nprint('libraries loaded')\n\n\n# read data from inputs into a data frame\ninput1 <- read.dataframe('Feature Engineering')%>%select_if(is.numeric)\n\nprint('Data Loaded')\n\nds <- na.omit(input1[,1:4])\ncolnames(ds) <- c('y','x1', 'x2', 'x3')\n\nprint('data prep completed')\n\nlm00 <- lm(y ~ ., data = ds)\n\nprint('model created')\n\nds$pred <- predict(lm00, ds)\noutput <- ds[,c('y','pred')]\n\ncolnames(output) <- c('Actual', 'Predicted')\n\nprint('output generated')\n\n# write a data frame so it's available to the next action\nwrite.dataframe(output)\nprint('output sent to next tile')", 'additions': [{'name': 'Actual', 'dataType': 'DOUBLE'}, {'name': 'Predicted', 'dataType': 'DOUBLE'}]}, {'name': 'Model Predictions', 'id': '9866d7cd-cd8d-4fa9-88f3-f6b61c112c83', 'type': 'PublishToVault', 'gui': {'x': 324, 'y': 180}, 'dependsOn': ['9d59248e-e199-4dee-bf25-0ccff5454e57'], 'removeByDefault': False, 'dataSource': {'name': 'Model output'}}], 'onboardFlowVersion': {}, 'engineProperties': {}}
}

summaryNumbers = {
    'countPercentNull': "CONCAT(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END),' Nulls | ',ROUND(SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END)/(COUNT(`col_name`)+SUM(CASE WHEN `col_name` IS NULL THEN 1 ELSE 0 END))*100,0),'% of all rows')",
    #,"correlation": 'CASE WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.7000 THEN CONCAT(\'<b><font color="#31689B">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.5000 THEN CONCAT(\'<b><font color="#72B0D7">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') WHEN ABS((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`))) >= 0.3000 THEN CONCAT(\'<b><font color="#B7DAF5">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') ELSE CONCAT(\'<b><font color="#D4D4D4">\', ROUND((AVG(`dvar` * `col_name`) - (AVG(`dvar`) * AVG(`col_name`))) / (STDDEV_POP(`dvar`) * STDDEV_POP(`col_name`)), 4),\'</font></b>\') END'
}

SUCCESS = 200


class Profiler:
    """
    Creates class tied to a Domo instance to automate data science work.
    The functions are designed to follow the Domo Data Science Consulting team best practices.

    Parameters:
    instance: Base URL of the Domo instance (i.e. modocorp, domo-data-science)
    email: Email address associated with the domo instance
    password: Password for instance
    """
    
    def __init__(self, instance, dataset_id, email, password, sso=False):
        self.dataset_id = dataset_id
        self.base_url = "https://" + instance + ".domo.com"
        self.instance = instance
        self.email = email
        self.password = password
        self.sso = sso
        self.logged_in = False
        self.headers = None
        self.user_id = None
        self.fe = {
            "feDF_id": None,
            "feDS_id": None,
            "feRun": False
        }
        self.validation = {
            "criteriaDF_id": None,
            "criteriaRun": False,
            "summaryStats_id": None,
            "correlationDF_id": None,
            "correlation_id": None,
            "override_id": None,
            "validationDF_id": None,
            "validationRun": False,
            "validDS_id": None,
            "errorDS_id": None
        }
        self.modeling = {
            "modelingRun": False,
            "varImpDS": None,
            "selectionDS": None,
            "inferenceDS": None,
            "select_DF": None,
            "inference_DF": None,
        }
        self.bias = {
            "bias_DS": None,
            "bias_DF": None,
        }
        self.topPage = None
        self.modPage = None
        self.edaPage = None
        self.dataVal = None
        self.modelVal = None
        self.modelAcc = None
        self.hist = False
        self.scat = False
        self.box = False
        self.cards = {
            "validation": False,
            "summary": False,
            "model": False
        }

        if sso:
            if not self.instance or not self.password:
                print('Missing required fields')
                return
            # when sso, the `password` var is the session cookie
            self.headers = {
                'x-domo-authentication': password,
                'Content-Type': 'application/json;text/plain;*/*'
            }
            # arbitrary api endpoint to check if the cookie is correct
            r = requests.get(f'https://{self.instance}.domo.com/api/dataprocessing/v2/dataflows/filters/dataflowType',
                             headers=self.headers)
            if r.status_code == SUCCESS:
                print('Login Successful')
                self.logged_in = True
            else:
                print('Login Error, check credentials')
                return                
        else:
            if not self.instance or not self.password or not self.email:
                print('Missing required fields')
                return
            data = {
                'method': 'password',
                'emailAddress': self.email, 
                'password': self.password
            }
            r = requests.post(URL['login'].format(self.base_url), json=data)
            if r.status_code == SUCCESS and r.json()['success']:
                session_token = (r.json()).get('sessionToken')
                self.headers = {
                    'x-domo-authentication': session_token,
                    'Content-Type': 'application/json;text/plain;*/*'
                }
                print('Login Successful')
                self.logged_in = True
            else:
                print('Login Error, check credentials')
                return

        r = requests.get(URL['userInfo'].format(self.base_url), headers=self.headers)
        if (r.status_code == SUCCESS): self.user_id = r.json()['id']
            
        r = self.dsMetaData(self.dataset_id)
        if (r.status_code == SUCCESS): self.dataset_name = r.json()['name']
            
            
    def refresh_creds():
        """
        Use stored email and password to re-authenticate if session expires
        """

        data = { 'method': 'password', 'emailAddress': self.email, 'password': self.password }
        r = requests.post(URL['login'].format(self.base_url), json=data)
        if r.status_code == SUCCESS and r.json()['success']:
            session_token = (r.json()).get('sessionToken')
            self.headers = {'x-domo-authentication': session_token,'Content-Type': 'application/json;text/plain;*/*' }
        else:
            print('Login Error, check credentials')

            
    def save_config(self):
        """
          Saves current state of variables created by the scaffolding function as a dict.
          If scaffolding is going to extend beyond more than one session use this function
          
          Returns dict of values used in scaffolding
        """
        output = {
            "fe": self.fe,
            "validation":self.validation,
            "modeling": self.modeling,
            "topPage": self.topPage,
            "modPage": self.modPage,
            "edaPage": self.edaPage,
            "dataVal": self.dataVal,
            "modelVal": self.modelVal,
            "cards": self.cards,
            "histProfile": self.hist,
            "scatProfile": self.scat,
            "boxProfile": self.box
        }
        return output
    
    
    def update_profile(self, updates):
        """
            Complement to save_config. Once dict is saved you can use this function to restore the state
        """
        self.fe = updates.get('fe')
        self.validation = updates.get('validation')
        self.modeling = updates.get('modeling')
        self.topPage = updates.get('topPage')
        self.modPage = updates.get('modPage')
        self.edaPage = updates.get('edaPage')
        self.dataVal = updates.get('dataVal')
        self.modelVal = updates.get('modelVal')
        self.cards = updates.get('cards')
        self.hist = updates.get('histProfile')
        self.scat = updates.get('scatProfile')
        self.box = updates.get('boxProfile')
        print('values have been updated to:')
        print(self.save_config())

        
    def hist_profile(self, columns, page_id=None, summary='Default', dataset_id=None):
        """
        Creates a card for every column in the column list on the specified page

        Params:
        columns = An array containing column name and column type
        page_id = Page where you want the cards to be created. Default will create a new page for you
        summary = the summary number you want on the card.
            You can provide your own valid beastmode as a string. If you want it unique to a card, use `col_name` and the number will change with each card.
            The default is the count and percentage of Null values in the column
        dataset_id = Dataset to use to build cards.The default is the dataset given to create the profile object
        """
        if dataset_id == None:
            dataset_id = self.dataset_id
        
        if page_id == None:
            page_id = self.createPage('Histogram Profile')
            
        querystring = {"skipValidation": "true", "pageId": page_id}
        
        if summary == 'Default':
            summary = summaryNumbers['countPercentNull']

        for row in columns:
            if row['type'] in ('DATE', 'DATETIME'):
                data = jsons['templateDate']          
            elif row['type'] in ('DOUBLE', 'LONG', 'DECIMAL'):
                data = jsons['templateHist']
            else:
                data = jsons['templateCat']
            new = json.dumps(data).replace('summary', summary).replace('col_name', row['name']).replace('dsid',dataset_id)
        
            r = requests.request("PUT", URL['card'].format(self.base_url),
                                 data=new, headers=self.headers, params=querystring)

        
    def scatter_profile(self, columns, dvar, page_id=None, dataset_id=None):
        """
        Creates a scatterplot card for every numeric column in the column list on the specified page

        Params:
        columns = An array containing column name and column type
        page_id = Page where you want the cards to be created
        dvar = the column of interest which will become the y axis on all of the cards
        """
        if dataset_id == None:
            dataset_id = self.dataset_id
        
        if page_id == None:
            page_id = self.createPage('Scatterplot Profile')
            
        querystring = {"skipValidation": "true","pageId": page_id}

        for row in columns:
            if row['type'] not in ('DOUBLE', 'LONG', 'DECIMAL') or row['name'] == dvar:
                continue
            elif row['type'] in ('DOUBLE', 'LONG', 'DECIMAL'):
                data = jsons['templateScat']
            new = json.dumps(data).replace('col_name', row['name']).replace('dsid',dataset_id).replace('dvar', dvar)
        
            r = requests.request("PUT", URL['card'].format(self.base_url), data=new,
                                 headers=self.headers, params=querystring)

        
    def boxplot_profile(self, columns, dvar, page_id=None, dataset_id=None, prediction_type=None):
        """
        Creates a scatterplot card for every numeric column in the column list on the specified page

        Params:
        columns = An array containing column name and column type
        page_id = Page where you want the cards to be created
        dvar = the column of interest which will become the y axis on all of the cards
        """
        if dataset_id == None:
            dataset_id = self.dataset_id
        
        if page_id == None:
            page_id = self.createPage('Boxplot Profile')
            
        querystring = {"skipValidation": "true", "pageId": page_id}
        data = jsons['templateBox']
        
        if prediction_type == 'Numeric':
            # continious DVAR against non-numeric cols
            columns = [x for x in columns if x['type'] in ('STRING')]
            for row in columns:
                new = json.dumps(data).replace('col_name', dvar).replace('dvar', row['name']).replace('dsid', dataset_id)
                response = requests.request("PUT", URL['card'].format(self.base_url), 
                                            data=new, headers=self.headers, params=querystring)
            
        elif prediction_type == 'Categorical':
            # discrete DVAR against numeric columns
            columns = [x for x in columns if x['type'] not in ('STRING', 'DATE')]
            for row in columns:
                new = json.dumps(data).replace('col_name', row['name']).replace('dvar', dvar).replace('dsid', dataset_id)
                response = requests.request("PUT", URL['card'].format(self.base_url), 
                                            data=new, headers=self.headers, params=querystring)

    
    def time_profile(self, pageId, dataset_id, time_series_x):
        cols = self.getSchemaFromDataSource()
        date_col = [col for col in cols if col['name'] == time_series_x][0]
        num_cols = [col for col in cols if col['type'] in ('LONG', 'DOUBLE', 'DECIMAL') and col['name'] != time_series_x]
        
        url = f"https://{self.instance}.domo.com/api/content/v3/cards/kpi"
        query_string = {"pageId": pageId}
        
        for num_col in num_cols:
            time_json = {"definition":{"subscriptions":{"main":{"name":"main","columns":[{"dataSourceId":"DSID","column":"X_VAR","mapping":"ITEM"},{"column":"Y_VAR","mapping":"VALUE","aggregation":"SUM"}],"filters":[],"orderBy":[{"aggregation": None, "column": "X_VAR", "order": "ASCENDING"}],"groupBy":[{"column":"X_VAR"}],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[],"dsDeleted":[],"card":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"conditionalFormats":{"card":[],"datasource":[]},"slicers":[],"segments":{"active":[],"create":[],"update":[],"delete":[]},"charts":{"main":{"component":"main","chartType":"badge_two_trendline","overrides":{},"goal":None}},"title":"TITLE","description":"","chartVersion":"8","variableControls":[]},"dataProvider":{"dataSourceId":"DSID"}}
            time_json_str = json.dumps(time_json)
            time_json_str = time_json_str \
                .replace('DSID', dataset_id) \
                .replace('X_VAR', date_col['name']) \
                .replace('Y_VAR', num_col['name']) \
                .replace('TITLE', f"{num_col['name']} over {date_col['name']}")
            requests.request('PUT', url, data=time_json_str, headers=self.headers, params=query_string)

        
    def post_card(self, pageID, chartType, title, aggregation, valueColumn, categoryColumn, seriesColumn=None, orientation='vert', summary=None):
        """
        Create a basic domo card
        
        Params:
            pageID: the id of the page the card will be placed on
            chartType: In theory any of the charts will work. In practice each chart is a little different. The ones I've tested are
            title: Title to be displayed on the card
            aggregation: Type of aggregation (options are listed below)
            valueColumn: Column name to be aggregated and displayed
            categoryColumn: Column name for categories on the axis
            seriesColumn: Column name for series. Not all cards support series. Default = None
            orientation: What direction should the chart be oriented. Default = 'vert'
            summary: Should there be a summary number. If True then it will be the aggregated value of the valueColumn. Default = None
            
        Returns:
            json response from Domo
        """
        querystring = {"newContainer":"false","pageId":pageID}
        data = copy.deepcopy(jsons['genericCardV2'])
        
        if summary == None:
            data['subscriptions'].pop('big_number')
        # Special Chart Cases            
        ## No Series Allowed
        if chartType in ['pie', 'histogram', 'boxplot', 'doughnut', 'facegauge', 'slicer', 'checkbox_selector']:
            seriesColumn = None
            
        ## No orientations        
        if chartType in ['pie', 'doughnut', 'facegauge', 'slicer', 'checkbox_selector', 'two_trendline'
                        ,'rttrndline']:
            orientation = ''
            
        ## Sometimes orientations?
        if chartType in ['two_trendline', 'stepline', 'curvedline'] and orientation == 'vert':
            orientation = ''

        ## No aggregations
        if chartType in ['histogram', 'boxplot']:
            data['subscriptions']['main']['columns'][1].pop('aggregation')
            
        if seriesColumn == None:
            data['subscriptions']['main']['columns'].pop(2)
            data['subscriptions']['main']['groupBy'].pop(1)
            
        new = json.dumps(data).replace('dsid',self.dataset_id).replace('cat_column',categoryColumn ).replace('values_column',valueColumn).replace('series_column',str(seriesColumn)).replace('orient','_'+orientation).replace('cType',chartType).replace('generatedTitle', title).replace('aggType',aggregation).replace('__','_')      
        r = requests.request("PUT", URL['cardV2'].format(profile.base_url), data=new
                         , headers=profile.headers
                         , params=querystring)
        return r
 

    ################## Cards ###################
    def cor_mat(self, datasetID, pageID):
        """
        Create a correlation matrix pivot table card. For use with the Data Science Validation Criteria Dataflow
        """
        
        querystring = {"newContainer":"false","pageId":pageID}
        data = jsons['corMat_pivotTable']
        new = json.dumps(data).replace('dsid', datasetID) 
        r = requests.request("PUT", URL['cardV2'].format(self.base_url), data=new,
                             headers=profile.headers, params=querystring)
        return r
    
    
    def validationFace(self, datasetID, pageID):
        """
        Create a face guage card. For use with the Data Science Validation Criteria Dataflow
        """
        
        querystring = {"newContainer":"false","pageId":pageID}
        data = jsons['validFace']
        new = json.dumps(data).replace('dsid', datasetID) 
        r = requests.request("PUT", URL['cardV2'].format(self.base_url), data=new
                         , headers=profile.headers
                         , params=querystring)
        return r

    
    def validErrorsTable(self, datasetID, pageID):
        """
        Create a table card listing errors found in data validation. For use with the Data Science Validation Criteria Dataflow
        """
        
        querystring = {"newContainer":"false","pageId":pageID}
        data = jsons['validErrorsTable']
        new = json.dumps(data).replace('dsid', datasetID) 
        r = requests.request("PUT",
                             URL['cardV2'].format(self.base_url),
                             data=new,
                             headers=profile.headers,
                             params=querystring)
        return r
        
        
    def summaryStatTable(self, datasetID, pageID):
        """
        Creates a table with summary statistics generated from the Data Science Validation Criteria Dataflow
        """
        
        querystring = {"newContainer":"false","pageId":pageID}
        data = jsons['summaryTable']
        new = json.dumps(data).replace('dsid', datasetID) 
        r = requests.request("PUT", URL['cardV2'].format(self.base_url), data=new
                         , headers=profile.headers
                         , params=querystring)
#         if r.status_code != 200:
#             print('The summary stats card didn\'t make it to Domo, Sorry. Check the json by assigning the function to a variable then calling var.content')
        return r
    
    
    def createCardFromTemplate(self, datasetID, pageID, template):
        """
            Creates card from prebuilt template, only chaning the dataset id
        """
        querystring = {"newContainer":"false","pageId":pageID}
        data = template
        new = json.dumps(data).replace('dsid', datasetID) 
        r = requests.request("PUT", 
                             URL['cardV2'].format(self.base_url),
                             data=new,
                             headers=profile.headers,
                             params=querystring)
#         if r.status_code != 200:
#             print('The card didn\'t make it to Domo, Sorry. Check the json by assigning the function to a variable then calling var.content')
        return r.status_code
        
        
    ###################### Pages ################   
    def createPage(self, title, parentPageId = None, description = "", isCollection = False):
        """
        Creates a page/collection (where you can put cards)
        
        Pararms:
            title: Name of page / collection
            parentPageId: Id of parent page. Required for subpage or collection. Default to None
            isCollection: Are you creating a collection? Default to False
        """

        if(isCollection): data = { 'parentPageId': parentPageId if parentPageId else 0, 'title': title, 'description': description, 'type': 'collection' }
        else: data = { 'parentPageId': '{}'.format(parentPageId if parentPageId else 0), 'title': title }
        r = requests.post(URL['createPage'].format(self.base_url), headers=self.headers, json=data)
        if r.status_code == SUCCESS:
            return str(r.json()['pageId'])
        else:
            print(f'ERROR: unable to create "{title}" page', r.status_code)
    
    
    def getPageDetails(self, pageId): 
        return requests.get(URL['getPageDetails'].format(self.base_url, pageId), headers=self.headers)

    
    ############### Data: Webforms and dataflows ################# 
    def dsMetaData(self, dsid):
        """
            Get information about a datasource:
                name
                owner
                columns
                shape
            
            Returns:
                json response from Domo
        """
        r = requests.get(URL['getDataSourceDetails'].format(self.base_url, dsid), headers=self.headers)
        if (r.status_code != SUCCESS):
            print('ERROR: meta data request failed')
        return r
    
    
    def getSchemaFromDataSource(self, datasetID = None):
        """
        Get columns names and types for the dataset id specified in the class

        Returns:
        Array: containing column name and column type
        """
        columnData = None
        if datasetID == None:
            datasetID = self.dataset_id
          
        r = requests.get(URL['getSchemaFromDataSource'].format(self.base_url, datasetID), headers=self.headers)

        if (r.status_code == SUCCESS):
            columnData = []
            columnData = r.json()['columnList']
            schema = {'columns': columnData}
        else:
            print('ERROR: unable to get dataset schema')
        return columnData

    
    def create_webform_datasource(self):
        """
        Create webform using json template
        For use in the scaffolding function
        """
        data_json = jsons['overrideWebform']
        return requests.post(URL['webforms'].format(self.base_url),
                             headers=self.headers, json=data_json)

    
    def createDataflow(self, jsonData):
        """
        Create dataflow using json template
        For use in the scaffolding function
        """
        response = requests.request("POST", URL['dataFlow'].format(self.base_url),
                                    headers=self.headers, data=jsonData)
        return response
    
    
    def create_fe_dataflow(self):
        """
        Create dataflow using json template modified for the feature engineering dataflow
        For use in the scaffolding function

        Returns:
        json response from Domo
        """
        data = json.dumps(jsons['fe_base'])
        fe_json = data.replace('user_id', str(self.user_id)).replace('ds_id', self.dataset_id)
        response = self.createDataflow(fe_json)
        return response

    
    def create_validation_criteria(self, featureEngID, overrideID):
        """
        Create dataflow using json template modified for the validation criteria dataflow
        For use in the scaffolding function

        Returns:
        json response from Domo
        """
        data = json.dumps(jsons['validationCriteria'])
        criteria = data.replace('user_id', str(self.user_id)).replace('ds_id', featureEngID).replace('overrideID', overrideID)
        response = self.createDataflow(criteria)
        return response
 
    def create_correlation_flow(self, dataset_id):
        data = json.dumps(jsons['correlationFlow'])
        data = data.replace('user_id', str(self.user_id)).replace('ds_id', dataset_id)
        response = self.createDataflow(data)
        return response
        
    def createValidationDF(self, featureEngID, criteriaID):
        """
        Create dataflow using json template modified for the validation dataflow
        For use in the scaffolding function

        Returns:
        json response from Domo
        """
        columns = self.getSchemaFromDataSource(featureEngID)
        for i in columns:
            i["dataType"] = i.pop('type')
            del i['id']
            
        data = copy.deepcopy(jsons['validation'])
        data['actions'][6]['additions']=columns
        
        new = json.dumps(data)
        validation = new.replace('user_id', str(self.user_id)).replace('fe_id', featureEngID).replace('vc_id', criteriaID)
        response = self.createDataflow(validation)
        return response
    
    
    def createModelSelectionDF(self, featureEngID):
        """
        Create dataflow using json template modified for the model selection dataflow
        For use in the scaffolding function

        Returns:
        json response from Domo
        """
        data = json.dumps(jsons['modelSelection'])
        modelSel = data.replace('user_id', str(self.user_id)).replace('dsid', featureEngID)
        response = self.createDataflow(modelSel)
        return response

    
    def createModelInferenceDF(self, featureEngID):
        """
        Create dataflow using json template modified for the model inference dataflow
        For use in the scaffolding function

        Returns:
        json response from Domo
        """
        data = json.dumps(jsons['modelInference'])
        modelSel = data.replace('user_id', str(self.user_id)).replace('dsid', featureEngID)
        response = self.createDataflow(modelSel)
        return response
    
    
    def start_dataflow(self, dfid):
        response = requests.request("POST",
                                    URL['dataflowRun'].format(self.base_url, dfid),
                                    headers=self.headers, data='{}')
        return response
    
    
    def run_dataflow(self, dfid, dsid, name=''):
        self.start_dataflow(dfid)
        
        print(f'INFO: running {name} dataflow')
        
        retry = 100
        sleep_time = 30
        n_try = 0
        while n_try < retry:
            time.sleep(sleep_time)
            if self.dsMetaData(dsid).json()['rowCount']:
                break
            n_try += 1
        
        # this Boolean indicates whether the loop timed-out (fail) or didn't (success!)
        finished = n_try < retry
        if not finished:
              print(f'WARN: {name} dataflow didn\'t finish in {retry * sleep_time}')
              
        return finished
    
    
    def addTagToDataSource(self, dsId, tag, kind): 
        if dsId:
            if (kind == 'DATAFLOW'): 
                return requests.put(URL['addTagToDataFlow'].format(self.base_url,dsId), headers=self.headers, json={'flowId': dsId, 'tags': tag})
            else: 
                return requests.post(URL['addTagToDataSource'].format(self.base_url,dsId), headers=self.headers, json=tag)

            
    def get_unique_values(self, col):
        data = '{"query": {"columns": [{"exprType": "COLUMN", "column": "<COL>"}], "groupByColumns": [{"exprType": "COLUMN", "column": "<COL>"}]}}'
        data = data.replace('<COL>', col)
        
        resp = requests.post(
            f'https://{self.instance}.domo.com/api/query/v1/execute/{self.dataset_id}',
            headers=self.headers,
            data=data
        )

        return [x for y in resp.json()['rows'] for x in y]
        
        
    def scaffolfdingTags(self, tags, fe):
        if fe:
            fe_tags = ['feature engineering'] + tags
            self.addTagToDataSource(self.fe['feDS_id'], fe_tags, 'DATASET')
            self.addTagToDataSource(self.fe['feDF_id'], fe_tags, 'DATAFLOW')
            
        self.addTagToDataSource(self.bias['bias_DF'], tags, 'DATAFLOW')
        self.addTagToDataSource(self.bias['bias_DS'], tags, 'DATASET')
        
        valid_tags = ['validation'] + tags
        self.addTagToDataSource(self.validation['correlationDF_id'], valid_tags, 'DATAFLOW')
        self.addTagToDataSource(self.validation['criteriaDF_id'], valid_tags, 'DATAFLOW')
        self.addTagToDataSource(self.validation['validationDF_id'], valid_tags, 'DATAFLOW')
        self.addTagToDataSource(self.validation['validDS_id'], valid_tags, 'DATASET')
        self.addTagToDataSource(self.validation['summaryStats_id'], valid_tags, 'DATASET')
        self.addTagToDataSource(self.validation['override_id'], valid_tags, 'DATASET')
        self.addTagToDataSource(self.validation['correlation_id'], valid_tags, 'DATASET')
        self.addTagToDataSource(self.validation['validDS_id'], valid_tags, 'DATASET')
        self.addTagToDataSource(self.validation['errorDS_id'], valid_tags, 'DATASET')
        
        model_tags = ['modeling'] + tags
        self.addTagToDataSource(self.modeling['select_DF'], model_tags, 'DATAFLOW')
        self.addTagToDataSource(self.modeling['inference_DF'], model_tags, 'DATAFLOW')
        self.addTagToDataSource(self.modeling['inferenceDS'], model_tags, 'DATASET')
        self.addTagToDataSource(self.modeling['selectionDS'], model_tags, 'DATASET')
        self.addTagToDataSource(self.modeling['varImpDS'], model_tags, 'DATASET')
 

    def bias_profile(self, dvar, dvar_true):
        dfid, dsid = self.create_bias_metrics(dvar, dvar_true)
        done = self.run_dataflow(dfid, dsid, name='bias metrics')
        if done:
            self.create_bias_cards(dsid)
                    
                
    def create_bias_metrics(self, dvar, dvar_true):
        bias_metric_json = {"documentVersion":1,"databaseType":"MAGIC","name":"Bias Metrics","scheduleInifo":None,"settings":{},"description":"","responsibleUserId":"user_id","runState":"ENABLED","actions":[{"name":"data_input","id":"125c860a-90af-4f3d-b843-f33c4602c7b9","type":"LoadFromVault","gui":{"x":288,"y":48,"color":3238043},"dependsOn":[],"removeByDefault":False,"notes":[],"dataSourceId":"dsid","executeFlowWhenUpdated":True,"onlyLoadNewVersions":False,"columnSettings":{},"visiblePartitionColumn":None},{"name":"Python Script","id":"ce42139c-d386-4a3b-b011-b421ab1f3f07","type":"PythonEngineAction","gui":{"x":408,"y":48},"dependsOn":["125c860a-90af-4f3d-b843-f33c4602c7b9"],"removeByDefault":True,"notes":[],"fillMissingWithNull":True,"script":"from domomagic import read_dataframe, write_dataframe\nimport numpy as np\nimport pandas as pd\n\ndf = read_dataframe('data_input')\nres = pd.DataFrame(columns=['feature', 'group', 'CI', 'DPL', 'KL', 'JS', 'LP', 'KS', 'TVD'])\n\ndvar = <dvar>\ndvar_true = <dvar_true>\n\n_df = df.copy()\nfor col in [col for col in df.select_dtypes(['number']).columns if col != dvar]:\n    _75 = df[col].describe()['75%']\n    _df[f'{col}_{_75}'] = _df[col].apply(lambda x: f'gt_{_75}' if x > _75 else f'lte_{_75}')\n\n_df = _df.select_dtypes(['object']).copy()\nif dvar not in _df:\n    _df[dvar] = df[dvar]\n\ndef calc_KL(Pa, Pd):\n    return sum(Pa * np.log(Pa / Pd))\n\nfor col in _df.drop(columns=[dvar]).columns:\n    for group in _df[col].unique():\n        \n        na = len(_df[_df[col] != group])\n        nd = len(_df[_df[col] == group])\n        na1 = len(_df[(_df[col] != group) & (_df[dvar] == dvar_true)])\n        nd1 = len(_df[(_df[col] == group) & (_df[dvar] == dvar_true)])\n        \n        print(col, group)\n        \n        unique_y = _df[dvar].unique()\n        Pa, Pd = [], []\n        for y in unique_y:\n            Pa.append(len(_df[(_df[col] != group) & (_df[dvar] == y)]) / na)\n            Pd.append(len(_df[(_df[col] == group) & (_df[dvar] == y)]) / nd)\n            \n        Pa = np.array(Pa)\n        Pd = np.array(Pd)\n        P = (Pa + Pd) / 2\n        \n        CI = (na - nd) / (na + nd)\n        DPL = (na1 / na) - (nd1 / nd)\n        KL = calc_KL(Pa, Pd)\n        JS = (calc_KL(Pa, P) + calc_KL(Pd, P)) / 2\n        KS = max(abs(Pa - Pd))\n        LP = (sum(((Pa - Pd)**2)))**(1/2)\n        TVD = sum(abs(Pa - Pd)) / 2\n        \n        res.loc[len(res.index)] = [col, group, CI, DPL, KL, JS, LP, KS, TVD]\n\nwrite_dataframe(res)","additions":[{"name":"feature","dataType":"STRING"},{"name":"group","dataType":"STRING"},{"name":"CI","dataType":"DOUBLE"},{"name":"DPL","dataType":"DOUBLE"},{"name":"KL","dataType":"DOUBLE"},{"name":"JS","dataType":"DOUBLE"},{"name":"LP","dataType":"DOUBLE"},{"name":"KS","dataType":"DOUBLE"},{"name":"TVD","dataType":"DOUBLE"}]},{"name":"output_data","id":"ca816978-08da-42b0-9196-f183c1546721","type":"PublishToVault","gui":{"x":528,"y":48},"dependsOn":["ce42139c-d386-4a3b-b011-b421ab1f3f07"],"removeByDefault":False,"notes":[],"dataSource":{"name":"Bias Metrics"},"partitionIdColumns":[]}],"onboardFlowVersion":{},"engineProperties":{"kettle.mode":"STRICT"}}
        data = json.dumps(bias_metric_json)
        if isinstance(dvar_true, str):
            dvar_true = f"'{dvar_true}'"
        
        data = (
            data.replace('<dvar>', f"'{dvar}'")
                .replace('<dvar_true>', str(dvar_true))
                .replace('user_id', str(self.user_id))
                .replace('dsid', self.fe['feDS_id'])
        )

        resp = self.createDataflow(data)
        self.bias['bias_DF'] = resp.json()['id']
        self.bias['bias_DS'] = resp.json()['actions'][-1]['dataSource']['guid']
        
        return self.bias['bias_DF'], self.bias['bias_DS']
    
    
    def create_bias_cards(self, bias_ds_id):
        bias_heatmap_json = {"definition":{"subscriptions":{"main":{"name":"main","columns":[{"column":"feature","mapping":"VALUE",},{"column":"group","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"CI","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"DPL","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"JS","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"KL","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"KS","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"LP","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"column":"TVD","mapping":"VALUE","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}}],"filters":[],"orderBy":[],"groupBy":[],"fiscal":False,"projection":False,"distinct":False}},"title":"TODO","chartVersion":"7","charts":{"main":{"component":"main","chartType":"badge_heatmap_table","overrides":{"use_logscale":"false","range_by_column":"true","color_theme":"gradient-7"},"goal":None}},},"dataProvider":{"dataSourceId":"TODO"}}
        bias_heatmap_agg_json = {"definition":{"subscriptions":{"main":{"name":"main","columns":[{"column":"feature","mapping":"VALUE"},{"aggregation":"AVG","formulaId":"abs_ci","mapping":"VALUE","alias":"CI","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_dpl","mapping":"VALUE","alias":"DPL","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_js","mapping":"VALUE","alias":"JS","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_kl","mapping":"VALUE","alias":"KL","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_ks","mapping":"VALUE","alias":"KS","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_lp","mapping":"VALUE","alias":"LP","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},{"aggregation":"AVG","formulaId":"abs_tvd","mapping":"VALUE","alias":"TVD","format":{"type":"number","format":"###,###.000","commas":True,"percentMultiplied":True,"precision":3}},],"filters":[],"orderBy":[],"groupBy":[{"column":"feature"}],"fiscal":False,"projection":False,"distinct":False}},"formulas":{"dsUpdated":[],"dsDeleted":[],"card":[{"id":"abs_ci","name":"ABS_CI","formula":"ABS(`CI`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_dpl","name":"ABS_DPL","formula":"ABS(`DPL`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_js","name":"ABS_JS","formula":"ABS(`JS`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_kl","name":"ABS_KL","formula":"ABS(`KL`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_ks","name":"ABS_KS","formula":"ABS(`KS`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_lp","name":"ABS_LP","formula":"ABS(`LP`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False},{"id":"abs_tvd","name":"ABS_TVD","formula":"ABS(`TVD`)","status":"VALID","persistedOnDataSource":False,"dataType":"numeric","isAggregatable":True,"bignumber":False}]},"conditionalFormats":{"card":[],"datasource":[]},"annotations":{"new":[],"modified":[],"deleted":[]},"slicers":[],"title":"TODO","description":"","chartVersion":"7","charts":{"main":{"component":"main","chartType":"badge_heatmap_table","overrides":{"use_logscale":"false","range_by_column":"true","color_theme":"gradient-9"},"goal":None}},"allowTableDrill":True,"segments":{"active":[],"create":[],"update":[],"delete":[]},},"dataProvider":{"dataSourceId":"TODO"}}
        
        heatmaps = [
            ('Bias Heatmap', bias_heatmap_json),
            ('Bias Heatmap Summary', bias_heatmap_agg_json)
        ]

        url = f"https://{self.instance}.domo.com/api/content/v3/cards/kpi"
        query_string = {"pageId": self.edaPage}
        responses = []
        for name, data in heatmaps:
            data['definition']['title'] = name
            data['dataProvider']['dataSourceId'] = bias_ds_id
            r = requests.request("PUT", url, data=json.dumps(data), headers=self.headers, params=query_string)
            responses.append(r)
        
        # make the notebook card
        url = f'https://{self.instance}.domo.com/api/content/v1/cards/notebook'
        data = {
            "textHtml": "<div class=\"centerAlignedContent\"><div style='white-space: pre-wrap'><table><tr><td style=\"width:11.03896103896104%\"><div><span style=\"font-size:18px\">﻿</span><a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">﻿</span></a><span style=\"font-size:18px\">﻿</span><a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Class Imbalance (CI)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">(</span><span style=\"font-style:italic;font-size:18px\">na</span><span style=\"font-size:18px\"> - </span><span style=\"font-style:italic;font-size:18px\">nd</span><span style=\"font-size:18px\">) / (</span><span style=\"font-style:italic;font-size:18px\">na</span><span style=\"font-size:18px\"> + </span><span style=\"font-style:italic;font-size:18px\">nd</span><span style=\"font-size:18px\">)</span></div></td><td style=\"width:69.48051948051948%\"><div>Measures the imbalance in the number of members between different facet values</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [-1, 1]</div></li><li style=\"text-align:left\"><div>Positive values indicate the <span style=\"font-style:italic\">facet a</span> has more training samples in the dataset</div></li><li style=\"text-align:left\"><div>Values near zero indicate the facets are balanced in the number of training samples in the dataset</div></li><li style=\"text-align:left\"><div>Negative values indicate the <span style=\"font-style:italic\">facet d</span> has more training samples in the dataset</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div>﻿<a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-true-label-imbalance.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Difference in Proportions of Labels (DPL)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">(</span><span style=\"font-style:italic;font-size:18px\">na1</span><span style=\"font-size:18px\"> / </span><span style=\"font-style:italic;font-size:18px\">na</span><span style=\"font-size:18px\">) - (</span><span style=\"font-style:italic;font-size:18px\">nd1</span><span style=\"font-size:18px\"> / </span><span style=\"font-style:italic;font-size:18px\">nd</span><span style=\"font-size:18px\">)</span></div></td><td style=\"width:69.48051948051948%\"><div>Measures the imbalance of positive outcomes between different facet values</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [-1, 1]</div></li><li style=\"text-align:left\"><div>Positive values indicate <span style=\"font-style:italic\">facet a</span> has a higher proportion of positive outcomes</div></li><li style=\"text-align:left\"><div>Values near zero indicate a more equal proportion of positive outcomes between facets</div></li><li style=\"text-align:left\"><div>Negative values indicate <span style=\"font-style:italic\">facet d</span> has a higher proportion of positive outcomes</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div>﻿<a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kl-divergence.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Kullback-Leibler Divergence (KL)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">sum(</span><span style=\"font-style:italic;font-size:18px\">Pa</span><span style=\"font-size:18px\"> * log(</span><span style=\"font-style:italic;font-size:18px\">Pa</span><span style=\"font-size:18px\"> / </span><span style=\"font-style:italic;font-size:18px\">Pd</span><span style=\"font-size:18px\">)) </span></div></td><td style=\"width:69.48051948051948%\"><div>Measures how much the outcome distributions of different facets diverge from each other entropically</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [0, inf)</div></li><li style=\"text-align:left\"><div>Values near zero indicate the labels are similarly distributed</div></li><li style=\"text-align:left\"><div>Positive values indicate the label distributions diverge, the more positive the larger the divergence</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div>﻿<a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-jensen-shannon-divergence.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Jensen-Shannon Divergence (JS)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">0.5 * ( KL(</span><span style=\"font-style:italic;font-size:18px\">Pa,</span><span style=\"font-size:18px\"> </span><span style=\"font-style:italic;font-size:18px\">P</span><span style=\"font-size:18px\">) + KL(</span><span style=\"font-style:italic;font-size:18px\">Pd</span><span style=\"font-size:18px\">, </span><span style=\"font-style:italic;font-size:18px\">P</span><span style=\"font-size:18px\">) )</span></div><div>﻿</div><div>where <span style=\"font-style:italic\">P</span> is avg(<span style=\"font-style:italic\">Pa</span>, <span style=\"font-style:italic\">Pd</span>)</div></td><td style=\"width:69.48051948051948%\"><div>Measures how much the outcome distributions of different facets diverge from each other entropically</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [0, inf)</div></li><li style=\"text-align:left\"><div>Values near zero indicate the labels are similarly distributed</div></li><li style=\"text-align:left\"><div>Positive values indicate the label distributions diverge, the more positive the larger the divergence</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div>﻿<a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-lp-norm.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Lp-Norm (LP)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">(sum((</span><span style=\"font-style:italic;font-size:18px\">Pa</span><span style=\"font-size:18px\"> - </span><span style=\"font-style:italic;font-size:18px\">Pd</span><span style=\"font-size:18px\">)^2))^0.5</span></div></td><td style=\"width:69.48051948051948%\"><div>Measures a p-norm difference between distinct demographic distributions of the outcomes associated with different facets in a dataset</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [0, inf)</div></li><li style=\"text-align:left\"><div>Values near zero indicate the labels are similarly distributed</div></li><li style=\"text-align:left\"><div>Positive values indicate the label distributions diverge, the more positive the larger the divergence</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div><span style=\"font-size:18px\">﻿</span><a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-total-variation-distance.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Total Variation Distance (TVD)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">0.5 * sum(abs(Pa - Pd))</span></div></td><td style=\"width:69.48051948051948%\"><div>Measures half of the L1-norm difference between distinct demographic distributions of the outcomes associated with different facets in a dataset</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [0, inf)</div></li><li style=\"text-align:left\"><div>Values near zero indicate the labels are similarly distributed</div></li><li style=\"text-align:left\"><div>Positive values indicate the label distributions diverge, the more positive the larger the divergence</div></li></ul></td></tr><tr><td style=\"width:11.03896103896104%\"><div><span style=\"font-size:18px\">﻿</span><a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kolmogorov-smirnov.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\"><span style=\"font-size:18px\">Kolmogorov-Smirnov (KS)</span></a><span style=\"font-size:18px\">﻿</span></div></td><td style=\"width:19.480519480519476%\"><div><span style=\"font-size:18px\">max(abs(</span><span style=\"font-style:italic;font-size:18px\">Pa</span><span style=\"font-size:18px\"> - </span><span style=\"font-style:italic;font-size:18px\">Pd</span><span style=\"font-size:18px\">))</span></div></td><td style=\"width:69.48051948051948%\"><div>Measures maximum divergence between outcomes in distributions for different facets in a dataset</div><div>﻿</div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div>Range: [0, 1]</div></li><li style=\"text-align:left\"><div>Values near zero indicate the labels were evenly distributed between facets in all outcome categories</div></li><li style=\"text-align:left\"><div>Values near one indicate the labels for one category were all in one facet, so very imbalanced</div></li><li style=\"text-align:left\"><div>Intermittent values indicate relative degrees of maximum label imbalance</div></li></ul></td></tr></table><div>﻿</div><div><span style=\"font-size:24px\">Notation</span></div><ul style=\"list-style-type:disc\"><li style=\"text-align:left\"><div><span style=\"font-weight:600;font-style:italic;font-size:18px\">facet a  </span>the feature value that defines a demographic that bias favors</div></li><li style=\"text-align:left\"><div><span style=\"font-weight:600;font-style:italic;font-size:18px\">facet d  </span>the feature value that defines a demographic that bias disfavors</div></li><li style=\"text-align:left\"><div><span style=\"font-style:italic;font-weight:600;font-size:18px\">na  </span>the number of observed labels for the favored facet value</div></li><li style=\"text-align:left\"><div><span style=\"font-weight:600;font-style:italic;font-size:18px\">nd  </span>the number of observed labels for the disfavored facet value</div></li><li style=\"text-align:left\"><div><span style=\"font-weight:600;font-style:italic;font-size:18px\">na1  </span>the number of observed labels for positive outcomes for <span style=\"font-style:italic\">facet a</span></div></li><li style=\"text-align:left\"><div><span style=\"font-weight:600;font-style:italic;font-size:18px\">nd1  </span>the number of observed labels for positive outcomes for <span style=\"font-style:italic\">facet d</span></div></li><li style=\"text-align:left\"><div><span style=\"font-size:18px;font-weight:600;font-style:italic\">Pa  </span>the probability distribution of the observed labels for <span style=\"font-style:italic\">facet a</span></div></li><li style=\"text-align:left\"><div><span style=\"font-size:18px;font-weight:600;font-style:italic\">Pd  </span>the probability distribution of the observed labels for <span style=\"font-style:italic\">facet d</span></div></li></ul><div>﻿</div><div>﻿</div><div>(Adapted from the <a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-measure-data-bias.html\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">SageMaker documentation</a>)</div></div></div>",
            "markup": "{\"object\":\"value\",\"document\":{\"object\":\"document\",\"data\":{\"vertAlignment\":\"center\"},\"nodes\":[{\"object\":\"block\",\"type\":\"table\",\"data\":{\"cell-percent-width\":[11.03896103896104,19.480519480519476,69.48051948051948]},\"nodes\":[{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Class Imbalance (CI)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Class Imbalance (CI)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-bias-metric-class-imbalance.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Class Imbalance (CI)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"na\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" - \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"nd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\") / (\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"na\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" + \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"nd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\")\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures the imbalance in the number of members between different facet values\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [-1, 1]\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate the \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet a\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" has more training samples in the dataset\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the facets are balanced in the number of training samples in the dataset\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Negative values indicate the \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet d\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" has more training samples in the dataset\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Difference in Proportions of Labels (DPL)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-true-label-imbalance.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Difference in Proportions of Labels (DPL)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"na1\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" / \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"na\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\") - (\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"nd1\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" / \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"nd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\")\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures the imbalance of positive outcomes between different facet values\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [-1, 1]\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet a\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" has a higher proportion of positive outcomes\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate a more equal proportion of positive outcomes between facets\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Negative values indicate \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet d\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" has a higher proportion of positive outcomes\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Kullback-Leibler Divergence (KL)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kl-divergence.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Kullback-Leibler Divergence (KL)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"sum(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pa\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" * log(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pa\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" / \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\")) \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures how much the outcome distributions of different facets diverge from each other entropically\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [0, inf)\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the labels are similarly distributed\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate the label distributions diverge, the more positive the larger the divergence\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Jensen-Shannon Divergence (JS)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-jensen-shannon-divergence.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Jensen-Shannon Divergence (JS)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"0.5 * ( KL(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pa,\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"P\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\") + KL(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\", \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"P\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\") )\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"where \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"P\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" is avg(\",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"Pa\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\", \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"Pd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\")\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures how much the outcome distributions of different facets diverge from each other entropically\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [0, inf)\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the labels are similarly distributed\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate the label distributions diverge, the more positive the larger the divergence\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Lp-Norm (LP)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-lp-norm.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Lp-Norm (LP)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"(sum((\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pa\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" - \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\")^2))^0.5\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures a p-norm difference between distinct demographic distributions of the outcomes associated with different facets in a dataset\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [0, inf)\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the labels are similarly distributed\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate the label distributions diverge, the more positive the larger the divergence\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Total Variation Distance (TVD)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-total-variation-distance.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Total Variation Distance (TVD)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"0.5 * sum(abs(Pa - Pd))\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures half of the L1-norm difference between distinct demographic distributions of the outcomes associated with different facets in a dataset\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [0, inf)\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the labels are similarly distributed\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Positive values indicate the label distributions diverge, the more positive the larger the divergence\",\"marks\":[]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"tr\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"Kolmogorov-Smirnov (KS)\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-data-bias-metric-kolmogorov-smirnov.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Kolmogorov-Smirnov (KS)\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"max(abs(\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pa\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\" - \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"Pd\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"))\",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"td\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Measures maximum divergence between outcomes in distributions for different facets in a dataset\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Range: [0, 1]\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near zero indicate the labels were evenly distributed between facets in all outcome categories\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Values near one indicate the labels for one category were all in one facet, so very imbalanced\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Intermittent values indicate relative degrees of maximum label imbalance\",\"marks\":[]}]}]}]}]}]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Notation\",\"marks\":[{\"object\":\"mark\",\"type\":\"h1\",\"data\":{}}]}]}]},{\"object\":\"block\",\"type\":\"ul\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"facet a  \",\"marks\":[{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the feature value that defines a demographic that bias favors\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"facet d  \",\"marks\":[{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the feature value that defines a demographic that bias disfavors\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"na  \",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the number of observed labels for the favored facet value\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"nd  \",\"marks\":[{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the number of observed labels for the disfavored facet value\",\"marks\":[]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"na1  \",\"marks\":[{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the number of observed labels for positive outcomes for \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet a\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"TableContent\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"nd1  \",\"marks\":[{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}},{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the number of observed labels for positive outcomes for \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet d\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Pa  \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}},{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the probability distribution of the observed labels for \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet a\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]}]}]}]},{\"object\":\"block\",\"type\":\"li\",\"data\":{},\"nodes\":[{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"Pd  \",\"marks\":[{\"object\":\"mark\",\"type\":\"h2\",\"data\":{}},{\"object\":\"mark\",\"type\":\"strong\",\"data\":{}},{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]},{\"object\":\"leaf\",\"text\":\"the probability distribution of the observed labels for \",\"marks\":[]},{\"object\":\"leaf\",\"text\":\"facet d\",\"marks\":[{\"object\":\"mark\",\"type\":\"em\",\"data\":{}}]}]}]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"\",\"marks\":[]}]}]},{\"object\":\"block\",\"type\":\"p\",\"data\":{},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"(Adapted from the \",\"marks\":[]}]},{\"object\":\"inline\",\"type\":\"a\",\"data\":{\"displayName\":\"SageMaker documentation\",\"url\":\"https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-measure-data-bias.html\",\"openInNewWindow\":true,\"configType\":\"WEBLINK\"},\"nodes\":[{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\"SageMaker documentation\",\"marks\":[]}]}]},{\"object\":\"text\",\"leaves\":[{\"object\":\"leaf\",\"text\":\")\",\"marks\":[]}]}]}]}}",
            "title": "Bias Metric Glossary",
            "encoding": "html",
            "pageId": self.edaPage,
            "version": "v3"
        }
        r = requests.post(url, headers=profile.headers, data=json.dumps(data))
        responses.append(r)
        
        return responses    
    
    
    ##################### Workflows ###################   
    def scaffolding(
        self, 
        page_name='Data Profile', 
        tags=['DataProfiler'],  
        fe_flow=False,
        correlation_flow=False,
        validation_flow=False,
        model_flow=False,
        hist_profile=False,
        scat_profile=False,
        box_profile=False,
        time_profile=False,
        dependent_var=None,
        dependent_var_true=None,
        time_series_x=None,
        prediction_type=None,
    ):
        """
        Create mutliple dataflows, datasets, pages and cards to create the standard template
        used by the Domo Data Science Consulting Team
        """
        
        print('INFO: starting scaffolding')
            
        if dependent_var == 'none':
            dependent_var = None
        
        if time_series_x == 'none':
            time_series_x = None
        
        if prediction_type == 'none':
            prediction_type = None
        
        bias_profile = dependent_var_true != 'none'
        
        if any([scat_profile, box_profile]) and (not dependent_var):
            print('ERROR: missing `dependent_var` (required for scat/box profiles)')
            return
        
        #
        #  feature engineering dataflow
        #
        if fe_flow:
            fe = self.create_fe_dataflow()
            if fe.status_code == SUCCESS:
                self.fe['feDF_id'] = str(fe.json()['id'])
                self.fe['feDS_id'] = str(fe.json()['outputs'][0]['dataSourceId'])
                self.run_dataflow(self.fe['feDF_id'], self.fe['feDS_id'], name='feature engineering')
            else:
                print('ERROR: failed to create feature engineering dataflow:', fe.status_code)
                print('ERROR: stopping scaffolding')
                return
        else:
            self.fe['feDS_id'] = self.dataset_id
        
        #
        #  correlation flow
        #
        if correlation_flow:
            cf = self.create_correlation_flow(self.fe['feDS_id'])
            if cf.status_code == SUCCESS:
                self.validation['correlationDF_id'] = cf.json()['id']
                self.validation['correlation_id'] = cf.json()['outputs'][0]['dataSourceId']
                self.run_dataflow(self.validation['correlationDF_id'], self.validation['correlation_id'], name='correlation')
        
        #
        #  validation flows
        #
        if validation_flow:
            wf = self.create_webform_datasource()
            self.validation['override_id'] = wf.json()['dataSource']['id']
            vc = self.create_validation_criteria(self.fe['feDS_id'], self.validation['override_id'])
            if vc.status_code == SUCCESS:
                self.validation['summaryStats_id'] = vc.json()['outputs'][0]['dataSourceId']
                self.validation['criteriaDF_id'] = str(vc.json()['id'])
                self.run_dataflow(self.validation['criteriaDF_id'], self.validation['summaryStats_id'], name='validation criteria')
            else:
                print('ERROR: failed to create validation criteria dataflow:', vc.status_code)
                print('ERROR: stopping scaffolding')
                return
        
            v = self.createValidationDF(self.fe['feDS_id'], self.validation['summaryStats_id'])
            if v.status_code == SUCCESS:
                self.validation['validationDF_id'] = str(v.json()['id'])
                x = self.dsMetaData(v.json()['outputs'][0]['dataSourceId'])
                if x.json()['name'] == 'Validated Data':
                    self.validation['validDS_id'] = v.json()['outputs'][0]['dataSourceId']
                    self.validation['errorDS_id'] = v.json()['outputs'][1]['dataSourceId']
                else:
                    self.validation['validDS_id'] = v.json()['outputs'][1]['dataSourceId']
                    self.validation['errorDS_id'] = v.json()['outputs'][0]['dataSourceId']
                self.run_dataflow(self.validation['validationDF_id'], self.validation['errorDS_id'], name='validation')
            else:
                print('ERROR: failed to create validation dataflow:', v.status_code)
                print('ERROR: stopping scaffolding')
                return
        
        #
        #  modeling flows
        #
        if model_flow:
            m1 = self.createModelSelectionDF(self.fe['feDS_id'])
            m2 = self.createModelInferenceDF(self.fe['feDS_id'])
            if m1.status_code == SUCCESS and m2.status_code == SUCCESS:
                self.modeling['select_DF'] = m1.json()['id']
                self.modeling['inference_DF'] = m2.json()['id']

                m = profile.dsMetaData(m1.json()['outputs'][0]['dataSourceId'])
                if m.json()['name'] == 'Variable Importance':                    
                    self.modeling['varImpDS'] = m1.json()['outputs'][0]['dataSourceId']
                    self.modeling['selectionDS'] = m1.json()['outputs'][1]['dataSourceId']
                else:                    
                    self.modeling['varImpDS'] = m1.json()['outputs'][1]['dataSourceId']
                    self.modeling['selectionDS'] = m1.json()['outputs'][0]['dataSourceId']

                self.modeling['inferenceDS'] = m2.json()['outputs'][0]['dataSourceId']
                
                self.run_dataflow(self.modeling['select_DF'], self.modeling['varImpDS'], name='model selection') 
                self.run_dataflow(self.modeling['inference_DF'], self.modeling['inferenceDS'], name='model inference')
            else: 
                print('ERROR: failed to create model dataflow:', m1.status_code, m2.status_code)
                print('ERROR: stopping scaffolding')
                return 
                
        #
        #  cards and pages
        #
        needs_top_page = any([
            bias_profile, hist_profile, scat_profile,
            box_profile, correlation_flow,
            validation_flow, model_flow, time_series_x
        ])
        
        if needs_top_page:
            print(f'INFO: creating "{page_name}" page')
            self.topPage = self.createPage(page_name)
        
        if any([bias_profile, hist_profile, scat_profile, box_profile, time_series_x, correlation_flow, validation_flow]):
            print(f'INFO: creating "Exploratory Data Analysis" page')
            self.edaPage = self.createPage('Exploratory Data Analysis', self.topPage)        
        
            if validation_flow:
                print(f'INFO: creating "Model/Data Validation" page')
                self.modPage = self.createPage('Model/Data Validation', self.topPage)
                self.summaryStatTable(self.validation['summaryStats_id'], self.edaPage)
                self.dataVal = self.createPage('Data Validation', self.modPage, "", True)
                self.createCardFromTemplate(self.validation['errorDS_id'], self.dataVal, jsons['validFace'])
                self.createCardFromTemplate(self.validation['errorDS_id'], self.dataVal, jsons['validErrorsTable'])

            if correlation_flow:
                self.cor_mat(self.validation['correlation_id'], self.edaPage)
                
            if bias_profile:
                print('INFO: starting bias profile')
                self.bias_profile(dependent_var, dependent_var_true)

            if hist_profile:
                print('INFO: starting histogram profile')
                hp = self.createPage('Histograms', self.edaPage)
                columns = self.getSchemaFromDataSource(self.fe['feDS_id'])
                self.hist_profile(columns, hp, 'Default', self.fe['feDS_id'])

            if scat_profile:
                print('INFO: starting scatterplot profile')
                sp = self.createPage('Scatterplots', self.edaPage)
                columns = self.getSchemaFromDataSource(self.fe['feDS_id'])
                self.scatter_profile(columns, dependent_var, sp, self.fe['feDS_id'])

            if box_profile and prediction_type:
                print('INFO: starting boxplot profile')
                bp = self.createPage('Boxplots', self.edaPage)
                columns = self.getSchemaFromDataSource(self.fe['feDS_id'])
                self.boxplot_profile(columns, dependent_var, bp, self.fe['feDS_id'], prediction_type)
                
            if time_series_x:
                print('INFO: starting time series profile')
                self.time_profile(self.edaPage, self.fe['feDS_id'], time_series_x)
            
        if model_flow:
            if not self.modPage:
                print(f'INFO: creating "Model/Data Validation" page')
                self.modPage = self.createPage('Model/Data Validation', self.topPage)
            self.modelAcc = self.createPage('Model Accuracy', self.modPage, "", True)
            self.modelVal = self.createPage('Model Details', self.modPage, "", True)
            self.createCardFromTemplate(self.modeling['varImpDS'], self.modelVal, jsons['varImp'])
            self.createCardFromTemplate(self.modeling['varImpDS'], self.modelVal, jsons['varImp2'])
            self.createCardFromTemplate(self.modeling['selectionDS'], self.modelAcc, jsons['accuracy'])
            
        self.scaffolfdingTags(tags=tags, fe=fe_flow)
    
        print('INFO: finished scaffolding')
        
#
# GUI 
#
hidden = Layout(display='none')
lbl_layout = Layout(display='flex', justify_content='flex-end', align_items='center', width='200px', padding='10px')
lbl_layout2 = Layout(display='flex', justify_content='flex-end', align_items='center', width='150px', padding='10px')
lbl_input_layout = Layout(display='flex', justify_content='flex-start', padding='5px 0 0 0')
lbl_input_layout2 = Layout(display='flex', justify_content='flex-start', padding='15px 0 0 0')
lbl_input_layout3 = Layout(display='flex', justify_content='flex-start', padding='15px 0 0 0', margin='15px 0 0 0')
heading = '<p style="font-size: 1.2rem; margin: 50px 0 20px 70px; width: 450px; border-bottom: 1px solid black;">TEXT</p>'

# inline radio buttons
display(ipyHTML('''
<style>
  .widget-radio-box { display: flex; flex-direction: row !important; justify-content: space-evenly; }
  .widget-radio-box label { margin: 0 5px !important; width:100px !important; }
</style>
'''))

instance_txt = Text()
instance_box = HBox([Label('Instance', layout=lbl_layout), instance_txt], layout=lbl_input_layout)
login_type_rad = RadioButtons(options=['Direct', 'SSO'], layout=Layout(padding='0 0 0 200px'))
email_txt = Text()
email_box = HBox([Label('Email', layout=lbl_layout), email_txt], layout=lbl_input_layout)
password_txt = Password()
password_lbl = Label('Password', layout=lbl_layout)
password_box = HBox([password_lbl, password_txt], layout=lbl_input_layout)
dataset_id_txt = Text()
dataset_id_box = HBox([Label('Dataset ID', layout=lbl_layout), dataset_id_txt], layout=lbl_input_layout)
login_btn = Button(description='Login')
login_box = HBox([Label('', layout=lbl_layout), login_btn], layout=lbl_input_layout2)
login_out = Output()

profile = 0

@login_out.capture()
def run_login_form(_):
    global profile    
    login_out.clear_output()
    with login_out:
        sso = login_type_rad.value == 'SSO'
        profile = Profiler(instance_txt.value, dataset_id_txt.value, email_txt.value, password_txt.value, sso)
        password_txt.value = ''

    if profile.logged_in:
        cols = profile.getSchemaFromDataSource()
        dependent_var_drp.options = ['none'] + [c['name'] for c in cols]
        time_series_x_drp.options = ['none'] + [c['name'] for c in cols if c['type'] in ('DECIMAL', 'LONG', 'DOUBLE', 'DATE', 'DATETIME')]
        profile.cols = cols
        dependent_var_drp.disabled = False
        time_series_x_drp.disabled = False
        start_btn.disabled = False
    
login_form = VBox([
    HTML(value=heading.replace('TEXT', 'Login')),
    login_type_rad,
    instance_box,
    email_box,
    password_box,
    dataset_id_box,
    login_box
])

login_btn.on_click(run_login_form, False)

def on_login_type(change):
    if change['name'] == 'value':
        if change['new'] == 'Direct':
            password_lbl.value = 'Password'
            email_box.layout = lbl_input_layout
        else:
            password_lbl.value = 'Session Cookie'
            email_box.layout = hidden
            
login_type_rad.observe(on_login_type)




page_name_txt = Text()
page_name_box = HBox([Label('Page Name', layout=lbl_layout), page_name_txt], layout=lbl_input_layout)
tags_txt = Text(placeholder='tag one, tag two')
tags_box = HBox([Label('Tags (optional)', layout=lbl_layout), tags_txt], layout=lbl_input_layout)
dependent_var_drp = Dropdown(options=['none'], disabled=True)
dependent_var_box = HBox([Label('Dependent Variable', layout=lbl_layout), dependent_var_drp], layout=lbl_input_layout)
positive_label_drp = Dropdown(options=['none'], disabled=True)
positive_label_box = HBox([Label('Positive Label', layout=lbl_layout), positive_label_drp], layout=lbl_input_layout)
prediction_type_drp = Dropdown(options=['none', 'Numeric', 'Categorical'], disabled=True)
prediction_type_box = HBox([Label('Inference Type', layout=lbl_layout), prediction_type_drp], layout=lbl_input_layout)
time_series_x_drp = Dropdown(options=['none'], disabled=True)
time_series_x_box = HBox([Label('Time Series X', layout=lbl_layout), time_series_x_drp], layout=lbl_input_layout)

dvar_true_int_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
fe_flow_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
valid_flow_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
model_flow_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
bias_profile_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
scat_profile_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
box_profile_chk = Checkbox(indent=False, layout=Layout(width='50px'), disabled=True)
hist_profile_chk = Checkbox(indent=False, layout=Layout(width='50px'))
time_profile_chk = Checkbox(indent=False, layout=Layout(width='50px'))
correlation_flow_chk = Checkbox(indent=False, layout=Layout(width='50px'))

def on_change_time_series_x(change):
    if change['type'] == 'change' and change['name'] == 'value':
        if time_series_x_drp.value == 'none':
            time_profile_chk.value = False
        else:
            time_profile_chk.value = True

def on_change_dvar(change):
    if change['type'] == 'change' and change['name'] == 'value':
        if dependent_var_drp.value == 'none':
            prediction_type_drp.disabled = True
            prediction_type_drp.value = 'none'
            fe_flow_chk.disabled = True
            fe_flow_chk.value = False
            scat_profile_chk.disabled = True
            scat_profile_chk.value = False
            model_flow_chk.disabled = True
            model_flow_chk.value = False
            valid_flow_chk.disabled = True
            valid_flow_chk.value = False
        else:
            prediction_type_drp.disabled = False
            fe_flow_chk.disabled = False
            model_flow_chk.disabled = False
            scat_profile_chk.disabled = False
            valid_flow_chk.disabled = False
            
            if prediction_type_drp.value == 'Categorical':
                positive_label_drp.options = profile.get_unique_values(dependent_var_drp.value)
        
def on_change_ptype(change):
    if change['type'] == 'change' and change['name'] == 'value':
        if prediction_type_drp.value == 'none':
            positive_label_drp.disabled = True
            dvar_true_int_chk.disabled = True
            dvar_true_int_chk.value = False
            box_profile_chk.disabled = True
            box_profile_chk.value = False
            positive_label_drp.options = ['none']
        else:
            box_profile_chk.disabled = False
            if prediction_type_drp.value == 'Categorical':
                positive_label_drp.disabled = False
                dvar_true_int_chk.disabled = False
                positive_label_drp.options = profile.get_unique_values(dependent_var_drp.value)
            else:
                positive_label_drp.disabled = True
                dvar_true_int_chk.disabled = True
                dvar_true_int_chk.value = False
                positive_label_drp.options = ['none']

def on_change_page_name(change):
    if change['type'] == 'change' and change['name'] == 'value':
        if page_name_txt.value == '':
            hist_profile_chk.disabled = True
        else:
            hist_profile_chk.disabled = False
        
page_name_txt.observe(on_change_page_name)
time_series_x_drp.observe(on_change_time_series_x)
dependent_var_drp.observe(on_change_dvar)
prediction_type_drp.observe(on_change_ptype)

start_btn = Button(description='Run Profiler', disabled=True)
start_box = HBox([Label('', layout=lbl_layout), start_btn], layout=lbl_input_layout3)
scaffolding_out = Output()

@scaffolding_out.capture()
def run_scaffolding_form(_):
    scaffolding_out.clear_output()
    with scaffolding_out:
        # dvar_true = int(positive_label_drp.value) if dvar_true_int_chk.value else positive_label_drp.value
        dvar_true = positive_label_drp.value
        if positive_label_drp.value != 'none':
            col = [c for c in profile.cols if c['name'] == dependent_var_drp.value][0]
            if col['type'] == 'LONG':
                dvar_true = int(positive_label_drp.value)
            elif col['type'] in ('DECIMAL', 'DOUBLE'):
                dvar_true = float(positive_label_drp.value)
        else:
            dvar_true = positive_label_drp.value

        tags = list(map(str.strip, tags_txt.value.split(','))) if tags_txt else ['DataProfiler']
            
        # print(dvar_true)
        # print(type(dvar_true))
        # return
            
        profile.scaffolding(page_name=page_name_txt.value,
                            tags=tags,
                            fe_flow=fe_flow_chk.value,
                            validation_flow=valid_flow_chk.value,
                            correlation_flow=correlation_flow_chk.value,
                            model_flow=model_flow_chk.value,
                            hist_profile=hist_profile_chk.value,
                            scat_profile=scat_profile_chk.value,
                            box_profile=box_profile_chk.value,
                            dependent_var=dependent_var_drp.value,
                            dependent_var_true=dvar_true,
                            time_series_x=time_series_x_drp.value,
                            prediction_type=prediction_type_drp.value
                           )

start_btn.on_click(run_scaffolding_form, False)

scaffolding_form = VBox([
    HTML(value=heading.replace('TEXT', 'General')),
    page_name_box,
    tags_box,
    HBox([
        Label('Correlation Profile', layout=lbl_layout), correlation_flow_chk, 
        Label('Histogram Profile', layout=lbl_layout2), hist_profile_chk
    ], layout=Layout(padding='5px 0 0 0')),
    
    HTML(value=heading.replace('TEXT', 'Inference')),
    dependent_var_box,
    prediction_type_box,
    HBox([
        Label('Feature Engineering Flow', layout=lbl_layout), fe_flow_chk,
        Label('Scatterplot Profile', layout=lbl_layout2), scat_profile_chk,
    ], layout=Layout(padding='5px 0 0 0')),
    HBox([
        Label('Validation Flow', layout=lbl_layout), valid_flow_chk,
        Label('Boxplot Profile', layout=lbl_layout2), box_profile_chk,
    ]),
    HBox([
        Label('Model Flow', layout=lbl_layout), model_flow_chk,
    ]),
    
    HTML(value=heading.replace('TEXT', 'Bias Profile (categorical only)')),
    HTML(value='<p style="font-size: 0.8rem; margin: 0 0 10px 90px; width: 450px;">If you\'d like to run the bias profile, select the value from the dependent variable column that indicates the "positive label"</p>'),
    positive_label_box,
    
    HTML(value=heading.replace('TEXT', 'Other')),
    HTML(value='<p style="font-size: 0.8rem; margin: 0 0 10px 90px; width: 450px;">If you\'d like to create time-series plots, select the column to use as the X-axis (you can only select datetime/numeric columns)</p>'),
    HBox([
        Label('Time Series Plot X-Axis', layout=lbl_layout),
        time_series_x_drp,
    ]),
#     HBox([
#         Label('Dataflow Timeout (sec)', layout=lbl_layout),
#         Text(placeholder='ex: 300'),
#     ]),
    
    start_box
])

# FOR DEBUGGING (so you don't have to keep adding these fields)
# instance_txt.value = ''
# email_txt.value = ''
# dataset_id_txt.value = ''

def display_gui():
    display(login_form)
    print('\n')
    display(login_out)

    print()

    display(scaffolding_form)
    print('\n')
    display(scaffolding_out)
    
display_gui()

VBox(children=(HTML(value='<p style="font-size: 1.2rem; margin: 50px 0 20px 70px; width: 450px; border-bottom:…





Output()




VBox(children=(HTML(value='<p style="font-size: 1.2rem; margin: 50px 0 20px 70px; width: 450px; border-bottom:…





Output()