In [1]:
from aircloak_tools import explorer

### Implicitly uses the defaults from the aircloak-tools package:
#
# Note: this requires the AIRCLOAK_API_KEY environment variable to be set to
# a valid API key that has access to the datasets used below.
#
# API_KEY = os.environ["AIRCLOAK_API_KEY"]
# ATTACK_SERVER_API_URL = "https://attack.aircloak.com/api"
# EXPLORER_DEFAULT_URL = "http://localhost"
# EXPLORER_DEFAULT_PORT = 5000
#
###

# Create the session object that `metrics_from_test_case` later passes to
# `explorer.explore`. No arguments are given, so the package defaults apply.
session = explorer.explorer_session()


In [2]:
# import metrics plotting helpers
from metrics_plot import MetricsPlot

# Named exploration targets, keyed by a short label.
# Every value is a (dataset, table, [columns]) triple.
TEST_CASES = {
    "integer": ("gda_banking", "loans", ["amount"]),
    "integer_cat": ("gda_banking", "loans", ["duration"]),
    "real": ("gda_banking", "loans", ["payments"]),
    "real1": (
        "GiveMeSomeCredit",
        "loans",
        ["RevolvingUtilizationOfUnsecuredLines"],
    ),
    "real2": ("GiveMeSomeCredit", "loans", ["DebtRatio"]),
    "text_cat": ("gda_banking", "loans", ["status"]),
    "bool_cat": ("GiveMeSomeCredit", "loans", ["SeriousDlqin2yrs"]),
    "datetime": ("gda_taxi", "rides", ["pickup_datetime"]),
    "email": ("gda_banking", "clients", ["email"]),
    "cov_clear": ("cov_clear", "survey", ["how_bad_feel"]),
}

# a helper for querying and extracting metrics for a single column
def metrics_from_test_case(test_case, **kwargs):
    """Run the explorer for one named test case and wrap its column metrics.

    Args:
        test_case: A key of ``TEST_CASES``. The associated tuple is unpacked
            as the positional arguments following ``session`` in the call to
            ``explorer.explore``.
        **kwargs: Accepted so existing call sites keep working; currently
            not used by this helper.

    Returns:
        A ``MetricsPlot`` built from the ``'metrics'`` entry of the single
        column in the exploration result.

    Raises:
        ValueError: If ``test_case`` is not a key of ``TEST_CASES``.
        RuntimeError: If ``explorer.explore`` raises; the underlying error is
            chained as ``__cause__``.
        AssertionError: If the result status is not ``'Complete'`` or the
            result does not contain exactly one column.
    """
    if test_case not in TEST_CASES:
        raise ValueError(f"unknown test case '{test_case}'")

    try:
        result = explorer.explore(session, *TEST_CASES[test_case])
    except Exception as err:
        # Surface the failure with context; otherwise execution would fall
        # through to the checks below with `result` never assigned.
        raise RuntimeError(
            f"exploration of test case '{test_case}' failed"
        ) from err

    assert result['status'] == 'Complete', (
        f"exploration of '{test_case}' ended with status {result['status']!r}"
    )
    assert len(result['columns']) == 1, (
        f"expected exactly one column for '{test_case}', "
        f"got {len(result['columns'])}"
    )

    return MetricsPlot(result['columns'][0]['metrics'])


In [3]:
# Explore the 'integer' test case (gda_banking / loans / amount), keep the
# resulting MetricsPlot in `int_metrics`, then draw its numeric histogram.
int_metrics = metrics_from_test_case('integer')
int_metrics.plot_numeric_histogram()

In [4]:
# 'integer_cat' test case (gda_banking / loans / duration): distinct-values plot.
metrics_from_test_case('integer_cat').plot_distinct_values()

In [5]:
# 'real' test case (gda_banking / loans / payments): numeric histogram.
metrics_from_test_case("real").plot_numeric_histogram()

In [6]:
# 'text_cat' test case (gda_banking / loans / status): distinct-values plot.
metrics_from_test_case('text_cat').plot_distinct_values()


In [7]:
# 'bool_cat' test case (GiveMeSomeCredit / loans / SeriousDlqin2yrs):
# distinct-values plot.
metrics_from_test_case('bool_cat').plot_distinct_values()

In [8]:
# Explore the 'datetime' test case (gda_taxi / rides / pickup_datetime) once;
# the result is stored so that later cells can plot from it without re-querying.
datetime_metrics = metrics_from_test_case('datetime')

In [9]:
# Cyclical datetime plot for the stored 'datetime' metrics.
# NOTE(review): 'minute' presumably selects the time component — confirm
# against MetricsPlot.plot_cyclical_datetimes.
datetime_metrics.plot_cyclical_datetimes('minute')


In [10]:
# Linear datetime plot for the stored 'datetime' metrics.
# NOTE(review): 'minute' presumably selects the time component — confirm
# against MetricsPlot.plot_linear_datetimes.
datetime_metrics.plot_linear_datetimes('minute')

In [11]:
# Explore the 'email' test case (gda_banking / clients / email) and check
# that the 'is_email' metric reports the column as containing email addresses.
email_data = metrics_from_test_case('email')
# Plain truthiness check (PEP 8 discourages `== True`), with a message so a
# failing run explains itself.
assert email_data.metrics['is_email']['isEmail'], (
    "the 'is_email' metric did not flag the 'email' column as email"
)

In [12]:
# Print every entry of the 'sample_values' metric on its own line.
for address in email_data.metrics['sample_values']:
    print(address)

ultiscineangna.nismag@eu.org
conlesenebi@eu.co.uk
odioscise.uamctuedula@orciUtsagittis.com
Suapitate@at.net
PraeProie.dutues@arcu.co.uk
Pras.seding@mauris.net
Nullnonuerallco.o@Nunc.edu
Maeo.fais@facilisis.com
Doniduduntamerisla@egestas.ca
rutuetllaloruamnar@ac.org
nulonumo@Sed.edu
Pells.coictent@Sed.edu
ipsem@sit.co.uk
comleoadiestiauripsumibut@Nunc.com
enimat@sem.com
velor.ormau@Sed.co.uk
quisalesci@feugiat.net
Inteobougit@ac.co.uk
Susps@malesuada.co.uk
risantstiqlaca.esque@in.net


In [13]:
# Simple histogram of the 'text.length.values' metric from the email exploration.
# NOTE(review): presumably the distribution of string lengths — confirm
# against the explorer's metric definitions.
email_data.plot_simple_histogram('text.length.values')

In [14]:
# Explore the 'cov_clear' test case (cov_clear / survey / how_bad_feel), keep
# the result in `cov_metrics`, then draw its distinct-values plot.
cov_metrics = metrics_from_test_case('cov_clear')
cov_metrics.plot_distinct_values()

In [15]:
# Bare last expression: shows the `metrics_raw` attribute as the cell output
# so the unprocessed metric entries can be inspected.
cov_metrics.metrics_raw

[{'name': 'max', 'value': 11.0},
 {'name': 'distinct.value_count', 'value': 4063},
 {'name': 'distinct.null_count', 'value': 2500},
 {'name': 'distinct.values',
  'value': [{'value': None, 'count': 2500},
   {'value': 3, 'count': 251},
   {'value': 6, 'count': 230},
   {'value': 7, 'count': 213},
   {'value': 4, 'count': 203},
   {'value': 5, 'count': 199},
   {'value': 2, 'count': 158},
   {'value': 8, 'count': 126},
   {'value': 0, 'count': 63},
   {'value': 1, 'count': 54},
   {'value': '--OTHER--', 'count': 66}]},
 {'name': 'distinct.is_categorical', 'value': True},
 {'name': 'sample_values',
  'value': [7,
   None,
   5,
   1,
   None,
   4,
   None,
   None,
   None,
   2,
   None,
   None,
   None,
   None,
   None,
   3,
   None,
   None,
   None,
   3]},
 {'name': 'distinct.suppressed_count', 'value': 0},
 {'name': 'exploration_info',
  'value': {'dataSource': 'cov_clear',
   'table': 'survey',
   'column': 'how_bad_feel',
   'columnType': 'integer'}},
 {'name': 'min', 'value'