/
export_runs.py
68 lines (55 loc) · 2.13 KB
/
export_runs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
import os
import pandas as pd
from domino import Domino
# Directory that receives the export files written by this script.
output_dir = "results"

# connect to domino; be sure to have these environment variables set
# (runs inside a Domino executor automatically set these for you)
domino = Domino(
    "nick/winequality",
    api_key=os.environ["DOMINO_USER_API_KEY"],
    host=os.environ["DOMINO_API_HOST"],
)

# the run records are read from the "data" entry of the runs_list() response
raw_runs = domino.runs_list()["data"]

# print number of runs to STDOUT
print(f"Details of {len(raw_runs)} runs received")

# create results directory if it doesn't exist
# (exist_ok avoids the check-then-create race of a separate os.path.exists test)
os.makedirs(output_dir, exist_ok=True)

# write the raw API output to all_runs.json; the context manager guarantees
# the file is flushed and closed even if serialisation fails
with open("{0}/all_runs.json".format(output_dir), "w") as f:
    f.write(json.dumps(raw_runs))
# collect all runs into an array of dictionaries
# NOTE: each run dict is modified in place, so the entries of raw_runs are
# flattened as well (all_runs holds the same dict objects).
all_runs = []
for run in raw_runs:
    # flatten diagnosticStatistics
    # (which are stored in a nested array of dictionaries)
    # pop() removes the nested entry - every value is extracted from it below -
    # and tolerates runs where the key is missing entirely
    diagnostics = run.pop("diagnosticStatistics", None)
    if diagnostics is not None:
        # "data" may be absent or null; treat both as "no statistics"
        for stat in diagnostics.get("data") or []:
            stat_key = "diagnosticStatistics.{0}".format(stat["key"])
            run[stat_key] = stat["value"]
        run["diagnosticStatistics.isError"] = diagnostics.get("isError")
    # add run to array
    all_runs.append(run)
# create a dataframe with the flattened data
all_runs_df = pd.DataFrame(all_runs)

# epoch-millisecond timestamp fields read from each run record
timestamp_fields = ["queued", "started", "completed", "postProcessedTimestamp"]

# make sure every timestamp column exists: with zero runs (or a field missing
# from every run) the frame would lack the column and the lookups below would
# raise KeyError; NaN converts to NaT and propagates through the subtractions
for field in timestamp_fields:
    if field not in all_runs_df.columns:
        all_runs_df[field] = float("nan")

# convert epoch timestamps to human-readable format 'YYYY-MM-DD HH:MM:SS.SSS'
for field in timestamp_fields:
    all_runs_df["{0}_human".format(field)] = pd.to_datetime(
        all_runs_df[field], unit="ms"
    )

# calculate some metrics in milliseconds
all_runs_df["millisecondsInQueue"] = all_runs_df["started"] - all_runs_df["queued"]
all_runs_df["millisecondsInExecution"] = (
    all_runs_df["completed"] - all_runs_df["started"]
)

# write dataframe to a CSV
all_runs_df.to_csv("{0}/all_runs.csv".format(output_dir), index=False)
# report where the two export files were written
# (adjacent string literals are joined at compile time, so .format() applies
# to the whole message)
print(
    (
        "Finished exporting run information to {0}/all_runs.json "
        "and {0}/all_runs.csv"
    ).format(output_dir)
)