/
build_gallery.py
462 lines (414 loc) · 17.9 KB
/
build_gallery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
import ast
import importlib
import json
import logging
import os
import re
import sys
import traceback
from io import StringIO
from subprocess import CalledProcessError, CompletedProcess, run
from typing import Dict, List
import click
import pkg_resources
# Module logger: accepts everything from DEBUG upwards.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Console handler that echoes records to stdout with timestamps shaped like
# 2024-01-31T12:00:00.
chandler = logging.StreamHandler(sys.stdout)
chandler.setFormatter(
    logging.Formatter(
        fmt="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S",
    )
)
chandler.setLevel(logging.DEBUG)
logger.addHandler(chandler)

# In-memory buffers filled by build_gallery() and read back by main(), which
# prints them and saves them to gallery-errors.txt / checklists.txt.
expectation_tracebacks = StringIO()
expectation_checklists = StringIO()
def execute_shell_command(command: str) -> int:
    """
    Run ``command`` through ``bash -c`` and report its exit status.

    This is a convenience wrapper used for pip installing/uninstalling dependencies. The command runs in the
    current working directory with that directory appended to ``PATH``. Output is captured: on success the
    stripped stdout is logged at INFO level; on failure the captured stdout AND stderr are echoed to this
    process's stderr and the error (with traceback) is logged.

    NOTE: ``command`` is interpreted by bash, so it must never be assembled from untrusted text.

    :param command: bash command -- as if typed in a shell/Terminal window
    :return: status code -- 0 if successful; all other values (1 is the most common) indicate an error
    """
    cwd: str = os.getcwd()
    # Append the working directory to PATH so executables located there can be resolved by the command.
    path_env_var: str = os.pathsep.join([os.environ.get("PATH", os.defpath), cwd])
    env: dict = dict(os.environ, PATH=path_env_var)

    status_code: int = 0
    try:
        # stdout/stderr are deliberately not passed: capture_output=True already captures both, and the
        # original author noted that supplying them as well caused problems on Python versions before 3.7.4.
        res: CompletedProcess = run(
            args=["bash", "-c", command],
            capture_output=True,
            shell=False,
            cwd=cwd,
            check=True,  # a non-zero exit raises CalledProcessError, handled below
            env=env,
            text=True,
        )
        sh_out: str = res.stdout.strip()
        logger.info(sh_out)
    except CalledProcessError as cpe:
        status_code = cpe.returncode
        # Echo everything the failed command printed. Diagnostics usually land on stderr, so forwarding
        # only stdout (cpe.output) would hide the actual reason for the failure.
        for captured in (cpe.output, cpe.stderr):
            if captured:
                sys.stderr.write(captured)
        sys.stderr.flush()
        exception_message: str = "A Sub-Process call Exception occurred.\n"
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(cpe).__name__}: "{str(cpe)}". Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)

    return status_code
def get_contrib_requirements(filepath: str) -> Dict:
    """
    Parse the python file from filepath to identify a "library_metadata" dictionary in any defined classes, and
    return a requirements_info object that includes a list of pip-installable requirements for each class that
    defines them.

    The file is only parsed (via ``ast``), never imported or executed. Both plain (``library_metadata = {...}``)
    and annotated (``library_metadata: dict = {...}``) assignments are recognised.

    Note, currently we are handling all dependencies at the module level. To support future expandability and
    detail, this method also returns per-class requirements in addition to the concatenated list.

    Args:
        filepath: the path to the file to parse and analyze

    Returns:
        A dictionary:
        {
            "requirements": [ all_requirements_found_in_any_library_metadata_in_file ],
            class_name: [ requirements ]
        }
        Only top-level classes that assign "library_metadata" get a class_name entry.

    Raises:
        SyntaxError: if the file is not valid Python.
        ValueError: if a "library_metadata" value is not a plain literal (raised by ast.literal_eval).
    """
    # Read raw bytes so ast.parse applies the source file's own encoding rules (UTF-8 by default, or a
    # declared coding cookie) instead of the platform's locale-dependent default text encoding.
    with open(filepath, "rb") as file:
        tree = ast.parse(file.read(), filename=filepath)

    requirements_info = {"requirements": []}
    for child in ast.iter_child_nodes(tree):
        if not isinstance(child, ast.ClassDef):
            continue
        for node in ast.walk(child):
            if isinstance(node, ast.Assign):
                targets = node.targets
            elif isinstance(node, ast.AnnAssign) and node.value is not None:
                targets = [node.target]
            else:
                continue

            # Only simple names count; tuple/attribute/subscript targets carry no ``id`` and are skipped
            # individually rather than discarding the whole assignment.
            target_ids = {target.id for target in targets if isinstance(target, ast.Name)}
            if "library_metadata" not in target_ids:
                continue

            library_metadata = ast.literal_eval(node.value)
            requirements = library_metadata.get("requirements", [])
            # A single requirement may be given as a bare string instead of a list.
            if isinstance(requirements, str):
                requirements = [requirements]
            requirements_info[child.name] = requirements
            requirements_info["requirements"] += requirements

    return requirements_info
def _record_failure(expectation: str, group: str, label: str = "") -> None:
    """Print the active exception's traceback and append it, under a header, to expectation_tracebacks.

    Must be called from inside an ``except`` block so that traceback.format_exc() has an exception to format.
    """
    formatted = traceback.format_exc()
    print(formatted)
    expectation_tracebacks.write(
        f"\n\n----------------\n{label}{expectation} ({group})\n"
    )
    expectation_tracebacks.write(formatted)


def build_gallery(
    include_core: bool = True,
    include_contrib: bool = True,
    only_these_expectations: List[str] = [],  # only read, never mutated
) -> Dict:
    """
    Build the gallery object by running diagnostics for each Expectation and returning the resulting reports.

    Side effects visible in this function: missing requirements declared by contrib Expectations are pip
    installed before their diagnostics run and uninstalled again at the end; contrib package directories are
    appended to sys.path; failures and consistency notes are appended to the module-level
    ``expectation_tracebacks`` buffer and checklists to ``expectation_checklists``.

    Args:
        include_core: if true, include Expectations defined in the core module
        include_contrib: if true, include Expectations defined in contrib:
        only_these_expectations: list of specific Expectations to include

    Returns:
        A dict mapping each Expectation name whose diagnostics ran and serialized successfully to the
        result of ``diagnostics.to_json_dict()``.
    """
    gallery_info = dict()
    requirements_dict = {}
    logger.info("Loading great_expectations library.")
    installed_packages = pkg_resources.working_set
    installed_packages_txt = sorted(f"{i.key}=={i.version}" for i in installed_packages)
    logger.debug(f"Found the following packages: {installed_packages_txt}")

    import great_expectations

    # Snapshot taken before any contrib module is imported; treated as the "core" set below.
    core_expectations = (
        great_expectations.expectations.registry.list_registered_expectation_implementations()
    )
    if include_core:
        print("\n\n\n=== (Core) ===")
        logger.info("Getting base registered expectations list")
        logger.debug(f"Found the following expectations: {sorted(core_expectations)}")
        for expectation in core_expectations:
            requirements_dict[expectation] = {"group": "core"}

    just_installed = set()
    failed_to_import_set = set()

    if include_contrib:
        print("\n\n\n=== (Contrib) ===")
        logger.info("Finding contrib modules")
        skip_dirs = ("cli", "tests")
        contrib_dir = os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "contrib",
        )

        for root, dirs, files in os.walk(contrib_dir):
            # Pruning ``dirs`` in place stops os.walk from descending into the skipped directories.
            for dirname in skip_dirs:
                if dirname in dirs:
                    dirs.remove(dirname)
            if "expectations" in dirs:
                if root.endswith("great_expectations_experimental"):
                    sys.path.append(root)
                else:
                    # A package in contrib that may contain more Expectations
                    sys.path.append(os.path.dirname(root))
            for filename in files:
                if filename.endswith(".py") and filename.startswith("expect_"):
                    logger.debug(f"Getting requirements for module {filename}")
                    # The directory two levels above the file names the group (package) it belongs to.
                    contrib_subdir_name = os.path.basename(os.path.dirname(root))
                    requirements_dict[filename[:-3]] = get_contrib_requirements(
                        os.path.join(root, filename)
                    )
                    requirements_dict[filename[:-3]]["group"] = contrib_subdir_name

    for expectation in sorted(requirements_dict):
        if only_these_expectations:
            if expectation not in only_these_expectations:
                continue
        group = requirements_dict[expectation]["group"]
        print(f"\n\n\n=== {expectation} ({group}) ===")
        requirements = requirements_dict[expectation].get("requirements", [])
        parsed_requirements = pkg_resources.parse_requirements(requirements)
        for req in parsed_requirements:
            is_satisfied = any(
                installed_pkg in req for installed_pkg in installed_packages
            )
            if is_satisfied or req in just_installed:
                continue
            # NOTE(review): ``req`` originates from library_metadata in contrib files and is interpolated
            # into a shell command; this is only safe while contrib sources are trusted.
            logger.debug(f"Executing command: 'pip install \"{req}\"'")
            status_code = execute_shell_command(f'pip install "{req}"')
            if status_code == 0:
                just_installed.add(req)
            else:
                expectation_tracebacks.write(
                    f"\n\n----------------\n{expectation} ({group})\n"
                )
                expectation_tracebacks.write(f"Failed to pip install {req}\n\n")

        if group != "core":
            logger.debug(f"Importing {expectation}")
            try:
                if group == "great_expectations_experimental":
                    importlib.import_module(f"expectations.{expectation}", group)
                else:
                    importlib.import_module(f"{group}.expectations")
            except ModuleNotFoundError:
                logger.error(f"Failed to load expectation: {expectation}")
                _record_failure(expectation, group)
                failed_to_import_set.add(expectation)
                continue

        logger.debug(f"Running diagnostics for expectation: {expectation}")
        try:
            # The lookup lives inside the try so that a failure here (presumably when the file name does
            # not match any registered Expectation name -- confirm against the registry) is recorded like
            # any other diagnostics failure instead of aborting the whole run before the name-mismatch
            # report further down can be produced.
            impl = great_expectations.expectations.registry.get_expectation_impl(
                expectation
            )
            diagnostics = impl().run_diagnostics()
            checklist_string = diagnostics.generate_checklist()
            expectation_checklists.write(
                f"\n\n----------------\n{expectation} ({group})\n"
            )
            expectation_checklists.write(f"{checklist_string}\n")
            if diagnostics["description"]["docstring"]:
                diagnostics["description"]["docstring"] = format_docstring_to_markdown(
                    diagnostics["description"]["docstring"]
                )
        except Exception:
            # Deliberately broad: one broken Expectation must not stop the rest of the gallery build.
            logger.error(f"Failed to run diagnostics for: {expectation}")
            _record_failure(expectation, group)
        else:
            try:
                gallery_info[expectation] = diagnostics.to_json_dict()
            except TypeError:
                logger.error(f"Failed to create JSON for: {expectation}")
                _record_failure(expectation, group, label="[JSON write fail] ")

    if just_installed:
        print("\n\n\n=== (Uninstalling) ===")
        logger.info(
            "Uninstalling packages that were installed while running this script..."
        )
        for req in just_installed:
            logger.debug(f"Executing command: 'pip uninstall -y \"{req}\"'")
            execute_shell_command(f'pip uninstall -y "{req}"')

    # Cross-check the names derived from file names against the names actually registered.
    expectation_filenames_set = set(requirements_dict.keys())
    full_registered_expectations_set = set(
        great_expectations.expectations.registry.list_registered_expectation_implementations()
    )
    if only_these_expectations:
        registered_expectations_set = (
            set(only_these_expectations) & full_registered_expectations_set
        )
        expectation_filenames_set = (
            set(only_these_expectations) & expectation_filenames_set
        )
    elif not include_core:
        registered_expectations_set = full_registered_expectations_set - set(
            core_expectations
        )
    else:
        registered_expectations_set = full_registered_expectations_set
    non_matched_filenames = (
        expectation_filenames_set - registered_expectations_set - failed_to_import_set
    )
    if failed_to_import_set:
        expectation_tracebacks.write("\n\n----------------\n(Not a traceback)\n")
        expectation_tracebacks.write("Expectations that failed to import:\n")
        for expectation in sorted(failed_to_import_set):
            expectation_tracebacks.write(f"- {expectation}\n")
    if non_matched_filenames:
        expectation_tracebacks.write("\n\n----------------\n(Not a traceback)\n")
        expectation_tracebacks.write(
            "Expectation filenames that don't match their defined Expectation name:\n"
        )
        for fname in sorted(non_matched_filenames):
            expectation_tracebacks.write(f"- {fname}\n")

        bad_names = sorted(registered_expectations_set - expectation_filenames_set)
        expectation_tracebacks.write(
            "\nRegistered Expectation names that don't match:\n"
        )
        for exp_name in bad_names:
            expectation_tracebacks.write(f"- {exp_name}\n")

    if include_core and not only_these_expectations:
        core_dir = os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "great_expectations",
            "expectations",
            "core",
        )
        core_expectations_filename_set = {
            fname.rsplit(".", 1)[0]
            for fname in os.listdir(core_dir)
            if fname.startswith("expect_")
        }
        core_expectations_not_in_gallery = core_expectations_filename_set - set(
            core_expectations
        )
        if core_expectations_not_in_gallery:
            expectation_tracebacks.write("\n\n----------------\n(Not a traceback)\n")
            expectation_tracebacks.write(
                "Core Expectation files not included in core_expectations:\n"
            )
            for exp_name in sorted(core_expectations_not_in_gallery):
                expectation_tracebacks.write(f"- {exp_name}\n")

    return gallery_info
def format_docstring_to_markdown(docstr: str) -> str:
    """
    Add markdown formatting to a provided docstring

    Rules applied line by line:
      * a line consisting of "::" opens a fenced code block; the block ends at the next blank line (a
        blank line directly after "::" is skipped) or at the end of the docstring;
      * lines inside a code block are emitted verbatim, minus the indentation of the block's first line;
      * outside code blocks, a line ending in ":" becomes an H4 header and starts a parameter list;
      * inside a parameter list, lines containing ":" (ignoring ":ref:") become "- " bullet items;
      * a blank line ends the current parameter list;
      * any other line is emitted stripped of surrounding whitespace.

    Args:
        docstr: the original docstring that needs to be converted to markdown.

    Returns:
        str of Docstring formatted as markdown
    """
    whitespace_run = re.compile(r"\s\s+", re.MULTILINE)
    clean_docstr_list = []
    prev_line = None
    in_code_block = False
    in_param = False
    first_code_indentation = None

    # Parse each line to determine if it needs formatting
    for original_line in docstr.split("\n"):
        # Remove excess spaces from lines formed by concatenated docstring lines.
        line = whitespace_run.sub(" ", original_line)
        stripped = line.strip()

        if stripped == "::":
            # In some old docstrings, this indicates the start of an example block.
            in_code_block = True
            clean_docstr_list.append("```")
        elif in_code_block and stripped:
            # Code lines are handled before the header rule so that code ending in ':' (e.g. "if x:")
            # is not mistaken for a section header.
            if first_code_indentation is None:
                # Remember the indentation of the first line of code so it can be removed from all lines
                # in the block without wrecking the relative indentation of later lines.
                first_code_indentation = len(
                    re.match(r"\s*", original_line, re.UNICODE).group(0)
                )
            clean_docstr_list.append(original_line[first_code_indentation:])
        elif stripped.endswith(":"):
            # All of our parameter/arg/etc lists start after a line ending in ':'.
            in_param = True
            # This adds a blank line before the header if one doesn't already exist.
            if prev_line != "":
                clean_docstr_list.append("")
            # Turn the line into an H4 header
            clean_docstr_list.append(f"#### {stripped}")
        elif stripped == "" and prev_line != "::":
            # A blank line ends a parameter group and also closes an open code block. Some code blocks
            # start with a blank first line, which must not close the block immediately (hence the test
            # for '::' on the previous line).
            in_param = False
            if in_code_block:
                clean_docstr_list.append("```")
            in_code_block = False
            first_code_indentation = None
            clean_docstr_list.append(line)
        elif in_code_block:
            # Only reachable for a blank line directly after '::' -- skip it.
            pass
        elif ":" in line.replace(":ref:", "") and in_param:
            # This indicates a parameter, arg, or other definition.
            clean_docstr_list.append(f"- {stripped}")
        else:
            # This indicates a regular line of text.
            clean_docstr_list.append(stripped)
        prev_line = stripped

    # A code block that runs to the very end of the docstring has no blank line to close it; emit the
    # closing fence here so the remainder of the rendered page is not swallowed by the code block.
    if in_code_block:
        clean_docstr_list.append("```")

    return "\n".join(clean_docstr_list)
@click.command()
@click.option(
    "--no-core",
    "-C",
    "no_core",
    is_flag=True,
    default=False,
    help="Do not include core Expectations",
)
@click.option(
    "--no-contrib",
    "-c",
    "no_contrib",
    is_flag=True,
    default=False,
    help="Do not include contrib/package Expectations",
)
@click.argument("args", nargs=-1)
def main(**kwargs):
    """Find all Expectations, run their diagnostics methods, and generate expectation_library_v2.json
    - args: snake_name of specific Expectations to include (useful for testing)
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is kept verbatim.
    # kwargs holds the parsed CLI values: "no_core", "no_contrib" and the positional "args".
    gallery_info = build_gallery(
        include_core=not kwargs["no_core"],
        include_contrib=not kwargs["no_contrib"],
        only_these_expectations=kwargs["args"],
    )

    # build_gallery() fills these module-level buffers as a side effect.
    tracebacks = expectation_tracebacks.getvalue()
    checklists = expectation_checklists.getvalue()

    # Report files are written as UTF-8 explicitly: tracebacks and checklists are free-form text, and
    # the locale-default encoding may be unable to represent every character they contain.
    if tracebacks != "":
        print("\n\n\n" + "#" * 30 + " T R A C E B A C K S " + "#" * 30 + "\n")
        print(tracebacks)
        print(
            "\n\n" + "#" * 30 + " E N D T R A C E B A C K S " + "#" * 30 + "\n\n"
        )
        with open("./gallery-errors.txt", "w", encoding="utf-8") as outfile:
            outfile.write(tracebacks)
    if checklists != "":
        print(checklists)
        with open("./checklists.txt", "w", encoding="utf-8") as outfile:
            outfile.write(checklists)

    # The gallery JSON is always written, even when it is empty.
    with open("./expectation_library_v2.json", "w", encoding="utf-8") as outfile:
        json.dump(gallery_info, outfile, indent=4)


if __name__ == "__main__":
    main()