Skip to content

Commit

Permalink
chore(ingest/presto-on-hive) updated the method to ignore certain properties to check
Browse files Browse the repository at this point in the history
  • Loading branch information
dushayntAW committed May 15, 2024
1 parent d52ce8f commit f8b73a8
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 7 deletions.
12 changes: 12 additions & 0 deletions metadata-ingestion/src/datahub/testing/compare_metadata_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pathlib
import pprint
import re
import shutil
import tempfile
from typing import Any, Dict, List, Sequence, Union
Expand Down Expand Up @@ -40,6 +41,7 @@ def assert_metadata_files_equal(
update_golden: bool,
copy_output: bool,
ignore_paths: Sequence[str] = (),
ignore_paths_v2: Sequence[str] = (),
ignore_order: bool = True,
) -> None:
golden_exists = os.path.isfile(golden_path)
Expand Down Expand Up @@ -70,6 +72,16 @@ def assert_metadata_files_equal(
logger.info(f"Error reformatting golden file as MCP/MCEs: {e}")
golden = load_json_file(golden_path)

if ignore_paths_v2:
golden_json = load_json_file(golden_path)
for i, obj in enumerate(golden_json):
aspect_json = obj.get("aspect", {}).get("json", [])
for j, item in enumerate(aspect_json):
if isinstance(item, dict):
if item.get("path") in ignore_paths_v2:
json_path = f"root[{i}]['aspect']['json'][{j}]['value']"
ignore_paths = (*ignore_paths, re.escape(json_path))

diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
if diff and update_golden:
if isinstance(diff, MCPDiff):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
import subprocess
from typing import Dict
from typing import Dict, Sequence

import pytest
import requests
Expand Down Expand Up @@ -120,18 +120,28 @@ def test_hive_metastore_ingest(
# config_file = (test_resources_dir / "presto_on_hive_to_file.yml").resolve()
# run_datahub_cmd(["ingest", "-c", f"{config_file}"])

ignore_paths: Sequence[str] = [
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastDdlTime'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]",
]

ignore_paths_v2: Sequence[str] = [
"/customProperties/create_date",
"/customProperties/transient_lastDdlTime",
"/customProperties/numfiles",
"/customProperties/totalsize",
]

# Verify the output.
mce_helpers.check_golden_file(
pytestconfig,
output_path=f"hive_metastore_mces{test_suffix}.json",
golden_path=test_resources_dir
/ f"hive_metastore_mces_golden{test_suffix}.json",
ignore_paths=[
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastDdlTime'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]",
],
ignore_paths=ignore_paths,
ignore_paths_v2=ignore_paths_v2,
)


Expand Down
2 changes: 2 additions & 0 deletions metadata-ingestion/tests/test_helpers/mce_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def check_golden_file(
output_path: Union[str, os.PathLike],
golden_path: Union[str, os.PathLike],
ignore_paths: Sequence[str] = (),
ignore_paths_v2: Sequence[str] = (),
) -> None:
update_golden = pytestconfig.getoption("--update-golden-files")
copy_output = pytestconfig.getoption("--copy-output-files")
Expand All @@ -90,6 +91,7 @@ def check_golden_file(
update_golden=update_golden,
copy_output=copy_output,
ignore_paths=ignore_paths,
ignore_paths_v2=ignore_paths_v2,
)


Expand Down

0 comments on commit f8b73a8

Please sign in to comment.