From e6d4231efb04a7e7cf2577658e778ad50164981d Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 28 Mar 2023 16:31:16 -0600 Subject: [PATCH 1/3] reduce repetition in print statements --- data_diff/dbt.py | 67 ++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 94503c58..ddd40aae 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -106,11 +106,7 @@ def dbt_diff( _local_diff(diff_vars) else: rich.print( - "[red]" - + ".".join(diff_vars.prod_path) - + " <> " - + ".".join(diff_vars.dev_path) - + "[/] \n" + _diff_output_base(".".join(diff_vars.dev_path), ".".join(diff_vars.prod_path)) + "Skipped due to unknown primary key. Add uniqueness tests, meta, or tags.\n" ) @@ -154,14 +150,13 @@ def _get_diff_vars( def _local_diff(diff_vars: DiffVars) -> None: column_diffs_str = "" - dev_qualified_string = ".".join(diff_vars.dev_path) - prod_qualified_string = ".".join(diff_vars.prod_path) + dev_qualified_str = ".".join(diff_vars.dev_path) + prod_qualified_str = ".".join(diff_vars.prod_path) + diff_output_str = _diff_output_base(dev_qualified_str, prod_qualified_str) - table1 = connect_to_table( - diff_vars.connection, dev_qualified_string, tuple(diff_vars.primary_keys), diff_vars.threads - ) + table1 = connect_to_table(diff_vars.connection, dev_qualified_str, tuple(diff_vars.primary_keys), diff_vars.threads) table2 = connect_to_table( - diff_vars.connection, prod_qualified_string, tuple(diff_vars.primary_keys), diff_vars.threads + diff_vars.connection, prod_qualified_str, tuple(diff_vars.primary_keys), diff_vars.threads ) table1_columns = list(table1.get_schema()) @@ -170,15 +165,8 @@ def _local_diff(diff_vars: DiffVars) -> None: # Not ideal, but we don't have more specific exceptions yet except Exception as ex: logger.debug(ex) - rich.print( - "[red]" - + prod_qualified_string - + " <> " - + dev_qualified_string - + "[/] \n" - + column_diffs_str - + "[green]New model or no access to prod table.[/] \n" - ) + diff_output_str += "[red]New model or no access to prod table.[/] \n" + rich.print(diff_output_str) return mutual_set = set(table1_columns) & set(table2_columns) @@ -197,29 +185,15 @@ def _local_diff(diff_vars: DiffVars) -> None: diff = diff_tables(table1, table2, threaded=True, algorithm=Algorithm.JOINDIFF, extra_columns=extra_columns) if list(diff): - rich.print( - "[red]" - + prod_qualified_string - + " <> " - + dev_qualified_string - + "[/] \n" - + column_diffs_str - + diff.get_stats_string(is_dbt=True) - + "\n" - ) + diff_output_str += column_diffs_str + diff.get_stats_string(is_dbt=True) + "\n" + rich.print(diff_output_str) else: - rich.print( - "[red]" - + prod_qualified_string - + " <> " - + dev_qualified_string - + "[/] \n" - + column_diffs_str - + "[green]No row differences[/] \n" - ) + diff_output_str += f"{column_diffs_str}[bold][green]No row differences[/][/] \n" + rich.print(diff_output_str) def _cloud_diff(diff_vars: DiffVars) -> None: + diff_output_str = _diff_output_base(".".join(diff_vars.dev_path), ".".join(diff_vars.prod_path)) api_key = os.environ.get("DATAFOLD_API_KEY") if diff_vars.datasource_id is None: @@ -257,15 +231,8 @@ def _cloud_diff(diff_vars: DiffVars) -> None: diff_id = data["id"] # TODO in future we should support self hosted datafold diff_url = f"https://app.datafold.com/datadiffs/{diff_id}/overview" - rich.print( - "[red]" - + ".".join(diff_vars.prod_path) - + " <> " - + ".".join(diff_vars.dev_path) - + "[/] \n Diff in progress: \n " - + diff_url - + "\n" - ) + diff_output_str += f" Diff in progress: \n {diff_url}\n" + rich.print(diff_output_str) except BaseException as ex: # Catch KeyboardInterrupt too error = ex finally: @@ -292,6 +259,10 @@ def _cloud_diff(diff_vars: DiffVars) -> None: raise error +def _diff_output_base(dev_path: str, prod_path: str) -> str: + return "[green]" + prod_path + " <> " + dev_path + "[/] \n" + + class DbtParser: def __init__(self, profiles_dir_override: str, project_dir_override: str, is_cloud: bool) -> None: self.parse_run_results, self.parse_manifest, self.ProfileRenderer, self.yaml = import_dbt() From c2b8e9237eac697f9fa8e7befb6d4383812437ee Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 6 Apr 2023 14:19:18 -0600 Subject: [PATCH 2/3] post merge changes --- data_diff/dbt.py | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 341f6df7..94de3cc6 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -203,7 +203,7 @@ def _local_diff(diff_vars: DiffVars) -> None: diff = diff_tables(table1, table2, threaded=True, algorithm=Algorithm.JOINDIFF, extra_columns=extra_columns) if list(diff): - diff_output_str += column_diffs_str + diff.get_stats_string(is_dbt=True) + "\n" + diff_output_str += f"{column_diffs_str}{diff.get_stats_string(is_dbt=True)} \n" rich.print(diff_output_str) else: diff_output_str += f"{column_diffs_str}[bold][green]No row differences[/][/] \n" @@ -260,24 +260,11 @@ def _cloud_diff(diff_vars: DiffVars, datasource_id: int, datafold_host: str, url diff_percent_list, "Value Match Percent:", ) - rich.print( - "[red]" - + ".".join(diff_vars.prod_path) - + " <> " - + ".".join(diff_vars.dev_path) - + f"[/]\n{diff_url}\n" - + diff_output - + "\n" - ) + diff_output_str += f"{diff_url}\n {diff_output} \n" + rich.print(diff_output_str) else: - rich.print( - "[red]" - + ".".join(diff_vars.prod_path) - + " <> " - + ".".join(diff_vars.dev_path) - + f"[/]\n{diff_url}\n" - + "[green]No row differences[/] \n" - ) + diff_output_str += f"{diff_url}\n [green]No row differences[/] \n" + rich.print(diff_output_str) except BaseException as ex: # Catch KeyboardInterrupt too error = ex @@ -302,12 +289,7 @@ def _cloud_diff(diff_vars: DiffVars, datasource_id: int, datafold_host: str, url send_event_json(event_json) if error: - rich.print( - "[red]" - + ".".join(diff_vars.prod_path) - + " <> " - + ".".join(diff_vars.dev_path) + "[/]\n" - ) + rich.print(diff_output_str) if diff_id: diff_url = f"{datafold_host}/datadiffs/{diff_id}/overview" rich.print(f"{diff_url} \n") @@ -373,12 +355,7 @@ def _cloud_poll_and_get_summary_results(url, headers): def _diff_output_base(dev_path: str, prod_path: str) -> str: - return "[green]" + prod_path + " <> " + dev_path + "[/] \n" - - -def _diff_output_base(dev_path: str, prod_path: str) -> str: - return "[green]" + prod_path + " <> " + dev_path + "[/] \n" - + return f"[green]{prod_path} <> {dev_path}[/] \n" class DbtParser: def __init__(self, profiles_dir_override: str, project_dir_override: str) -> None: From 686ee7c39f9156dbd1a6f830f4266502bdfef513 Mon Sep 17 00:00:00 2001 From: Dan Date: Thu, 6 Apr 2023 14:25:54 -0600 Subject: [PATCH 3/3] formatter --- data_diff/dbt.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/data_diff/dbt.py b/data_diff/dbt.py index 94de3cc6..722f4495 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -357,6 +357,7 @@ def _cloud_poll_and_get_summary_results(url, headers): def _diff_output_base(dev_path: str, prod_path: str) -> str: return f"[green]{prod_path} <> {dev_path}[/] \n" + class DbtParser: def __init__(self, profiles_dir_override: str, project_dir_override: str) -> None: self.parse_run_results, self.parse_manifest, self.ProfileRenderer, self.yaml = import_dbt() @@ -373,12 +374,8 @@ def __init__(self, profiles_dir_override: str, project_dir_override: str) -> Non self.unique_columns = self.get_unique_columns() def get_datadiff_variables(self) -> dict: - vars = get_from_dict_with_raise( - self.project_dict, "vars", f"No vars: found in dbt_project.yml." - ) - return get_from_dict_with_raise( - vars, "data_diff", f"data_diff: section not found in dbt_project.yml vars:." - ) + vars = get_from_dict_with_raise(self.project_dict, "vars", f"No vars: found in dbt_project.yml.") + return get_from_dict_with_raise(vars, "data_diff", f"data_diff: section not found in dbt_project.yml vars:.") def get_models(self): with open(self.project_dir / RUN_RESULTS_PATH) as run_results: