diff --git a/data_diff/dbt.py b/data_diff/dbt.py index c34d21d4..2f771e1b 100644 --- a/data_diff/dbt.py +++ b/data_diff/dbt.py @@ -151,7 +151,7 @@ def _local_diff(diff_vars: DiffVars) -> None: + prod_qualified_string + "[/] \n" + column_diffs_str - + diff.get_stats_string() + + diff.get_stats_string_dbt() + "\n" ) else: diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py index 74cd44d7..116de885 100644 --- a/data_diff/diff_tables.py +++ b/data_diff/diff_tables.py @@ -85,6 +85,7 @@ class DiffStats: table2_count: int unchanged: int diff_percent: float + extra_column_diffs: Dict[str, int] @dataclass @@ -123,6 +124,40 @@ def _get_stats(self) -> DiffStats: return DiffStats(diff_by_sign, table1_count, table2_count, unchanged, diff_percent) + def _get_stats_dbt(self) -> DiffStats: + list(self) # Consume the iterator into result_list, if we haven't already + + key_columns = self.info_tree.info.tables[0].key_columns + diff_by_key = {} + extra_column_values_store = {} + + extra_columns = self.info_tree.info.tables[0].extra_columns + extra_column_diffs = {k:0 for k in extra_columns} + + for sign, values in self.result_list: + k = values[: len(self.info_tree.info.tables[0].key_columns)] + extra_column_values = values[len(self.info_tree.info.tables[0].key_columns) :] + if k in diff_by_key: + assert sign != diff_by_key[k] + diff_by_key[k] = "!" + for i in range(0,len(extra_columns)): + if extra_column_values[i] != extra_column_values_store[k][i]: + extra_column_diffs[extra_columns[i]] += 1 + else: + diff_by_key[k] = sign + extra_column_values_store[k] = extra_column_values + + diff_by_sign = {k: 0 for k in "+-!"} + for sign in diff_by_key.values(): + diff_by_sign[sign] += 1 + + table1_count = self.info_tree.info.rowcounts[1] + table2_count = self.info_tree.info.rowcounts[2] + unchanged = table1_count - diff_by_sign["-"] - diff_by_sign["!"] + diff_percent = 1 - unchanged / max(table1_count, table2_count) + + return DiffStats(diff_by_sign, table1_count, table2_count, unchanged, diff_percent, extra_column_diffs) + def get_stats_string(self): diff_stats = self._get_stats() @@ -142,6 +177,32 @@ def get_stats_string(self): return string_output + def get_stats_string_dbt(self): + + diff_stats = self._get_stats_dbt() + string_output = "" + string_output += f"{diff_stats.table1_count} rows in table A\n" + string_output += f"{diff_stats.table2_count} rows in table B\n" + string_output += f"{diff_stats.diff_by_sign['-']} rows exclusive to table A (not present in B)\n" + string_output += f"{diff_stats.diff_by_sign['+']} rows exclusive to table B (not present in A)\n" + string_output += f"{diff_stats.diff_by_sign['!']} rows updated\n" + string_output += f"{diff_stats.unchanged} rows unchanged\n" + string_output += f"{100*diff_stats.diff_percent:.2f}% difference score\n" + + string_output = "\n| Rows Added\t| Rows Removed\n" + string_output += "------------------------------------------------------------\n" + string_output += f"| {diff_stats.diff_by_sign['-']}\t\t| {diff_stats.diff_by_sign['+']}\n" + string_output += "------------------------------------------------------------\n\n" + string_output += f"Updated Rows: {diff_stats.diff_by_sign['!']}\n" + string_output += f"Identical Rows: {diff_stats.unchanged}\n\n" + + string_output += f"Values Updated:" + + for k, v in diff_stats.extra_column_diffs.items(): + string_output += f"\n{k}: {v}" + + return string_output + def get_stats_dict(self): diff_stats = self._get_stats()