Rename the "type_repr" field of RawColumnInfo to "data_type" to align with conventional information schemas

Sergey Vasilyev · Sergey Vasilyev · commit fedcef66c62b · 2023-12-29T21:08:37.000+01:00
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -711,9 +711,9 @@ def type_repr(self, t) -> str:
     def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
         "Parse type info as returned by the database"
 
-        cls = self.TYPE_CLASSES.get(info.type_repr)
+        cls = self.TYPE_CLASSES.get(info.data_type)
         if cls is None:
-            return UnknownColType(info.type_repr)
+            return UnknownColType(info.data_type)
 
         if issubclass(cls, TemporalType):
             return cls(
@@ -745,7 +745,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
         elif issubclass(cls, (JSON, Array, Struct, Text, Native_UUID)):
             return cls()
 
-        raise TypeError(f"Parsing {info.type_repr} returned an unknown type {cls!r}.")
+        raise TypeError(f"Parsing {info.data_type} returned an unknown type {cls!r}.")
 
     def _convert_db_precision_to_digits(self, p: int) -> int:
         """Convert from binary precision, used by floats, to decimal precision."""
@@ -1024,7 +1024,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
         d = {
             r[0]: RawColumnInfo(
                 column_name=r[0],
-                type_repr=r[1],
+                data_type=r[1],
                 datetime_precision=r[2],
                 numeric_precision=r[3],
                 numeric_scale=r[4],
diff --git a/data_diff/databases/bigquery.py b/data_diff/databases/bigquery.py
@@ -95,7 +95,7 @@ def type_repr(self, t) -> str:
     def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
         col_type = super().parse_type(table_path, info)
         if isinstance(col_type, UnknownColType):
-            m = self.TYPE_ARRAY_RE.fullmatch(info.type_repr)
+            m = self.TYPE_ARRAY_RE.fullmatch(info.data_type)
             if m:
                 item_info = attrs.evolve(info, data_type=m.group(1))
                 item_type = self.parse_type(table_path, item_info)
@@ -106,7 +106,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
             # - STRUCT<foo INT64, bar STRING(10)> (named)
             # - STRUCT<foo INT64, bar ARRAY<INT64>> (with complex fields)
             # - STRUCT<foo INT64, bar STRUCT<a INT64, b INT64>> (nested)
-            m = self.TYPE_STRUCT_RE.fullmatch(info.type_repr)
+            m = self.TYPE_STRUCT_RE.fullmatch(info.data_type)
             if m:
                 col_type = Struct()
 
diff --git a/data_diff/databases/clickhouse.py b/data_diff/databases/clickhouse.py
@@ -79,14 +79,14 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
 
     def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
         nullable_prefix = "Nullable("
-        if info.type_repr.startswith(nullable_prefix):
-            info = attrs.evolve(info, data_type=info.type_repr[len(nullable_prefix) :].rstrip(")"))
+        if info.data_type.startswith(nullable_prefix):
+            info = attrs.evolve(info, data_type=info.data_type[len(nullable_prefix) :].rstrip(")"))
 
-        if info.type_repr.startswith("Decimal"):
+        if info.data_type.startswith("Decimal"):
             info = attrs.evolve(info, data_type="Decimal")
-        elif info.type_repr.startswith("FixedString"):
+        elif info.data_type.startswith("FixedString"):
             info = attrs.evolve(info, data_type="FixedString")
-        elif info.type_repr.startswith("DateTime64"):
+        elif info.data_type.startswith("DateTime64"):
             info = attrs.evolve(info, data_type="DateTime64")
 
         return super().parse_type(table_path, info)
diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py
@@ -158,7 +158,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
 
             d = {
                 r.COLUMN_NAME: RawColumnInfo(
-                    column_name=r.COLUMN_NAME, type_repr=r.TYPE_NAME, datetime_precision=r.DECIMAL_DIGITS
+                    column_name=r.COLUMN_NAME, data_type=r.TYPE_NAME, datetime_precision=r.DECIMAL_DIGITS
                 )
                 for r in rows
             }
@@ -186,8 +186,8 @@ def _process_table_schema(
 
         resulted_rows = []
         for info in col_infos:
-            row_type = "DECIMAL" if info.type_repr.startswith("DECIMAL") else info.type_repr
-            info = attrs.evolve(info, type_repr=row_type)
+            row_type = "DECIMAL" if info.data_type.startswith("DECIMAL") else info.data_type
+            info = attrs.evolve(info, data_type=row_type)
             type_cls = self.dialect.TYPE_CLASSES.get(row_type, UnknownColType)
 
             if issubclass(type_cls, Integer):
@@ -198,7 +198,7 @@ def _process_table_schema(
                 info = attrs.evolve(info, numeric_precision=numeric_precision)
 
             elif issubclass(type_cls, Decimal):
-                items = info.type_repr[8:].rstrip(")").split(",")
+                items = info.data_type[8:].rstrip(")").split(",")
                 numeric_precision, numeric_scale = int(items[0]), int(items[1])
                 info = attrs.evolve(
                     info,
diff --git a/data_diff/databases/duckdb.py b/data_diff/databases/duckdb.py
@@ -80,7 +80,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
             r"DECIMAL\((\d+),(\d+)\)": Decimal,
         }
 
-        for m, t_cls in match_regexps(regexps, info.type_repr):
+        for m, t_cls in match_regexps(regexps, info.data_type):
             precision = int(m.group(2))
             return t_cls(precision=precision)
 
diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -113,7 +113,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
             r"TIMESTAMP\((\d)\)": Timestamp,
         }
 
-        for m, t_cls in match_regexps(regexps, info.type_repr):
+        for m, t_cls in match_regexps(regexps, info.data_type):
             precision = int(m.group(1))
             return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)
 
diff --git a/data_diff/databases/presto.py b/data_diff/databases/presto.py
@@ -97,17 +97,17 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
             r"timestamp\((\d)\)": Timestamp,
             r"timestamp\((\d)\) with time zone": TimestampTZ,
         }
-        for m, t_cls in match_regexps(timestamp_regexps, info.type_repr):
+        for m, t_cls in match_regexps(timestamp_regexps, info.data_type):
             precision = int(m.group(1))
             return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)
 
         number_regexps = {r"decimal\((\d+),(\d+)\)": Decimal}
-        for m, n_cls in match_regexps(number_regexps, info.type_repr):
+        for m, n_cls in match_regexps(number_regexps, info.data_type):
             _prec, scale = map(int, m.groups())
             return n_cls(scale)
 
         string_regexps = {r"varchar\((\d+)\)": Text, r"char\((\d+)\)": Text}
-        for m, n_cls in match_regexps(string_regexps, info.type_repr):
+        for m, n_cls in match_regexps(string_regexps, info.data_type):
             return n_cls()
 
         return super().parse_type(table_path, info)
diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py
@@ -151,7 +151,7 @@ def query_svv_columns(self, path: DbPath) -> Dict[str, RawColumnInfo]:
         d = {
             r[0]: RawColumnInfo(
                 column_name=r[0],
-                type_repr=r[1],
+                data_type=r[1],
                 datetime_precision=r[2],
                 numeric_precision=r[3],
                 numeric_scale=r[4],
@@ -181,7 +181,7 @@ def _normalize_schema_info(self, rows: Iterable[Tuple[Any]]) -> Dict[str, RawCol
 
             schema_dict[col_name] = RawColumnInfo(
                 column_name=col_name,
-                type_repr=col_name,
+                data_type=col_name,
                 datetime_precision=None,
                 numeric_precision=precision,
                 numeric_scale=scale,
diff --git a/data_diff/databases/vertica.py b/data_diff/databases/vertica.py
@@ -74,22 +74,22 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
             r"timestamp\(?(\d?)\)?": Timestamp,
             r"timestamptz\(?(\d?)\)?": TimestampTZ,
         }
-        for m, t_cls in match_regexps(timestamp_regexps, info.type_repr):
+        for m, t_cls in match_regexps(timestamp_regexps, info.data_type):
             precision = int(m.group(1)) if m.group(1) else 6
             return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)
 
         number_regexps = {
             r"numeric\((\d+),(\d+)\)": Decimal,
         }
-        for m, n_cls in match_regexps(number_regexps, info.type_repr):
+        for m, n_cls in match_regexps(number_regexps, info.data_type):
             _prec, scale = map(int, m.groups())
             return n_cls(scale)
 
         string_regexps = {
             r"varchar\((\d+)\)": Text,
             r"char\((\d+)\)": Text,
         }
-        for m, n_cls in match_regexps(string_regexps, info.type_repr):
+        for m, n_cls in match_regexps(string_regexps, info.data_type):
             return n_cls()
 
         return super().parse_type(table_path, info)
diff --git a/data_diff/dbt.py b/data_diff/dbt.py
@@ -303,7 +303,7 @@ def _local_diff(
     columns_removed = table1_column_names.difference(table2_column_names)
     # col type is i = 1 in tuple
     columns_type_changed = {
-        k for k, v in table2_columns.items() if k in table1_columns and v.type_repr != table1_columns[k].type_repr
+        k for k, v in table2_columns.items() if k in table1_columns and v.data_type != table1_columns[k].data_type
     }
 
     if columns_added:
diff --git a/data_diff/schema.py b/data_diff/schema.py
@@ -21,7 +21,7 @@ class RawColumnInfo(Collection[Any]):
     """
 
     column_name: str
-    type_repr: str
+    data_type: str
     datetime_precision: Optional[int] = None
     numeric_precision: Optional[int] = None
     numeric_scale: Optional[int] = None
@@ -30,7 +30,7 @@ class RawColumnInfo(Collection[Any]):
     # It was a tuple once, so we keep it backward compatible temporarily, until remade to classes.
     def __iter__(self) -> Iterable[Any]:
         return iter(
-            (self.column_name, self.type_repr, self.datetime_precision, self.numeric_precision, self.numeric_scale)
+            (self.column_name, self.data_type, self.datetime_precision, self.numeric_precision, self.numeric_scale)
         )
 
     def __len__(self) -> int:
diff --git a/tests/test_dbt.py b/tests/test_dbt.py
@@ -77,8 +77,8 @@ def test_local_diff(self, mock_diff_tables):
         connection = {}
         mock_table1 = Mock()
         column_dictionary = {
-            "col1": RawColumnInfo(column_name="col1", type_repr="type"),
-            "col2": RawColumnInfo(column_name="col2", type_repr="type"),
+            "col1": RawColumnInfo(column_name="col1", data_type="type"),
+            "col2": RawColumnInfo(column_name="col2", data_type="type"),
         }
         mock_table1.get_schema.return_value = column_dictionary
         mock_table2 = Mock()
@@ -125,12 +125,12 @@ def test_local_diff_types_differ(self, mock_diff_tables):
         mock_table1 = Mock()
         mock_table2 = Mock()
         table1_column_dictionary = {
-            "col1": RawColumnInfo(column_name="col1", type_repr="type"),
-            "col2": RawColumnInfo(column_name="col2", type_repr="type"),
+            "col1": RawColumnInfo(column_name="col1", data_type="type"),
+            "col2": RawColumnInfo(column_name="col2", data_type="type"),
         }
         table2_column_dictionary = {
-            "col1": RawColumnInfo(column_name="col1", type_repr="type"),
-            "col2": RawColumnInfo(column_name="col2", type_repr="differing_type"),
+            "col1": RawColumnInfo(column_name="col1", data_type="type"),
+            "col2": RawColumnInfo(column_name="col2", data_type="differing_type"),
         }
         mock_table1.get_schema.return_value = table1_column_dictionary
         mock_table2.get_schema.return_value = table2_column_dictionary
@@ -172,8 +172,8 @@ def test_local_diff_types_differ(self, mock_diff_tables):
     def test_local_diff_no_diffs(self, mock_diff_tables):
         connection = {}
         column_dictionary = {
-            "col1": RawColumnInfo(column_name="col1", type_repr="type"),
-            "col2": RawColumnInfo(column_name="col2", type_repr="type"),
+            "col1": RawColumnInfo(column_name="col1", data_type="type"),
+            "col2": RawColumnInfo(column_name="col2", data_type="type"),
         }
         mock_table1 = Mock()
         mock_table1.get_schema.return_value = column_dictionary

Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:`
`80`	`80`	`r"DECIMAL\((\d+),(\d+)\)": Decimal,`
`81`	`81`	`}`
`82`	`82`
`83`		`- for m, t_cls in match_regexps(regexps, info.type_repr):`
	`83`	`+ for m, t_cls in match_regexps(regexps, info.data_type):`
`84`	`84`	`precision = int(m.group(2))`
`85`	`85`	`return t_cls(precision=precision)`
`86`	`86`
Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@ def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:`
`113`	`113`	`r"TIMESTAMP\((\d)\)": Timestamp,`
`114`	`114`	`}`
`115`	`115`
`116`		`- for m, t_cls in match_regexps(regexps, info.type_repr):`
	`116`	`+ for m, t_cls in match_regexps(regexps, info.data_type):`
`117`	`117`	`precision = int(m.group(1))`
`118`	`118`	`return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)`
`119`	`119`
Original file line number	Diff line number	Diff line change
`@@ -303,7 +303,7 @@ def _local_diff(`
`303`	`303`	`columns_removed = table1_column_names.difference(table2_column_names)`
`304`	`304`	`# col type is i = 1 in tuple`
`305`	`305`	`columns_type_changed = {`
`306`		`- k for k, v in table2_columns.items() if k in table1_columns and v.type_repr != table1_columns[k].type_repr`
	`306`	`+ k for k, v in table2_columns.items() if k in table1_columns and v.data_type != table1_columns[k].data_type`
`307`	`307`	`}`
`308`	`308`
`309`	`309`	`if columns_added:`