From 7b411e8af9e47d9ede9c088d10a54a45ffbbd0a9 Mon Sep 17 00:00:00 2001
From: Jonathan Kaczynski
Date: Fri, 25 Oct 2024 18:52:27 -0400
Subject: [PATCH 1/3] Support more dbt contract data types (Snowflake)

- Move `column_to_dimension_types` to a constant, `TYPE_MAPPINGS`, at the
  top of the file.
- Normalize data types coming from dbt contracts by:
  - Downcasing
  - Removing extra type detail contained in parentheses
    (ex. "timestamp(3)" => "timestamp")
- After looking up the type mapping, compare the target type against the
  short list of dimension types supported in Cube.
- Support all Snowflake data types, making a best effort at mapping them.
- I assume the bool => boolean mapping comes from a BigQuery data type,
  so I moved it into its own constant.
---
 src/cube_dbt/column.py | 110 ++++++++++++++++++++++++++++++++++-------
 tests/test_column.py   |  42 +++++++++++++++-
 2 files changed, 132 insertions(+), 20 deletions(-)

diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py
index fbe856a..e5d8671 100644
--- a/src/cube_dbt/column.py
+++ b/src/cube_dbt/column.py
@@ -1,5 +1,86 @@
+import re
+
 from cube_dbt.dump import dump
 
+# As of 2024-10-17, the valid "Dimension Types" listed on
+# https://cube.dev/docs/reference/data-model/types-and-formats#dimension-types
+# are: time, string, number, boolean, and geo
+VALID_DIMENSION_TYPES = [
+    "boolean",
+    "geo",
+    "number",
+    "string",
+    "time",
+]
+# Other systems' type => Cube type
+# See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+BIGQUERY_TYPE_MAPPINGS = {
+    "bool": "boolean",
+}
+# See https://docs.snowflake.com/en/sql-reference-data-types
+SNOWFLAKE_TYPE_MAPPINGS = {
+    # Numeric data types
+    # number does not need to be mapped
+    "decimal": "number",
+    "dec": "number",
+    "numeric": "number",
+    "int": "number",
+    "integer": "number",
+    "bigint": "number",
+    "smallint": "number",
+    "tinyint": "number",
+    "byteint": "number",
+    "float": "number",
+    "float4": "number",
+    "float8": "number",
+    "double": "number",
+    "double precision": "number",
+    "real": "number",
+
+    # String & binary data types
+    "varchar": "string",
+    "char": "string",
+    "character": "string",
+    "nchar": "string",
+    # string does not need to be mapped
+    "text": "string",
+    "nvarchar": "string",
+    "nvarchar2": "string",
+    "char varying": "string",
+    "nchar varying": "string",
+    "binary": "string",
+    "varbinary": "string",
+
+    # Logical data types
+    # boolean does not need to be mapped
+
+    # Date & time data types
+    "date": "time",
+    "datetime": "time",
+    # time does not need to be mapped
+    "timestamp": "time",
+    "timestamp_ltz": "time",
+    "timestamp_ntz": "time",
+    "timestamp_tz": "time",
+
+    # Semi-structured data types
+    "variant": "string",
+    "object": "string",
+    "array": "string",
+
+    # Geospatial data types
+    "geography": "geo",
+    "geometry": "string",
+
+    # Vector data types
+    "vector": "string",
+}
+TYPE_MAPPINGS = {
+    **BIGQUERY_TYPE_MAPPINGS,
+    **SNOWFLAKE_TYPE_MAPPINGS,
+}
+
+
 class Column:
     def __init__(self, model_name: str, column_dict: dict) -> None:
         self._model_name = model_name
@@ -25,29 +106,20 @@ def sql(self) -> str:
     def type(self) -> str:
         if not 'data_type' in self._column_dict or self._column_dict['data_type'] == None:
             return 'string'
-
-        column_to_dimension_types = {
-            'time': 'time',
-            'date': 'time',
-            'datetime': 'time',
-            'timestamp': 'time',
-            'string': 'string',
+        # Normalize the data_type value, downcasing it, and removing extra information.
+        source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
 
-            'number': 'number',
-            'numeric': 'number',
+        if source_data_type in TYPE_MAPPINGS:
+            cube_data_type = TYPE_MAPPINGS[source_data_type]
+        else:
+            cube_data_type = source_data_type
 
-            'boolean': 'boolean',
-            'bool': 'boolean',
-
-            'geo': 'geo',
-            'geography': 'geo',
-        }
-        if not self._column_dict['data_type'] in column_to_dimension_types:
+        if cube_data_type not in VALID_DIMENSION_TYPES:
             raise RuntimeError(f"Unknown column type of {self._model_name}.{self.name}: {self._column_dict['data_type']}")
-        return column_to_dimension_types[self._column_dict['data_type']]
-
+        return cube_data_type
+
     @property
     def meta(self) -> dict:
         return self._column_dict['meta']
@@ -78,4 +150,4 @@ def as_dimension(self) -> str:
         For use in Jinja:
         {{ dbt.model('name').column('name').as_dimension() }}
         """
-        return dump(self._as_dimension(), indent=8)
\ No newline at end of file
+        return dump(self._as_dimension(), indent=8)
diff --git a/tests/test_column.py b/tests/test_column.py
index 1bda4d6..7968263 100644
--- a/tests/test_column.py
+++ b/tests/test_column.py
@@ -42,6 +42,46 @@ def test_known_type(self):
         column = Column('model', column_dict)
         assert column.type == 'number'
 
+    def test_known_type_but_uppercase(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'STRING'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
+    def test_known_type_but_with_one_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'timestamp(3)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'time'
+
+    def test_known_type_but_with_two_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'numeric(38,0)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'number'
+
+    def test_known_type_but_with_two_extra_info_of_different_types(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'VECTOR(FLOAT, 256)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
     def test_as_dimension(self):
         column_dict = {
             'name': 'column',
@@ -69,4 +109,4 @@ def test_as_dimension_render(self):
         assert column.as_dimension() == """name: column
         sql: column
         type: number
-        """
\ No newline at end of file
+        """
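
A minimal standalone sketch of the lookup flow patch 1 introduces. The
regex and the mapping lookup mirror the diff above; the trimmed-down
mapping and the sample inputs are illustrative only:

    import re

    # Illustrative subset of SNOWFLAKE_TYPE_MAPPINGS from the diff above
    TYPE_MAPPINGS = {"numeric": "number", "timestamp": "time"}

    def normalize(data_type: str) -> str:
        # Downcase, then drop parenthesized detail: "NUMERIC(38,0)" -> "numeric"
        return re.sub(r"\([^\)]*\)", "", data_type.lower())

    assert normalize("timestamp(3)") == "timestamp"
    assert TYPE_MAPPINGS[normalize("NUMERIC(38,0)")] == "number"
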
From a014b638498508f76643ca43b95334e94c4bcfc6 Mon Sep 17 00:00:00 2001
From: Jonathan Kaczynski
Date: Fri, 25 Oct 2024 18:59:36 -0400
Subject: [PATCH 2/3] Support more dbt contract data types (BigQuery)

- Since we started BIGQUERY_TYPE_MAPPINGS, let's add the rest of the
  BigQuery types to the map.
- Enhance the normalization of data types coming from dbt contracts by:
  - Removing extra type detail contained in angle brackets
    (ex. "array<struct<int64, string>>" => "array")
---
 src/cube_dbt/column.py | 29 ++++++++++++++++++++++++++++-
 tests/test_column.py   | 10 ++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py
index e5d8671..24128e4 100644
--- a/src/cube_dbt/column.py
+++ b/src/cube_dbt/column.py
@@ -15,7 +15,34 @@
 # Other systems' type => Cube type
 # See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
 BIGQUERY_TYPE_MAPPINGS = {
+    "array": "string",
     "bool": "boolean",
+    "bytes": "string",
+    "date": "time",
+    "datetime": "time",
+    "geography": "geo",
+    "interval": "string",
+    "json": "string",
+
+    # Numeric types
+    "int64": "number",
+    "int": "number",
+    "smallint": "number",
+    "integer": "number",
+    "bigint": "number",
+    "tinyint": "number",
+    "byteint": "number",
+    "numeric": "number",
+    "decimal": "number",
+    "bignumeric": "number",
+    "bigdecimal": "number",
+    "float64": "number",
+
+    "range": "string",
+    # string does not need to be mapped
+    "struct": "string",
+    # time does not need to be mapped
+    "timestamp": "time",
 }
 # See https://docs.snowflake.com/en/sql-reference-data-types
 SNOWFLAKE_TYPE_MAPPINGS = {
@@ -108,7 +135,7 @@ def type(self) -> str:
         if not 'data_type' in self._column_dict or self._column_dict['data_type'] == None:
             return 'string'
         # Normalize the data_type value, downcasing it, and removing extra information.
-        source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
+        source_data_type = re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower()))
 
         if source_data_type in TYPE_MAPPINGS:
             cube_data_type = TYPE_MAPPINGS[source_data_type]
diff --git a/tests/test_column.py b/tests/test_column.py
index 7968263..50fc33d 100644
--- a/tests/test_column.py
+++ b/tests/test_column.py
@@ -82,6 +82,16 @@ def test_known_type_but_with_two_extra_info_of_different_types(self):
         column = Column('model', column_dict)
         assert column.type == 'string'
 
+    def test_known_bigquery_type_but_with_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'ARRAY<STRUCT<INT64, STRING>>'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
     def test_as_dimension(self):
         column_dict = {
             'name': 'column',
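
One detail worth noting about the new pattern: r"<.*>" is greedy, so a
single substitution also covers nested angle brackets, whereas a
non-greedy r"<.*?>" would stop at the first ">" and leave trailing ">"
characters behind. A quick illustrative check, combining both
substitutions from the diff above (sample types only):

    import re

    def strip_type_detail(data_type: str) -> str:
        # Drop "(...)" detail first, then one greedy "<...>" span (covers nesting)
        return re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", data_type.lower()))

    assert strip_type_detail("ARRAY<STRUCT<INT64, STRING>>") == "array"
    assert strip_type_detail("RANGE<DATE>") == "range"
    assert strip_type_detail("NUMERIC(38,0)") == "numeric"
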
"array>>" => "array") --- src/cube_dbt/column.py | 29 ++++++++++++++++++++++++++++- tests/test_column.py | 10 ++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py index e5d8671..24128e4 100644 --- a/src/cube_dbt/column.py +++ b/src/cube_dbt/column.py @@ -15,7 +15,34 @@ # Other System's Type => Cube Type # See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types BIGQUERY_TYPE_MAPPINGS = { + "array": "string", "bool": "boolean", + "bytes": "string", + "date": "time", + "datetime": "time", + "geography": "geo", + "interval": "string", + "json": "string", + + # numeric types + "int64": "number", + "int": "number", + "smallint": "number", + "integer": "number", + "bigint": "number", + "tinyint": "number", + "byteint": "number", + "numeric": "number", + "decimal": "number", + "bignumeric": "number", + "bigdecimal": "number", + "float64": "number", + + "range": "string", + # string does not need to be mapped + "struct": "string", + # time does not need to be mapped + "timestamp": "time", } # See https://docs.snowflake.com/en/sql-reference-data-types SNOWFLAKE_TYPE_MAPPINGS = { @@ -108,7 +135,7 @@ def type(self) -> str: return 'string' # Normalize the data_type value, downcasing it, and removing extra information. - source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower()) + source_data_type = re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())) if source_data_type in TYPE_MAPPINGS: cube_data_type = TYPE_MAPPINGS[source_data_type] diff --git a/tests/test_column.py b/tests/test_column.py index 7968263..50fc33d 100644 --- a/tests/test_column.py +++ b/tests/test_column.py @@ -82,6 +82,16 @@ def test_known_type_but_with_two_extra_info_of_different_types(self): column = Column('model', column_dict) assert column.type == 'string' + def test_known_bigquery_type_but_with_extra_info(self): + """ + If type is known, then map it + """ + column_dict = { + 'data_type': 'ARRAY>>' + } + column = Column('model', column_dict) + assert column.type == 'string' + def test_as_dimension(self): column_dict = { 'name': 'column', From 970aed0a25aaf0cb13b307a5a61c1d831e4209bc Mon Sep 17 00:00:00 2001 From: Jonathan Kaczynski Date: Fri, 25 Oct 2024 19:00:05 -0400 Subject: [PATCH 3/3] Support more dbt contract data types (Redshift) - Since we've added support for Snowflake and BigQuery, let's wrap this up by adding support for data types from the other major data warehouse, Redshift --- src/cube_dbt/column.py | 85 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py index 24128e4..52e8647 100644 --- a/src/cube_dbt/column.py +++ b/src/cube_dbt/column.py @@ -102,8 +102,93 @@ # Vector data types "vector": "string", } +# See https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html +REDSHIFT_TYPE_MAPPINGS = { + # Signed two-byte integer + "smallint": "number", + "int2": "number", + + # Signed four-byte integer + "integer": "number", + "int": "number", + "int4": "number", + + # Signed eight-byte integer + "bigint": "number", + "int8": "number", + + # Exact numeric of selectable precision + "decimal": "number", + "numeric": "number", + + # Single precision floating-point number + "real": "number", + "float4": "number", + + # Double precision floating-point number + "double precision": "number", + "float8": "number", + "float": "number", + + # Fixed-length character string + "char": 
"string", + "character": "string", + "nchar": "string", + "bpchar": "string", + + # Variable-length character string with a user-defined limit + "varchar": "string", + "character varying": "string", + "nvarchar": "string", + "text": "string", + + # Calendar date (year, month, day) + "date": "time", + + # Time of day + "time": "time", + "time without time zone": "time", + + # Time of day with time zone + "timetz": "time", + "time with time zone": "time", + + # Date and time (without time zone) + "timestamp": "time", + "timestamp without time zone": "time", + + # Date and time (with time zone) + "timestamptz": "time", + "timestamp with time zone": "time", + + # Time duration in year to month order + "interval year to month": "string", + + # Time duration in day to second order + "interval day to second": "string", + + # Logical Boolean (true/false) + # boolean does not need to be mapped + "bool": "boolean", + + # Type used with HyperLogLog sketches + "hllsketch": "string", + + # A superset data type that encompasses all scalar types of Amazon Redshift including complex types such as ARRAY and STRUCTS + "super": "string", + + # Variable-length binary value + "varbyte": "string", + "varbinary": "string", + "binary varying": "string", + + # Spatial data + "geometry": "geo", + "geography": "string", +} TYPE_MAPPINGS = { **BIGQUERY_TYPE_MAPPINGS, + **REDSHIFT_TYPE_MAPPINGS, **SNOWFLAKE_TYPE_MAPPINGS, }