From 7b411e8af9e47d9ede9c088d10a54a45ffbbd0a9 Mon Sep 17 00:00:00 2001
From: Jonathan Kaczynski
Date: Fri, 25 Oct 2024 18:52:27 -0400
Subject: [PATCH 1/3] Support more dbt contract data types (Snowflake)

- Move `column_to_dimension_types` to a constant, `TYPE_MAPPINGS`, at the
  top of the file.
- Normalize data types coming from dbt contracts by:
  - Downcasing
  - Removing extra type detail contained in parentheses
    (ex. "timestamp(3)" => "timestamp")
- After looking up the type mapping, compare the target type against the
  short list of dimension types supported in Cube.
- Support all Snowflake data types, making a best effort at mapping them.
- I assume the bool => boolean mapping comes from a BigQuery data type,
  so I moved it into its own constant.
---
 src/cube_dbt/column.py | 110 ++++++++++++++++++++++++++++++++++-------
 tests/test_column.py   |  42 +++++++++++++++-
 2 files changed, 132 insertions(+), 20 deletions(-)

diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py
index fbe856a..e5d8671 100644
--- a/src/cube_dbt/column.py
+++ b/src/cube_dbt/column.py
@@ -1,5 +1,86 @@
+import re
+
 from cube_dbt.dump import dump
 
+# As of 2024-10-17, the valid "Dimension Types" listed on
+# https://cube.dev/docs/reference/data-model/types-and-formats#dimension-types
+# are: time, string, number, boolean, and geo
+VALID_DIMENSION_TYPES = [
+    "boolean",
+    "geo",
+    "number",
+    "string",
+    "time",
+]
+# Other systems' type => Cube type
+# See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+BIGQUERY_TYPE_MAPPINGS = {
+    "bool": "boolean",
+}
+# See https://docs.snowflake.com/en/sql-reference-data-types
+SNOWFLAKE_TYPE_MAPPINGS = {
+    # Numeric data types
+    # number does not need to be mapped
+    "decimal": "number",
+    "dec": "number",
+    "numeric": "number",
+    "int": "number",
+    "integer": "number",
+    "bigint": "number",
+    "smallint": "number",
+    "tinyint": "number",
+    "byteint": "number",
+    "float": "number",
+    "float4": "number",
+    "float8": "number",
+    "double": "number",
+    "double precision": "number",
+    "real": "number",
+
+    # String & binary data types
+    "varchar": "string",
+    "char": "string",
+    "character": "string",
+    "nchar": "string",
+    # string does not need to be mapped
+    "text": "string",
+    "nvarchar": "string",
+    "nvarchar2": "string",
+    "char varying": "string",
+    "nchar varying": "string",
+    "binary": "string",
+    "varbinary": "string",
+
+    # Logical data types
+    # boolean does not need to be mapped
+
+    # Date & time data types
+    "date": "time",
+    "datetime": "time",
+    # time does not need to be mapped
+    "timestamp": "time",
+    "timestamp_ltz": "time",
+    "timestamp_ntz": "time",
+    "timestamp_tz": "time",
+
+    # Semi-structured data types
+    "variant": "string",
+    "object": "string",
+    "array": "string",
+
+    # Geospatial data types
+    "geography": "geo",
+    "geometry": "string",
+
+    # Vector data types
+    "vector": "string",
+}
+TYPE_MAPPINGS = {
+    **BIGQUERY_TYPE_MAPPINGS,
+    **SNOWFLAKE_TYPE_MAPPINGS,
+}
+
+
 class Column:
     def __init__(self, model_name: str, column_dict: dict) -> None:
         self._model_name = model_name
@@ -25,29 +106,20 @@ def sql(self) -> str:
     def type(self) -> str:
         if not 'data_type' in self._column_dict or self._column_dict['data_type'] == None:
             return 'string'
-
-        column_to_dimension_types = {
-            'time': 'time',
-            'date': 'time',
-            'datetime': 'time',
-            'timestamp': 'time',
-            'string': 'string',
+        # Normalize the data_type value, downcasing it, and removing extra information.
+        source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
 
-            'number': 'number',
-            'numeric': 'number',
+        if source_data_type in TYPE_MAPPINGS:
+            cube_data_type = TYPE_MAPPINGS[source_data_type]
+        else:
+            cube_data_type = source_data_type
 
-            'boolean': 'boolean',
-            'bool': 'boolean',
-
-            'geo': 'geo',
-            'geography': 'geo',
-        }
-        if not self._column_dict['data_type'] in column_to_dimension_types:
+        if cube_data_type not in VALID_DIMENSION_TYPES:
             raise RuntimeError(f"Unknown column type of {self._model_name}.{self.name}: {self._column_dict['data_type']}")
-        return column_to_dimension_types[self._column_dict['data_type']]
-
+        return cube_data_type
+
     @property
     def meta(self) -> dict:
         return self._column_dict['meta']
@@ -78,4 +150,4 @@ def as_dimension(self) -> str:
         For use in Jinja:
         {{ dbt.model('name').column('name').as_dimension() }}
         """
-        return dump(self._as_dimension(), indent=8)
\ No newline at end of file
+        return dump(self._as_dimension(), indent=8)
diff --git a/tests/test_column.py b/tests/test_column.py
index 1bda4d6..7968263 100644
--- a/tests/test_column.py
+++ b/tests/test_column.py
@@ -42,6 +42,46 @@ def test_known_type(self):
         column = Column('model', column_dict)
         assert column.type == 'number'
 
+    def test_known_type_but_uppercase(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'STRING'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
+    def test_known_type_but_with_one_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'timestamp(3)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'time'
+
+    def test_known_type_but_with_two_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'numeric(38,0)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'number'
+
+    def test_known_type_but_with_two_extra_info_of_different_types(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'VECTOR(FLOAT, 256)'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
     def test_as_dimension(self):
         column_dict = {
             'name': 'column',
@@ -69,4 +109,4 @@ def test_as_dimension_render(self):
         assert column.as_dimension() == """name: column
         sql: column
         type: number
-        """
\ No newline at end of file
+        """
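
A minimal standalone sketch of the lookup flow patch 1 introduces. The
regex and the mapping lookup mirror the diff above; the trimmed-down
mapping and the sample inputs are illustrative only:

    import re

    # Illustrative subset of SNOWFLAKE_TYPE_MAPPINGS from the diff above
    TYPE_MAPPINGS = {"numeric": "number", "timestamp": "time"}

    def normalize(data_type: str) -> str:
        # Downcase, then drop parenthesized detail: "NUMERIC(38,0)" -> "numeric"
        return re.sub(r"\([^\)]*\)", "", data_type.lower())

    assert normalize("timestamp(3)") == "timestamp"
    assert TYPE_MAPPINGS[normalize("NUMERIC(38,0)")] == "number"
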
From a014b638498508f76643ca43b95334e94c4bcfc6 Mon Sep 17 00:00:00 2001
From: Jonathan Kaczynski
Date: Fri, 25 Oct 2024 18:59:36 -0400
Subject: [PATCH 2/3] Support more dbt contract data types (BigQuery)

- Since we started BIGQUERY_TYPE_MAPPINGS, let's add the rest of the
  BigQuery types to the map.
- Enhance the normalization of data types coming from dbt contracts by:
  - Removing extra type detail contained in angle brackets
    (ex. "array<struct<int64, string>>" => "array")
---
 src/cube_dbt/column.py | 29 ++++++++++++++++++++++++++++-
 tests/test_column.py   | 10 ++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py
index e5d8671..24128e4 100644
--- a/src/cube_dbt/column.py
+++ b/src/cube_dbt/column.py
@@ -15,7 +15,34 @@
 # Other systems' type => Cube type
 # See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
 BIGQUERY_TYPE_MAPPINGS = {
+    "array": "string",
     "bool": "boolean",
+    "bytes": "string",
+    "date": "time",
+    "datetime": "time",
+    "geography": "geo",
+    "interval": "string",
+    "json": "string",
+
+    # Numeric types
+    "int64": "number",
+    "int": "number",
+    "smallint": "number",
+    "integer": "number",
+    "bigint": "number",
+    "tinyint": "number",
+    "byteint": "number",
+    "numeric": "number",
+    "decimal": "number",
+    "bignumeric": "number",
+    "bigdecimal": "number",
+    "float64": "number",
+
+    "range": "string",
+    # string does not need to be mapped
+    "struct": "string",
+    # time does not need to be mapped
+    "timestamp": "time",
 }
 # See https://docs.snowflake.com/en/sql-reference-data-types
 SNOWFLAKE_TYPE_MAPPINGS = {
@@ -108,7 +135,7 @@ def type(self) -> str:
         if not 'data_type' in self._column_dict or self._column_dict['data_type'] == None:
             return 'string'
         # Normalize the data_type value, downcasing it, and removing extra information.
-        source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
+        source_data_type = re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower()))
 
         if source_data_type in TYPE_MAPPINGS:
             cube_data_type = TYPE_MAPPINGS[source_data_type]
diff --git a/tests/test_column.py b/tests/test_column.py
index 7968263..50fc33d 100644
--- a/tests/test_column.py
+++ b/tests/test_column.py
@@ -82,6 +82,16 @@ def test_known_type_but_with_two_extra_info_of_different_types(self):
         column = Column('model', column_dict)
         assert column.type == 'string'
 
+    def test_known_bigquery_type_but_with_extra_info(self):
+        """
+        If type is known, then map it
+        """
+        column_dict = {
+            'data_type': 'ARRAY<STRUCT<INT64, STRING>>'
+        }
+        column = Column('model', column_dict)
+        assert column.type == 'string'
+
     def test_as_dimension(self):
         column_dict = {
             'name': 'column',
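
One detail worth noting about the new pattern: r"<.*>" is greedy, so a
single substitution also covers nested angle brackets, whereas a
non-greedy r"<.*?>" would stop at the first ">" and leave trailing ">"
characters behind. A quick illustrative check, combining both
substitutions from the diff above (sample types only):

    import re

    def strip_type_detail(data_type: str) -> str:
        # Drop "(...)" detail first, then one greedy "<...>" span (covers nesting)
        return re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", data_type.lower()))

    assert strip_type_detail("ARRAY<STRUCT<INT64, STRING>>") == "array"
    assert strip_type_detail("RANGE<DATE>") == "range"
    assert strip_type_detail("NUMERIC(38,0)") == "numeric"
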
"array>>" => "array") --- src/cube_dbt/column.py | 29 ++++++++++++++++++++++++++++- tests/test_column.py | 10 ++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py index e5d8671..24128e4 100644 --- a/src/cube_dbt/column.py +++ b/src/cube_dbt/column.py @@ -15,7 +15,34 @@ # Other System's Type => Cube Type # See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types BIGQUERY_TYPE_MAPPINGS = { + "array": "string", "bool": "boolean", + "bytes": "string", + "date": "time", + "datetime": "time", + "geography": "geo", + "interval": "string", + "json": "string", + + # numeric types + "int64": "number", + "int": "number", + "smallint": "number", + "integer": "number", + "bigint": "number", + "tinyint": "number", + "byteint": "number", + "numeric": "number", + "decimal": "number", + "bignumeric": "number", + "bigdecimal": "number", + "float64": "number", + + "range": "string", + # string does not need to be mapped + "struct": "string", + # time does not need to be mapped + "timestamp": "time", } # See https://docs.snowflake.com/en/sql-reference-data-types SNOWFLAKE_TYPE_MAPPINGS = { @@ -108,7 +135,7 @@ def type(self) -> str: return 'string' # Normalize the data_type value, downcasing it, and removing extra information. - source_data_type = re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower()) + source_data_type = re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())) if source_data_type in TYPE_MAPPINGS: cube_data_type = TYPE_MAPPINGS[source_data_type] diff --git a/tests/test_column.py b/tests/test_column.py index 7968263..50fc33d 100644 --- a/tests/test_column.py +++ b/tests/test_column.py @@ -82,6 +82,16 @@ def test_known_type_but_with_two_extra_info_of_different_types(self): column = Column('model', column_dict) assert column.type == 'string' + def test_known_bigquery_type_but_with_extra_info(self): + """ + If type is known, then map it + """ + column_dict = { + 'data_type': 'ARRAY>>' + } + column = Column('model', column_dict) + assert column.type == 'string' + def test_as_dimension(self): column_dict = { 'name': 'column', From 970aed0a25aaf0cb13b307a5a61c1d831e4209bc Mon Sep 17 00:00:00 2001 From: Jonathan Kaczynski Date: Fri, 25 Oct 2024 19:00:05 -0400 Subject: [PATCH 3/3] Support more dbt contract data types (Redshift) - Since we've added support for Snowflake and BigQuery, let's wrap this up by adding support for data types from the other major data warehouse, Redshift --- src/cube_dbt/column.py | 85 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/cube_dbt/column.py b/src/cube_dbt/column.py index 24128e4..52e8647 100644 --- a/src/cube_dbt/column.py +++ b/src/cube_dbt/column.py @@ -102,8 +102,93 @@ # Vector data types "vector": "string", } +# See https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html +REDSHIFT_TYPE_MAPPINGS = { + # Signed two-byte integer + "smallint": "number", + "int2": "number", + + # Signed four-byte integer + "integer": "number", + "int": "number", + "int4": "number", + + # Signed eight-byte integer + "bigint": "number", + "int8": "number", + + # Exact numeric of selectable precision + "decimal": "number", + "numeric": "number", + + # Single precision floating-point number + "real": "number", + "float4": "number", + + # Double precision floating-point number + "double precision": "number", + "float8": "number", + "float": "number", + + # Fixed-length character string + "char": 
"string", + "character": "string", + "nchar": "string", + "bpchar": "string", + + # Variable-length character string with a user-defined limit + "varchar": "string", + "character varying": "string", + "nvarchar": "string", + "text": "string", + + # Calendar date (year, month, day) + "date": "time", + + # Time of day + "time": "time", + "time without time zone": "time", + + # Time of day with time zone + "timetz": "time", + "time with time zone": "time", + + # Date and time (without time zone) + "timestamp": "time", + "timestamp without time zone": "time", + + # Date and time (with time zone) + "timestamptz": "time", + "timestamp with time zone": "time", + + # Time duration in year to month order + "interval year to month": "string", + + # Time duration in day to second order + "interval day to second": "string", + + # Logical Boolean (true/false) + # boolean does not need to be mapped + "bool": "boolean", + + # Type used with HyperLogLog sketches + "hllsketch": "string", + + # A superset data type that encompasses all scalar types of Amazon Redshift including complex types such as ARRAY and STRUCTS + "super": "string", + + # Variable-length binary value + "varbyte": "string", + "varbinary": "string", + "binary varying": "string", + + # Spatial data + "geometry": "geo", + "geography": "string", +} TYPE_MAPPINGS = { **BIGQUERY_TYPE_MAPPINGS, + **REDSHIFT_TYPE_MAPPINGS, **SNOWFLAKE_TYPE_MAPPINGS, }