Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 203 additions & 19 deletions src/cube_dbt/column.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,198 @@
import re

from cube_dbt.dump import dump

# As of 2024-10-17, the valid "Dimension Types" listed on
# https://cube.dev/docs/reference/data-model/types-and-formats#dimension-types
# are: time, string, number, boolean, and geo
# Used as the final validation gate in Column.type: any mapped type not in
# this list raises a RuntimeError.
VALID_DIMENSION_TYPES = [
    "boolean",
    "geo",
    "number",
    "string",
    "time",
]
# Other System's Type => Cube Type
# Keys are lowercased BigQuery type names with any "(...)" precision and
# "<...>" parameter suffixes already stripped (see Column.type normalization).
# See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
BIGQUERY_TYPE_MAPPINGS = {
    "array": "string",
    "bool": "boolean",
    "bytes": "string",
    "date": "time",
    "datetime": "time",
    "geography": "geo",
    "interval": "string",
    "json": "string",

    # numeric types
    "int64": "number",
    "int": "number",
    "smallint": "number",
    "integer": "number",
    "bigint": "number",
    "tinyint": "number",
    "byteint": "number",
    "numeric": "number",
    "decimal": "number",
    "bignumeric": "number",
    "bigdecimal": "number",
    "float64": "number",

    "range": "string",
    # string does not need to be mapped
    "struct": "string",
    # time does not need to be mapped
    "timestamp": "time",
}
# Snowflake type name (lowercased, parameter-stripped) => Cube dimension type.
# See https://docs.snowflake.com/en/sql-reference-data-types
SNOWFLAKE_TYPE_MAPPINGS = {
    # Numeric data types
    # number does not need to be mapped
    "decimal": "number",
    "dec": "number",
    "numeric": "number",
    "int": "number",
    "integer": "number",
    "bigint": "number",
    "smallint": "number",
    "tinyint": "number",
    "byteint": "number",
    "float": "number",
    "float4": "number",
    "float8": "number",
    "double": "number",
    "double precision": "number",
    "real": "number",

    # String & binary data types
    "varchar": "string",
    "char": "string",
    "character": "string",
    "nchar": "string",
    # string does not need to be mapped
    "text": "string",
    "nvarchar": "string",
    "nvarchar2": "string",
    "char varying": "string",
    "nchar varying": "string",
    "binary": "string",
    "varbinary": "string",

    # Logical data types
    # boolean does not need to be mapped

    # Date & time data types
    "date": "time",
    "datetime": "time",
    # time does not need to be mapped
    "timestamp": "time",
    "timestamp_ltz": "time",
    "timestamp_ntz": "time",
    "timestamp_tz": "time",

    # Semi-structured data types
    "variant": "string",
    "object": "string",
    "array": "string",

    # Geospatial data types
    "geography": "geo",
    "geometry": "string",

    # Vector data types
    "vector": "string",
}
# Redshift type name (lowercased, parameter-stripped) => Cube dimension type.
# See https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html
REDSHIFT_TYPE_MAPPINGS = {
    # Signed two-byte integer
    "smallint": "number",
    "int2": "number",

    # Signed four-byte integer
    "integer": "number",
    "int": "number",
    "int4": "number",

    # Signed eight-byte integer
    "bigint": "number",
    "int8": "number",

    # Exact numeric of selectable precision
    "decimal": "number",
    "numeric": "number",

    # Single precision floating-point number
    "real": "number",
    "float4": "number",

    # Double precision floating-point number
    "double precision": "number",
    "float8": "number",
    "float": "number",

    # Fixed-length character string
    "char": "string",
    "character": "string",
    "nchar": "string",
    "bpchar": "string",

    # Variable-length character string with a user-defined limit
    "varchar": "string",
    "character varying": "string",
    "nvarchar": "string",
    "text": "string",

    # Calendar date (year, month, day)
    "date": "time",

    # Time of day
    "time": "time",
    "time without time zone": "time",

    # Time of day with time zone
    "timetz": "time",
    "time with time zone": "time",

    # Date and time (without time zone)
    "timestamp": "time",
    "timestamp without time zone": "time",

    # Date and time (with time zone)
    "timestamptz": "time",
    "timestamp with time zone": "time",

    # Time duration in year to month order
    "interval year to month": "string",

    # Time duration in day to second order
    "interval day to second": "string",

    # Logical Boolean (true/false)
    # boolean does not need to be mapped
    "bool": "boolean",

    # Type used with HyperLogLog sketches
    "hllsketch": "string",

    # A superset data type that encompasses all scalar types of Amazon Redshift including complex types such as ARRAY and STRUCTS
    "super": "string",

    # Variable-length binary value
    "varbyte": "string",
    "varbinary": "string",
    "binary varying": "string",

    # Spatial data
    "geometry": "geo",
    "geography": "string",
}
# Combined type lookup across all supported warehouses. On duplicate keys the
# later unpacking wins, so precedence is Snowflake > Redshift > BigQuery.
# NOTE(review): this precedence makes Snowflake's "geometry" -> "string" and
# "geography" -> "geo" entries shadow Redshift's opposite mappings
# ("geometry" -> "geo", "geography" -> "string") — confirm this is intended.
TYPE_MAPPINGS = {
    **BIGQUERY_TYPE_MAPPINGS,
    **REDSHIFT_TYPE_MAPPINGS,
    **SNOWFLAKE_TYPE_MAPPINGS,
}


class Column:
def __init__(self, model_name: str, column_dict: dict) -> None:
self._model_name = model_name
Expand All @@ -25,29 +218,20 @@ def sql(self) -> str:
def type(self) -> str:
    """
    Return the Cube dimension type for this column, derived from dbt's
    ``data_type``.

    Returns 'string' when no data type is declared. Otherwise the
    warehouse type name is normalized — lowercased, with any "(...)"
    precision suffix (e.g. "numeric(38,0)") and "<...>" parameters
    (e.g. "array<struct<int64>>") stripped — and looked up in
    TYPE_MAPPINGS.

    Raises:
        RuntimeError: if the normalized type is not a valid Cube
            dimension type.
    """
    # No declared type: default to the most permissive dimension type.
    if 'data_type' not in self._column_dict or self._column_dict['data_type'] is None:
        return 'string'

    # Normalize the data_type value, downcasing it and removing extra
    # information: first "(...)" precision args, then "<...>" parameters.
    source_data_type = re.sub(
        r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
    )

    # Map warehouse-specific names to Cube types; names that already
    # match a Cube type (e.g. "string", "time") pass through unchanged.
    cube_data_type = TYPE_MAPPINGS.get(source_data_type, source_data_type)

    if cube_data_type not in VALID_DIMENSION_TYPES:
        raise RuntimeError(
            f"Unknown column type of {self._model_name}.{self.name}: "
            f"{self._column_dict['data_type']}"
        )

    return cube_data_type

@property
def meta(self) -> dict:
    """The column's `meta` dict from the dbt manifest entry.

    Raises KeyError if the backing dict has no 'meta' key.
    """
    return self._column_dict['meta']
Expand Down Expand Up @@ -78,4 +262,4 @@ def as_dimension(self) -> str:
For use in Jinja:
{{ dbt.model('name').column('name').as_dimension() }}
"""
return dump(self._as_dimension(), indent=8)
return dump(self._as_dimension(), indent=8)
52 changes: 51 additions & 1 deletion tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,56 @@ def test_known_type(self):
column = Column('model', column_dict)
assert column.type == 'number'

def test_known_type_but_uppercase(self):
    """Uppercase warehouse type names are downcased before mapping."""
    column = Column('model', {'data_type': 'STRING'})
    assert column.type == 'string'

def test_known_type_but_with_one_extra_info(self):
    """A single parenthesized precision argument is stripped before mapping."""
    column = Column('model', {'data_type': 'timestamp(3)'})
    assert column.type == 'time'

def test_known_type_but_with_two_extra_info(self):
    """Two parenthesized precision arguments are stripped before mapping."""
    column = Column('model', {'data_type': 'numeric(38,0)'})
    assert column.type == 'number'

def test_known_type_but_with_two_extra_info_of_different_types(self):
    """Mixed-type parenthesized arguments are stripped, and case is ignored."""
    column = Column('model', {'data_type': 'VECTOR(FLOAT, 256)'})
    assert column.type == 'string'

def test_known_bigquery_type_but_with_extra_info(self):
    """Nested angle-bracketed BigQuery parameters are stripped before mapping."""
    column = Column('model', {'data_type': 'ARRAY<STRUCT<ARRAY<INT64>>>'})
    assert column.type == 'string'

def test_as_dimension(self):
column_dict = {
'name': 'column',
Expand Down Expand Up @@ -69,4 +119,4 @@ def test_as_dimension_render(self):
assert column.as_dimension() == """name: column
sql: column
type: number
"""
"""