Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 203 additions & 19 deletions src/cube_dbt/column.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,198 @@
import re

from cube_dbt.dump import dump

# As of 2024-10-17, the valid "Dimension Types" listed on
# https://cube.dev/docs/reference/data-model/types-and-formats#dimension-types
# are: time, string, number, boolean, and geo
# Used as the final validation gate in Column.type: any mapped type not in
# this list raises a RuntimeError.
VALID_DIMENSION_TYPES = [
    "boolean",
    "geo",
    "number",
    "string",
    "time",
]
# Other System's Type => Cube Type
# Keys are lowercased BigQuery type names with any "(...)" precision and
# "<...>" parameter suffixes already stripped (see Column.type normalization).
# See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
BIGQUERY_TYPE_MAPPINGS = {
    "array": "string",
    "bool": "boolean",
    "bytes": "string",
    "date": "time",
    "datetime": "time",
    "geography": "geo",
    "interval": "string",
    "json": "string",

    # numeric types
    "int64": "number",
    "int": "number",
    "smallint": "number",
    "integer": "number",
    "bigint": "number",
    "tinyint": "number",
    "byteint": "number",
    "numeric": "number",
    "decimal": "number",
    "bignumeric": "number",
    "bigdecimal": "number",
    "float64": "number",

    "range": "string",
    # string does not need to be mapped
    "struct": "string",
    # time does not need to be mapped
    "timestamp": "time",
}
# Snowflake type name (lowercased, parameter-stripped) => Cube dimension type.
# See https://docs.snowflake.com/en/sql-reference-data-types
SNOWFLAKE_TYPE_MAPPINGS = {
    # Numeric data types
    # number does not need to be mapped
    "decimal": "number",
    "dec": "number",
    "numeric": "number",
    "int": "number",
    "integer": "number",
    "bigint": "number",
    "smallint": "number",
    "tinyint": "number",
    "byteint": "number",
    "float": "number",
    "float4": "number",
    "float8": "number",
    "double": "number",
    "double precision": "number",
    "real": "number",

    # String & binary data types
    "varchar": "string",
    "char": "string",
    "character": "string",
    "nchar": "string",
    # string does not need to be mapped
    "text": "string",
    "nvarchar": "string",
    "nvarchar2": "string",
    "char varying": "string",
    "nchar varying": "string",
    "binary": "string",
    "varbinary": "string",

    # Logical data types
    # boolean does not need to be mapped

    # Date & time data types
    "date": "time",
    "datetime": "time",
    # time does not need to be mapped
    "timestamp": "time",
    "timestamp_ltz": "time",
    "timestamp_ntz": "time",
    "timestamp_tz": "time",

    # Semi-structured data types
    "variant": "string",
    "object": "string",
    "array": "string",

    # Geospatial data types
    "geography": "geo",
    "geometry": "string",

    # Vector data types
    "vector": "string",
}
# Redshift type name (lowercased, parameter-stripped) => Cube dimension type.
# See https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html
REDSHIFT_TYPE_MAPPINGS = {
    # Signed two-byte integer
    "smallint": "number",
    "int2": "number",

    # Signed four-byte integer
    "integer": "number",
    "int": "number",
    "int4": "number",

    # Signed eight-byte integer
    "bigint": "number",
    "int8": "number",

    # Exact numeric of selectable precision
    "decimal": "number",
    "numeric": "number",

    # Single precision floating-point number
    "real": "number",
    "float4": "number",

    # Double precision floating-point number
    "double precision": "number",
    "float8": "number",
    "float": "number",

    # Fixed-length character string
    "char": "string",
    "character": "string",
    "nchar": "string",
    "bpchar": "string",

    # Variable-length character string with a user-defined limit
    "varchar": "string",
    "character varying": "string",
    "nvarchar": "string",
    "text": "string",

    # Calendar date (year, month, day)
    "date": "time",

    # Time of day
    "time": "time",
    "time without time zone": "time",

    # Time of day with time zone
    "timetz": "time",
    "time with time zone": "time",

    # Date and time (without time zone)
    "timestamp": "time",
    "timestamp without time zone": "time",

    # Date and time (with time zone)
    "timestamptz": "time",
    "timestamp with time zone": "time",

    # Time duration in year to month order
    "interval year to month": "string",

    # Time duration in day to second order
    "interval day to second": "string",

    # Logical Boolean (true/false)
    # boolean does not need to be mapped
    "bool": "boolean",

    # Type used with HyperLogLog sketches
    "hllsketch": "string",

    # A superset data type that encompasses all scalar types of Amazon Redshift including complex types such as ARRAY and STRUCTS
    "super": "string",

    # Variable-length binary value
    "varbyte": "string",
    "varbinary": "string",
    "binary varying": "string",

    # Spatial data
    "geometry": "geo",
    "geography": "string",
}
# Combined type lookup across all supported warehouses. On duplicate keys the
# later unpacking wins, so precedence is Snowflake > Redshift > BigQuery.
# NOTE(review): this precedence makes Snowflake's "geometry" -> "string" and
# "geography" -> "geo" entries shadow Redshift's opposite mappings
# ("geometry" -> "geo", "geography" -> "string") — confirm this is intended.
TYPE_MAPPINGS = {
    **BIGQUERY_TYPE_MAPPINGS,
    **REDSHIFT_TYPE_MAPPINGS,
    **SNOWFLAKE_TYPE_MAPPINGS,
}


class Column:
def __init__(self, model_name: str, column_dict: dict) -> None:
self._model_name = model_name
Expand All @@ -25,29 +218,20 @@ def sql(self) -> str:
def type(self) -> str:
    """
    Return the Cube dimension type for this column, derived from dbt's
    ``data_type``.

    Returns 'string' when no data type is declared. Otherwise the
    warehouse type name is normalized — lowercased, with any "(...)"
    precision suffix (e.g. "numeric(38,0)") and "<...>" parameters
    (e.g. "array<struct<int64>>") stripped — and looked up in
    TYPE_MAPPINGS.

    Raises:
        RuntimeError: if the normalized type is not a valid Cube
            dimension type.
    """
    # No declared type: default to the most permissive dimension type.
    if 'data_type' not in self._column_dict or self._column_dict['data_type'] is None:
        return 'string'

    # Normalize the data_type value, downcasing it and removing extra
    # information: first "(...)" precision args, then "<...>" parameters.
    source_data_type = re.sub(
        r"<.*>", "", re.sub(r"\([^\)]*\)", "", self._column_dict["data_type"].lower())
    )

    # Map warehouse-specific names to Cube types; names that already
    # match a Cube type (e.g. "string", "time") pass through unchanged.
    cube_data_type = TYPE_MAPPINGS.get(source_data_type, source_data_type)

    if cube_data_type not in VALID_DIMENSION_TYPES:
        raise RuntimeError(
            f"Unknown column type of {self._model_name}.{self.name}: "
            f"{self._column_dict['data_type']}"
        )

    return cube_data_type

@property
def meta(self) -> dict:
    """The column's `meta` dict from the dbt manifest entry.

    Raises KeyError if the backing dict has no 'meta' key.
    """
    return self._column_dict['meta']
Expand Down Expand Up @@ -78,4 +262,4 @@ def as_dimension(self) -> str:
For use in Jinja:
{{ dbt.model('name').column('name').as_dimension() }}
"""
return dump(self._as_dimension(), indent=8)
return dump(self._as_dimension(), indent=8)
52 changes: 51 additions & 1 deletion tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,56 @@ def test_known_type(self):
column = Column('model', column_dict)
assert column.type == 'number'

def test_known_type_but_uppercase(self):
    """Uppercase warehouse type names are downcased before mapping."""
    column = Column('model', {'data_type': 'STRING'})
    assert column.type == 'string'

def test_known_type_but_with_one_extra_info(self):
    """A single parenthesized precision argument is stripped before mapping."""
    column = Column('model', {'data_type': 'timestamp(3)'})
    assert column.type == 'time'

def test_known_type_but_with_two_extra_info(self):
    """Two parenthesized precision arguments are stripped before mapping."""
    column = Column('model', {'data_type': 'numeric(38,0)'})
    assert column.type == 'number'

def test_known_type_but_with_two_extra_info_of_different_types(self):
    """Mixed-type parenthesized arguments are stripped, and case is ignored."""
    column = Column('model', {'data_type': 'VECTOR(FLOAT, 256)'})
    assert column.type == 'string'

def test_known_bigquery_type_but_with_extra_info(self):
    """Nested angle-bracketed BigQuery parameters are stripped before mapping."""
    column = Column('model', {'data_type': 'ARRAY<STRUCT<ARRAY<INT64>>>'})
    assert column.type == 'string'

def test_as_dimension(self):
column_dict = {
'name': 'column',
Expand Down Expand Up @@ -69,4 +119,4 @@ def test_as_dimension_render(self):
assert column.as_dimension() == """name: column
sql: column
type: number
"""
"""