feat(ingest): redshift - Redshift rework #6906

Merged
merged 83 commits into from Apr 12, 2023

Changes from all commits (83 commits)
58ab3cf
Refactoring container creation
treff7es Dec 28, 2022
c809aa8
Adding back accidentally removed imports
treff7es Dec 28, 2022
fae7ec1
Adding back line breaks as well
treff7es Dec 28, 2022
a71f8dc
Merge branch 'master' into sql_common_refactor
treff7es Dec 28, 2022
cb219bd
Remove unneeded line
treff7es Dec 28, 2022
d3e6407
Black formatting
treff7es Dec 28, 2022
93df8fc
Isorting
treff7es Dec 28, 2022
cf074f0
Fixing return types
treff7es Dec 28, 2022
42004e5
Black formatting
treff7es Dec 28, 2022
8fcdb7a
Add option to set container name differently than the key name
treff7es Dec 28, 2022
e985a20
Fix snowflake container generation
treff7es Dec 29, 2022
9af84c9
Fixing 2 tier container generation
treff7es Dec 29, 2022
be580ee
Merge branch 'master' into sql_common_refactor
treff7es Dec 29, 2022
a626a9e
isorting
treff7es Dec 29, 2022
ade9fa2
Removing unused import
treff7es Dec 29, 2022
0da0c55
Fixing presto container generation
treff7es Dec 29, 2022
2268cad
Athena inherits from two tier db
treff7es Dec 29, 2022
480497f
Reverting Athena changes
treff7es Dec 29, 2022
e017f44
Merge branch 'master' into sql_common_refactor
treff7es Dec 29, 2022
d6c1b7d
Merge branch 'master' into sql_common_refactor
treff7es Dec 30, 2022
defaff6
Initial commit
treff7es Nov 24, 2022
1f825e4
Extracting out common methods, fixign mypy issues
treff7es Nov 25, 2022
b522948
- Adding connection test
treff7es Nov 25, 2022
0b56fbc
Redshift rework
treff7es Dec 15, 2022
7b5b420
Fixing issues after merge
treff7es Dec 30, 2022
d3cb087
Fixing accidental change
treff7es Dec 30, 2022
df55103
Flake, black, mypy fixes
treff7es Dec 30, 2022
2945d27
Adding fixes for Redshift source
treff7es Jan 9, 2023
c4106f4
Merge branch 'master' into redshift-rework
treff7es Jan 24, 2023
aecafda
Resolving merge conflicts
treff7es Jan 24, 2023
7753ffb
Fixing wrap_aspect_as_workunit
treff7es Jan 24, 2023
ceb0b37
Merge branch 'master' into redshift-rework
treff7es Jan 24, 2023
12625b3
Fixing usage test
treff7es Jan 25, 2023
442c8e0
Fixing linter error
treff7es Jan 25, 2023
9535790
Adding missing test file
treff7es Jan 25, 2023
4332e07
various fixes
treff7es Jan 26, 2023
dae7284
Adding unload lineage code as well
treff7es Jan 26, 2023
5b6881b
Merge branch 'master' into redshift-rework
treff7es Jan 26, 2023
4efc105
Adding quoting to patch path
treff7es Jan 26, 2023
22a1f6a
Merge branch 'master' into redshift-rework
treff7es Jan 31, 2023
d29dd53
PR review fixes
treff7es Jan 31, 2023
920648f
Adding support for complex types in Redshift
treff7es Feb 6, 2023
da73a74
Renaming non-binding lineage parser to generic view ddl sql parser
treff7es Feb 6, 2023
9264bfa
Merge branch 'master' into redshift-rework
treff7es Feb 6, 2023
c0d4461
Updating docs
treff7es Feb 7, 2023
99ae34f
Merge branch 'master' into redshift-rework
treff7es Feb 28, 2023
7234509
Fixing build issues
treff7es Feb 28, 2023
26fe103
Addressing some pr review comments
treff7es Feb 28, 2023
54556a0
Addressing even more pr review comments
treff7es Feb 28, 2023
a497a25
Additional pr review fixes
treff7es Feb 28, 2023
c2fa727
Additional pr review fixes
treff7es Feb 28, 2023
a44b598
Merge branch 'master' into redshift-rework
treff7es Mar 1, 2023
df05369
Addressing more pr review comment
treff7es Mar 9, 2023
b3cd9a6
Merge branch 'master' into redshift-rework
treff7es Mar 14, 2023
5b304fa
Mergin master
treff7es Mar 14, 2023
281b78d
Merge branch 'master' into redshift-rework
treff7es Mar 14, 2023
ea0a6cf
Removing redshift-beta
treff7es Mar 16, 2023
d41f776
Merge branch 'master' into redshift-rework
treff7es Mar 16, 2023
82160ed
Fixing linter error
treff7es Mar 16, 2023
3e7409a
Merge branch 'master' into redshift-rework
treff7es Mar 20, 2023
5cc4be1
Merge branch 'master' into redshift-rework
jjoyce0510 Mar 20, 2023
298f0d9
Merge branch 'master' into redshift-rework
treff7es Mar 21, 2023
bce47ac
Merge branch 'master' into redshift-rework
treff7es Mar 21, 2023
7f67047
Addressing pr review comments
treff7es Mar 21, 2023
6c9585c
fixing import order
treff7es Mar 21, 2023
5d98eb9
Merge branch 'master' into redshift-rework
treff7es Mar 21, 2023
6c3cf64
Updating dbt golden files
treff7es Mar 23, 2023
ea78f9f
Merge branch 'master' into redshift-rework
treff7es Mar 23, 2023
903e0a7
update to redshift-usage-legacy
hsheth2 Mar 23, 2023
d0e2d42
Merge branch 'master' into redshift-rework
hsheth2 Mar 23, 2023
975274b
update dbt golden files
hsheth2 Mar 23, 2023
7f9bffb
Fixing usgage-legacy
treff7es Mar 23, 2023
aa4c6d4
Recovering legacy redshift-usage
treff7es Mar 24, 2023
3968cf5
Merge branch 'master' into redshift-rework
treff7es Mar 24, 2023
af2d718
fixing lineage query
treff7es Mar 27, 2023
1eb15d1
Merge branch 'master' into redshift-rework
treff7es Mar 27, 2023
c778093
Merge branch 'master' into redshift-rework
hsheth2 Apr 7, 2023
5539139
Merge branch 'master' into redshift-rework
treff7es Apr 11, 2023
88bb936
Fixing linter error
treff7es Apr 11, 2023
25fce12
Fixing linter issues
treff7es Apr 11, 2023
bc9480f
Merge branch 'master' into redshift-rework
treff7es Apr 11, 2023
99dbd43
fixing bugbear linter issue
treff7es Apr 11, 2023
06cdcd4
Merge branch 'master' into redshift-rework
treff7es Apr 12, 2023
16 changes: 10 additions & 6 deletions docs/how/updating-datahub.md
@@ -5,10 +5,10 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
## Next

### Breaking Changes
- #7016 Add `add_database_name_to_urn` flag to the Oracle source, which ensures that Dataset urns have the DB name as a prefix to prevent collisions (e.g. {database}.{schema}.{table}). ONLY breaking if you set this flag to true; otherwise behavior remains the same.
- The Airflow plugin no longer includes the DataHub Kafka emitter by default. Use `pip install acryl-datahub-airflow-plugin[datahub-kafka]` for Kafka support.
- The Airflow lineage backend no longer includes the DataHub Kafka emitter by default. Use `pip install acryl-datahub[airflow,datahub-kafka]` for Kafka support.

### Potential Downtime

@@ -21,7 +21,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Breaking Changes

- #7103 This should only impact users who have configured explicit non-default names for DataHub's Kafka topics. The environment variables used to configure Kafka topics for DataHub in the `kafka-setup` docker image have been updated to be in line with other DataHub components; for more info see our docs on [Configuring Kafka in DataHub](https://datahubproject.io/docs/how/kafka-config). They were previously suffixed with `_TOPIC`, whereas now the correct suffix is `_TOPIC_NAME`. This change should not affect any user who is using default Kafka names.
- #6906 The Redshift source has been reworked and now also includes usage capabilities. The old Redshift source was renamed to `redshift-legacy`. The `redshift-usage` source has also been renamed to `redshift-usage-legacy` and will be removed in the future.

### Potential Downtime

@@ -45,9 +46,11 @@ Helm with `--atomic`: In general, it is recommended to not use the `--atomic` se
### Potential Downtime

### Deprecations

- #6851 - Sources bigquery-legacy and bigquery-usage-legacy have been removed.

### Other notable Changes

- If anyone faces issues with login, please clear your cookies. This release includes security updates that may cause login issues until cookies are cleared.

## 0.9.4 / 0.9.5
@@ -151,7 +154,7 @@ Helm with `--atomic`: In general, it is recommended to not use the `--atomic` se
### Breaking Changes

- Browse Paths have been upgraded to a new format to align more closely with the intention of the feature.
Learn more about the changes, including steps on upgrading, here: <https://datahubproject.io/docs/advanced/browse-paths-upgrade>
- The dbt ingestion source's `disable_dbt_node_creation` and `load_schema` options have been removed. They were no longer necessary due to the recently added sibling entities functionality.
- The `snowflake` source now uses newer faster implementation (earlier `snowflake-beta`). Config properties `provision_role` and `check_role_grants` are not supported. Older `snowflake` and `snowflake-usage` are available as `snowflake-legacy` and `snowflake-usage-legacy` sources respectively.

@@ -290,4 +293,5 @@ Helm with `--atomic`: In general, it is recommended to not use the `--atomic` se
- #4644 `host_port` option of `snowflake` and `snowflake-usage` sources deprecated as the name was confusing. Use `account_id` option instead.

### Other notable Changes

- #4760 `check_role_grants` option was added in `snowflake` to disable checking roles in `snowflake` as some people were reporting long run times when checking roles.
1 change: 0 additions & 1 deletion metadata-ingestion/docs/sources/redshift/README.md
@@ -1 +0,0 @@
To get all metadata from Redshift you need to use two plugins `redshift` and `redshift-usage`. Both of them are described in this page. These will require 2 separate recipes. We understand this is not ideal and we plan to make this easier in the future.
14 changes: 0 additions & 14 deletions metadata-ingestion/docs/sources/redshift/redshift-usage_recipe.yml

This file was deleted.

16 changes: 12 additions & 4 deletions metadata-ingestion/docs/sources/redshift/redshift_recipe.yml
@@ -1,4 +1,3 @@
source:
type: redshift
config:
# Coordinates
@@ -13,10 +12,19 @@ source:
options:
# driver_option: some-option

include_views: True # whether to include views, defaults to True
include_tables: True # whether to include tables, defaults to True
include_table_lineage: true
include_usage_statistics: true
# The following options are only used when include_usage_statistics is true.
# The email domain is appended to the Redshift username extracted from the
# Redshift audit history, in the format username@email_domain
email_domain: mydomain.com

profiling:
enabled: true
# Only collect table level profiling information
profile_table_level_only: true

sink:
# sink configs

#------------------------------------------------------------------------------
Expand Down
13 changes: 8 additions & 5 deletions metadata-ingestion/setup.py
@@ -337,8 +337,9 @@ def get_long_description():
| {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata", sqllineage_lib},
"redshift": sql_common | redshift_common,
"redshift-usage": sql_common | usage_common | redshift_common,
"redshift": sql_common | redshift_common | usage_common | {"redshift-connector"},
"redshift-legacy": sql_common | redshift_common,
"redshift-usage-legacy": sql_common | usage_common | redshift_common,
"s3": {*s3_base, *data_lake_profiling},
"sagemaker": aws_common,
"salesforce": {"simple-salesforce"},
@@ -452,7 +453,8 @@ def get_long_description():
"presto",
"redash",
"redshift",
"redshift-usage",
"redshift-legacy",
"redshift-usage-legacy",
"s3",
"snowflake",
"tableau",
@@ -541,8 +543,9 @@ def get_long_description():
"oracle = datahub.ingestion.source.sql.oracle:OracleSource",
"postgres = datahub.ingestion.source.sql.postgres:PostgresSource",
"redash = datahub.ingestion.source.redash:RedashSource",
"redshift = datahub.ingestion.source.sql.redshift:RedshiftSource",
"redshift-usage = datahub.ingestion.source.usage.redshift_usage:RedshiftUsageSource",
"redshift = datahub.ingestion.source.redshift.redshift:RedshiftSource",
"redshift-legacy = datahub.ingestion.source.sql.redshift:RedshiftSource",
"redshift-usage-legacy = datahub.ingestion.source.usage.redshift_usage:RedshiftUsageSource",
"snowflake = datahub.ingestion.source.snowflake.snowflake_v2:SnowflakeV2Source",
"superset = datahub.ingestion.source.superset:SupersetSource",
"tableau = datahub.ingestion.source.tableau:TableauSource",
Empty file.
12 changes: 12 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/redshift/common.py
@@ -0,0 +1,12 @@
from datahub.ingestion.source.redshift.config import RedshiftConfig

redshift_datetime_format = "%Y-%m-%d %H:%M:%S"


def get_db_name(config: RedshiftConfig) -> str:
db_name = config.database
db_alias = config.database_alias

db_name = db_alias or db_name
assert db_name is not None, "database name or alias must be specified"
return db_name
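The precedence implemented by `get_db_name` (the alias, when set, wins over the database name) can be sketched with a minimal stand-in config. The `FakeRedshiftConfig` dataclass below is illustrative only and not part of this PR:

```python
from dataclasses import dataclass
from typing import Optional


# Hypothetical stand-in for RedshiftConfig, just for illustration.
@dataclass
class FakeRedshiftConfig:
    database: Optional[str] = None
    database_alias: Optional[str] = None


def get_db_name(config: FakeRedshiftConfig) -> str:
    # database_alias, when set, takes precedence over database.
    db_name = config.database_alias or config.database
    assert db_name is not None, "database name or alias must be specified"
    return db_name
```

With `database="dev"` and `database_alias="prod"` this returns `"prod"`; with only `database` set it falls back to `"dev"`.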
139 changes: 139 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/redshift/config.py
@@ -0,0 +1,139 @@
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import root_validator
from pydantic.fields import Field

from datahub.configuration import ConfigModel
from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated
from datahub.configuration.source_common import DatasetLineageProviderConfigBase
from datahub.ingestion.source.aws.path_spec import PathSpec
from datahub.ingestion.source.sql.postgres import PostgresConfig
from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulLineageConfigMixin,
StatefulProfilingConfigMixin,
StatefulUsageConfigMixin,
)
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig


# The lineage modes are documented in the Redshift source's docstring.
class LineageMode(Enum):
SQL_BASED = "sql_based"
STL_SCAN_BASED = "stl_scan_based"
MIXED = "mixed"


class S3LineageProviderConfig(ConfigModel):
"""
Any source that produces s3 lineage from/to Datasets should inherit this class.
"""

path_specs: List[PathSpec] = Field(
default=[],
description="List of PathSpec. See below the details about PathSpec",
)

strip_urls: bool = Field(
default=True,
description="Strip filename from s3 url. It only applies if path_specs are not specified.",
)


class S3DatasetLineageProviderConfigBase(ConfigModel):
"""
Any source that produces s3 lineage from/to Datasets should inherit this class.
This is needed to group all lineage-related configs under the `s3_lineage_config` config property.
"""

s3_lineage_config: S3LineageProviderConfig = Field(
default=S3LineageProviderConfig(),
description="Common config for S3 lineage generation",
)


class RedshiftUsageConfig(BaseUsageConfig, StatefulUsageConfigMixin):
email_domain: Optional[str] = Field(
default=None,
description="Email domain of your organisation so users can be displayed on UI appropriately.",
)


class RedshiftConfig(
PostgresConfig,
DatasetLineageProviderConfigBase,
S3DatasetLineageProviderConfigBase,
RedshiftUsageConfig,
StatefulLineageConfigMixin,
StatefulProfilingConfigMixin,
):
database: str = Field(default="dev", description="database")

# Although Amazon Redshift is compatible with Postgres's wire format,
# we actually want to use the sqlalchemy-redshift package and dialect
# because it has better caching behavior. In particular, it queries
# the full table, column, and constraint information in a single larger
# query, and then simply pulls out the relevant information as needed.
# Because of this behavior, it uses dramatically fewer round trips for
# large Redshift warehouses. As an example, see this query for the columns:
# https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745.
scheme = Field(
default="redshift+psycopg2",
description="",
hidden_from_schema=True,
)

_database_alias_deprecation = pydantic_field_deprecated(
"database_alias",
message="database_alias is deprecated. Use platform_instance instead.",
)

default_schema: str = Field(
default="public",
description="The default schema to use if the sql parser fails to parse the schema with the `sql_based` lineage collector",
)

include_table_lineage: Optional[bool] = Field(
default=True, description="Whether table lineage should be ingested."
)
include_copy_lineage: Optional[bool] = Field(
default=True,
description="Whether lineage should be collected from copy commands",
)

include_usage_statistics: bool = Field(
default=False,
description="Generate usage statistics. The email_domain config parameter needs to be set if this is enabled.",
)

include_unload_lineage: Optional[bool] = Field(
default=True,
description="Whether lineage should be collected from unload commands",
)

capture_lineage_query_parser_failures: Optional[bool] = Field(
hidden_from_schema=True,
default=False,
description="Whether to capture lineage query parser errors with dataset properties for debugging",
)

table_lineage_mode: Optional[LineageMode] = Field(
default=LineageMode.STL_SCAN_BASED,
description="Which table lineage collector mode to use. Available modes are: [stl_scan_based, sql_based, mixed]",
)
extra_client_options: Dict[str, Any] = {}

@root_validator(pre=True)
def check_email_is_set_on_usage(cls, values):
if values.get("include_usage_statistics"):
assert (
"email_domain" in values and values["email_domain"]
), "email_domain needs to be set if usage is enabled"
return values

@root_validator()
def check_database_or_database_alias_set(cls, values):
assert values.get("database") or values.get(
"database_alias"
), "either database or database_alias must be set"
return values
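The logic of the `check_email_is_set_on_usage` validator can be exercised outside pydantic with a plain-dict sketch. This is hypothetical illustration code, not the PR's code; the real check runs as a `root_validator(pre=True)` on `RedshiftConfig`:

```python
from typing import Any, Dict


def check_email_is_set_on_usage(values: Dict[str, Any]) -> Dict[str, Any]:
    # Mirrors the root validator: usage extraction builds user identities
    # as username@email_domain, so email_domain must be set whenever
    # usage statistics are enabled.
    if values.get("include_usage_statistics"):
        assert values.get(
            "email_domain"
        ), "email_domain needs to be set if usage is enabled"
    return values
```

A config with `include_usage_statistics: true` and no `email_domain` fails this check, while any config with usage disabled passes unchanged.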