fix(ingest): remove get_platform_instance_id from stateful ingestion #7572

Merged · 2 commits · Mar 21, 2023
Changes from 1 commit
17 changes: 1 addition & 16 deletions metadata-ingestion/src/datahub/cli/state_cli.py
@@ -1,6 +1,5 @@
import json
import logging
from typing import Optional

import click
from click_default_group import DefaultGroup
@@ -29,34 +28,20 @@ def state() -> None:
@state.command()
@click.option("--pipeline-name", required=True, type=str)
@click.option("--platform", required=True, type=str)
@click.option("--platform-instance", required=False, type=str)
@upgrade.check_upgrade
@telemetry.with_telemetry()
def inspect(
pipeline_name: str, platform: str, platform_instance: Optional[str]
) -> None:
def inspect(pipeline_name: str, platform: str) -> None:
"""
Get the latest stateful ingestion state for a given pipeline.
Only works for state entity removal for now.
"""

# Note that the platform-instance argument is not generated consistently,
# and is not always equal to the platform_instance config.

datahub_graph = get_default_graph()
checkpoint_provider = DatahubIngestionCheckpointingProvider(datahub_graph, "cli")

job_name = StaleEntityRemovalHandler.compute_job_id(platform)

raw_checkpoint = checkpoint_provider.get_latest_checkpoint(pipeline_name, job_name)
if raw_checkpoint is None and platform_instance is not None:
logger.info(
"Failed to fetch state, but trying legacy URN format because platform_instance is provided."
)
raw_checkpoint = checkpoint_provider.get_latest_checkpoint(
pipeline_name, job_name, platform_instance_id=platform_instance
)

if not raw_checkpoint:
click.secho("No ingestion state found.", fg="red")
exit(1)
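For context, this is roughly what the simplified command now does programmatically (presumably invoked as `datahub state inspect --pipeline-name my_pipeline --platform snowflake`). A minimal sketch — the import paths are assumed from the repo layout, and the pipeline and platform values are hypothetical:

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
)
from datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider import (
    DatahubIngestionCheckpointingProvider,
)

# Connect using the default CLI configuration (assumed: ~/.datahubenv).
graph = get_default_graph()
provider = DatahubIngestionCheckpointingProvider(graph, "cli")

# The job id is derived from the platform alone; platform_instance no
# longer participates in the checkpoint lookup.
job_name = StaleEntityRemovalHandler.compute_job_id("snowflake")

checkpoint = provider.get_latest_checkpoint(
    pipeline_name="my_pipeline",  # hypothetical pipeline name
    job_name=job_name,
)
if checkpoint is None:
    print("No ingestion state found.")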
@@ -1,6 +1,6 @@
from abc import abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, NewType, Type, TypeVar
from typing import Any, Dict, NewType, Optional, Type, TypeVar

import datahub.emitter.mce_builder as builder
from datahub.configuration.common import ConfigModel
@@ -43,6 +43,14 @@ def create(
def commit(self) -> None:
pass

@abstractmethod
def get_latest_checkpoint(
self,
pipeline_name: str,
job_name: JobId,
) -> Optional[DatahubIngestionCheckpointClass]:
pass

@staticmethod
def get_data_job_urn(
orchestrator: str,
@@ -53,14 +61,3 @@ def get_data_job_urn(
Standardizes datajob urn minting for all ingestion job state providers.
"""
return builder.make_data_job_urn(orchestrator, pipeline_name, job_name)

@staticmethod
def get_data_job_legacy_urn(
orchestrator: str,
pipeline_name: str,
job_name: JobId,
platform_instance_id: str,
) -> str:
return IngestionCheckpointingProviderBase.get_data_job_urn(
orchestrator, f"{pipeline_name}_{platform_instance_id}", job_name
)
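To make the breaking change concrete: the legacy helper removed above minted a different data-job URN for the same logical job, so checkpoints written under the old format can no longer be found once the fallback lookup is removed. A minimal sketch with hypothetical values; the URN shapes assume the standard `make_data_job_urn` output:

import datahub.emitter.mce_builder as builder

# Current format: the data-flow id is just the pipeline name.
builder.make_data_job_urn("cli", "my_pipeline", "my_job")
# -> urn:li:dataJob:(urn:li:dataFlow:(cli,my_pipeline,prod),my_job)

# Legacy format folded platform_instance_id into the pipeline name,
# yielding a different (now-orphaned) URN for the same pipeline/job pair.
builder.make_data_job_urn("cli", "my_pipeline_my_instance", "my_job")
# -> urn:li:dataJob:(urn:li:dataFlow:(cli,my_pipeline_my_instance,prod),my_job)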
3 changes: 0 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -1240,6 +1240,3 @@ def get_data_platform_instance() -> DataPlatformInstanceClass:

def get_report(self):
return self.report

def get_platform_instance_id(self) -> Optional[str]:
return self.source_config.platform_instance or self.platform
@@ -433,13 +433,6 @@ def get_dataplatform_instance_aspect(
else:
return None

def get_platform_instance_id(self) -> Optional[str]:
"""
The source identifier such as the specific source host address required for stateful ingestion.
Individual subclasses need to override this method appropriately.
"""
return f"{self.platform}"

def gen_dataset_key(self, db_name: str, schema: str) -> PlatformKey:
return BigQueryDatasetKey(
project_id=db_name,
@@ -415,8 +415,3 @@ def _parse_into_dbt_column(self, column: Dict) -> DBTColumn:
def get_external_url(self, node: DBTNode) -> Optional[str]:
# TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
return f"https://cloud.getdbt.com/next/accounts/{self.config.account_id}/projects/{self.config.project_id}/develop"

def get_platform_instance_id(self) -> Optional[str]:
"""The DBT project identifier is used as platform instance."""

return f"{self.platform}_{self.config.project_id}"
13 changes: 0 additions & 13 deletions metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py
@@ -482,16 +482,3 @@ def get_external_url(self, node: DBTNode) -> Optional[str]:
if self.config.git_info and node.dbt_file_path:
return self.config.git_info.get_url_for_file_path(node.dbt_file_path)
return None

def get_platform_instance_id(self) -> Optional[str]:
"""The DBT project identifier is used as platform instance."""

project_id = (
self.load_file_as_json(self.config.manifest_path)
.get("metadata", {})
.get("project_id")
)
if project_id is None:
raise ValueError("DBT project identifier is not found in manifest")

return f"{self.platform}_{project_id}"
@@ -318,9 +318,6 @@ def _get_avro_schema_from_data_type(self, column: NestedField) -> Dict[str, Any]
],
}

def get_platform_instance_id(self) -> Optional[str]:
return self.config.platform_instance

def get_report(self) -> SourceReport:
return self.report

3 changes: 0 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/kafka.py
@@ -187,9 +187,6 @@ def init_kafka_admin_client(self) -> None:
f"Failed to create Kafka Admin Client due to error {e}.",
)

def get_platform_instance_id(self) -> Optional[str]:
return self.source_config.platform_instance

@classmethod
def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource":
config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict)
7 changes: 0 additions & 7 deletions metadata-ingestion/src/datahub/ingestion/source/ldap.py
@@ -288,13 +288,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

cookie = set_cookie(self.lc, pctrls)

def get_platform_instance_id(self) -> Optional[str]:
"""
The source identifier such as the specific source host address required for stateful ingestion.
Individual subclasses need to override this method appropriately.
"""
return self.config.ldap_server

def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
"""
Handle a DN and attributes by adding manager info and constructing a
@@ -1357,8 +1357,5 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
def get_report(self) -> SourceReport:
return self.reporter

def get_platform_instance_id(self) -> Optional[str]:
return self.source_config.platform_instance or self.platform

def close(self):
self.prepare_for_commit()
@@ -1778,8 +1778,5 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
def get_report(self):
return self.reporter

def get_platform_instance_id(self) -> Optional[str]:
return self.source_config.platform_instance or self.platform

def close(self):
self.prepare_for_commit()
@@ -917,9 +917,6 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext):
run_id=ctx.run_id,
)

def get_platform_instance_id(self) -> Optional[str]:
return self.source_config.platform_name

@classmethod
def create(cls, config_dict, ctx):
config = PowerBiDashboardSourceConfig.parse_obj(config_dict)
3 changes: 0 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/pulsar.py
@@ -224,9 +224,6 @@ def _get_pulsar_metadata(self, url):
f"An ambiguous exception occurred while handling the request: {e}"
)

def get_platform_instance_id(self) -> Optional[str]:
return self.config.platform_instance

@classmethod
def create(cls, config_dict, ctx):
config = PulsarSourceConfig.parse_obj(config_dict)
@@ -1403,10 +1403,6 @@ def inspect_session_metadata(self) -> None:
except Exception:
self.report.edition = None

# Stateful Ingestion Overrides.
def get_platform_instance_id(self) -> Optional[str]:
return self.config.get_account()

# Ideally we do not want null values in sample data for a column.
# However that would require separate query per column and
# that would be expensive, hence not done.
10 changes: 0 additions & 10 deletions metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -392,16 +392,6 @@ def get_db_name(self, inspector: Inspector) -> str:
def get_schema_names(self, inspector):
return inspector.get_schema_names()

def get_platform_instance_id(self) -> Optional[str]:
"""
The source identifier such as the specific source host address required for stateful ingestion.
Individual subclasses need to override this method appropriately.
"""
config_dict = self.config.dict()
host_port = config_dict.get("host_port", "no_host_port")
database = config_dict.get("database", "no_database")
return f"{self.platform}_{host_port}_{database}"

def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
# this function returns the schema names which are filtered by schema_pattern.
for schema in self.get_schema_names(inspector):
@@ -166,7 +166,9 @@ class StatefulIngestionSourceBase(Source):
"""

def __init__(
self, config: StatefulIngestionConfigBase, ctx: PipelineContext
self,
config: StatefulIngestionConfigBase[StatefulIngestionConfig],
ctx: PipelineContext,
) -> None:
super().__init__(ctx)
self.stateful_ingestion_config = config.stateful_ingestion
@@ -278,12 +280,6 @@ def is_checkpointing_enabled(self, job_id: JobId) -> bool:
raise ValueError(f"No use-case handler for job_id{job_id}")
return self._usecase_handlers[job_id].is_checkpointing_enabled()

def get_platform_instance_id(self) -> Optional[str]:
# This method is retained for backwards compatibility, but it is not
# required that new sources implement it. We mainly need it for the
# fallback logic in _get_last_checkpoint.
raise NotImplementedError("no platform_instance_id configured")

def _get_last_checkpoint(
self, job_id: JobId, checkpoint_state_class: Type[StateType]
) -> Optional[Checkpoint]:
@@ -292,28 +288,15 @@ def _get_last_checkpoint(
"""
last_checkpoint: Optional[Checkpoint] = None
if self.is_stateful_ingestion_configured():
# TRICKY: We currently don't include the platform_instance_id in the
# checkpoint urn, but we previously did. As such, we need to fallback
# and try the old urn format if the new format doesn't return anything.

# Obtain the latest checkpoint from GMS for this job.
assert self.ctx.pipeline_name
last_checkpoint_aspect = self.ingestion_checkpointing_state_provider.get_latest_checkpoint( # type: ignore
pipeline_name=self.ctx.pipeline_name,
job_name=job_id,
assert self.ingestion_checkpointing_state_provider
last_checkpoint_aspect = (
self.ingestion_checkpointing_state_provider.get_latest_checkpoint(
pipeline_name=self.ctx.pipeline_name,
job_name=job_id,
)
)
if last_checkpoint_aspect is None:
[Review comment — Contributor]
Isn't this a breaking change that should be documented?

[Reply — @hsheth2 (Collaborator, Author), Mar 17, 2023]
The only case in which this would break is if someone upgraded directly from a 3-month-old version to latest, in which case we won't be able to fetch previous state from stateful ingestion.

# Try again with the platform_instance_id, if implemented.
try:
platform_instance_id = self.get_platform_instance_id()
except NotImplementedError:
pass
else:
last_checkpoint_aspect = self.ingestion_checkpointing_state_provider.get_latest_checkpoint( # type: ignore
pipeline_name=self.ctx.pipeline_name,
job_name=job_id,
platform_instance_id=platform_instance_id,
)

# Convert it to a first-class Checkpoint object.
last_checkpoint = Checkpoint[StateType].create_from_checkpoint_aspect(
@@ -355,6 +338,8 @@ def _prepare_checkpoint_states_for_commit(self) -> None:
# Perform validations
if not self.is_stateful_ingestion_configured():
return None
assert self.stateful_ingestion_config

if (
self.stateful_ingestion_config
and self.stateful_ingestion_config.ignore_new_state
@@ -378,7 +363,7 @@ def _prepare_checkpoint_states_for_commit(self) -> None:
job_checkpoint.prepare_for_commit()
try:
checkpoint_aspect = job_checkpoint.to_checkpoint_aspect(
self.stateful_ingestion_config.max_checkpoint_state_size # type: ignore
self.stateful_ingestion_config.max_checkpoint_state_size
)
except Exception as e:
logger.error(
@@ -64,21 +64,15 @@ def get_latest_checkpoint(
self,
pipeline_name: str,
job_name: JobId,
platform_instance_id: Optional[str] = None,
) -> Optional[DatahubIngestionCheckpointClass]:
logger.debug(
f"Querying for the latest ingestion checkpoint for pipelineName:'{pipeline_name}',"
f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}'"
f" job_name:'{job_name}'"
)

if platform_instance_id is None:
data_job_urn = self.get_data_job_urn(
self.orchestrator_name, pipeline_name, job_name
)
else:
data_job_urn = self.get_data_job_legacy_urn(
self.orchestrator_name, pipeline_name, job_name, platform_instance_id
)
data_job_urn = self.get_data_job_urn(
self.orchestrator_name, pipeline_name, job_name
)

latest_checkpoint: Optional[
DatahubIngestionCheckpointClass
@@ -92,14 +86,14 @@
if latest_checkpoint:
logger.debug(
f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found with start_time:"
f" job_name:'{job_name}' found with start_time:"
f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
)
return latest_checkpoint
else:
logger.debug(
f"No committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found"
f" job_name:'{job_name}' found"
)

return None
3 changes: 0 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -2264,6 +2264,3 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

def get_report(self) -> StaleEntityRemovalSourceReport:
return self.report

def get_platform_instance_id(self) -> Optional[str]:
return self.config.platform_instance or self.platform
@@ -160,9 +160,6 @@ def create(cls, config_dict, ctx):
config = UnityCatalogSourceConfig.parse_obj(config_dict)
return cls(ctx=ctx, config=config)

def get_platform_instance_id(self) -> Optional[str]:
return self.config.platform_instance or self.platform

def get_workunits(self) -> Iterable[MetadataWorkUnit]:
return auto_stale_entity_removal(
self.stale_entity_removal_handler,