
fix(ingest/bigquery): use correct row count in null count profiling c… #9123

Merged
```diff
@@ -781,7 +781,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None:
         sample_pc = 100 * self.config.sample_size / profile.rowCount
         sql = (
             f"SELECT * FROM {str(self.dataset._table)} "
-            + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+            + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
         )
         temp_table_name = create_bigquery_temp_table(
             self,
@@ -793,6 +793,13 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None:
         self.dataset._table = sa.text(temp_table_name)
         logger.debug(f"Setting table name to be {self.dataset._table}")

+        # We could alternatively use `self._get_dataset_rows(profile)` to get the
+        # exact number of rows in the sample, since the actual number of sampled
+        # rows may differ slightly (more or less) from the configured `sample_size`.
+        # We avoid that for now, as it adds the overhead of another query, and
+        # approximate metrics are acceptable for sampling-based profiling.
+        profile.rowCount = self.config.sample_size
```
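For context on why the precision bump matters: for a very large table, the computed sampling percentage can be small enough that three decimal places round it down to zero, so `TABLESAMPLE` would select essentially nothing. A standalone arithmetic sketch with made-up numbers (not the DataHub code itself):

```python
# Hypothetical: a 5-billion-row table with a configured sample_size of 1,000.
sample_size = 1_000
row_count = 5_000_000_000
sample_pc = 100 * sample_size / row_count  # 2e-05 percent

# Three decimal places collapse the percentage to zero:
print(f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)")
# → TABLESAMPLE SYSTEM (0.000 percent)

# Eight decimal places preserve the intended fraction:
print(f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)")
# → TABLESAMPLE SYSTEM (0.00002000 percent)
```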
Collaborator:
Imo the row count should still ultimately reflect the full number of rows, and the size of the sample should be stored in the partition spec

Collaborator (Author):

The actual table's rowCount here is always overridden in the bigquery source from information_schema whenever partitionSpec is set, and partitionSpec is set whenever limit or sampling is used in profiling, or when only the latest partition of a partitioned table is profiled. The rowCount computed in ge_data_profiler.py is used primarily for column-level profiling computations (this is also the case for snowflake and redshift).
See https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py#L112

Collaborator (Author):

I'm open to handling this only for the null count computation and not updating the local profile aspect. I attempted changing the profile aspect to make sure we don't run into this in any other calculation, perhaps one added in the future.

Collaborator:

Yup that might result in more understandable code

At minimum, we need a comment here explaining why it's ok to override, but ideally we just tweak the null count / unique percentage calculations
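To illustrate the trade-off being discussed: null counts are measured against the sampled temp table, so proportions must be computed relative to the sample's row count, not the full table's. A minimal sketch with hypothetical figures (not the actual profiler code):

```python
# Hypothetical figures for one sampling-based profiling run.
full_table_rows = 1_000_000   # rowCount reported by information_schema
sample_rows = 1_000           # rows actually scanned via TABLESAMPLE
null_count_in_sample = 250    # nulls observed for one column in the sample

# Dividing by the full table's row count wildly understates the null proportion:
wrong_null_proportion = null_count_in_sample / full_table_rows   # 0.00025

# Dividing by the sample's row count gives the intended estimate:
right_null_proportion = null_count_in_sample / sample_rows       # 0.25
```

This is why the PR overrides `profile.rowCount` with the sample size: downstream column-level calculations divide observed counts by that row count.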


```diff
         if (
             profile.partitionSpec
             and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
```